Tag: python

  • Data Quality check

    We will use basic company data (http://download.companieshouse.gov.uk/en_output.html) for this article. A PySpark script is written for carrying out minimal data quality checks on the dataset from the Companies House.

    The downloadable data snapshot containing basic company data of live companies on the Companies House register. This snapshot is provided as ZIP files containing data in CSV format and is split into multiple files.

    Data Structure

    First, look at the data and see the structure of the dataset. Perform basic checks to identify duplicates, null, missing values, etc.

    A list of the data fields contained in the snapshot can be found here PDF

    ch_file = '[DIR_PATH]/BasicCompanyData-2021-04-01-part1_6.csv'
    df_ch = spark.read.option("header",True).option("multiline",True).option("escape", "\"").csv(ch_file)
    df_ch.dtypes
    [('CompanyName', 'string'),
     ('CompanyNumber', 'string'),
     ('RegAddressCareOf', 'string'),
     ('RegAddressPOBox', 'string'),
     ('RegAddressAddressLine1', 'string'),
     ('RegAddressAddressLine2', 'string'),
     ('RegAddressPostTown', 'string'),
     ('RegAddressCounty', 'string'),
     ('RegAddressCountry', 'string'),
     ('RegAddressPostCode', 'string'),
     ('CompanyCategory', 'string'),
     ('CompanyStatus', 'string'),
     ('CountryOfOrigin', 'string'),
     ('DissolutionDate', 'string'),
     ('IncorporationDate', 'string'),
     ('AccountsAccountRefDay', 'string'),
     ('AccountsAccountRefMonth', 'string'),
     ('AccountsNextDueDate', 'string'),
     ('AccountsLastMadeUpDate', 'string'),
     ('AccountsAccountCategory', 'string'),
     ('ReturnsNextDueDate', 'string'),
     ('ReturnsLastMadeUpDate', 'string'),
     ('MortgagesNumMortCharges', 'string'),
     ('MortgagesNumMortOutstanding', 'string'),
     ('MortgagesNumMortPartSatisfied', 'string'),
     ('MortgagesNumMortSatisfied', 'string'),
     ('SICCodeSicText_1', 'string'),
     ('SICCodeSicText_2', 'string'),
     ('SICCodeSicText_3', 'string'),
     ('SICCodeSicText_4', 'string'),
     ('LimitedPartnershipsNumGenPartners', 'string'),
     ('LimitedPartnershipsNumLimPartners', 'string'),
     ('URI', 'string'),
     ('PreviousName_1CONDATE', 'string'),
     ('PreviousName_1CompanyName', 'string'),
     ('PreviousName_2CONDATE', 'string'),
     ('PreviousName_2CompanyName', 'string'),
     ('PreviousName_3CONDATE', 'string'),
     ('PreviousName_3CompanyName', 'string'),
     ('PreviousName_4CONDATE', 'string'),
     ('PreviousName_4CompanyName', 'string'),
     ('PreviousName_5CONDATE', 'string'),
     ('PreviousName_5CompanyName', 'string'),
     ('PreviousName_6CONDATE', 'string'),
     ('PreviousName_6CompanyName', 'string'),
     ('PreviousName_7CONDATE', 'string'),
     ('PreviousName_7CompanyName', 'string'),
     ('PreviousName_8CONDATE', 'string'),
     ('PreviousName_8CompanyName', 'string'),
     ('PreviousName_9CONDATE', 'string'),
     ('PreviousName_9CompanyName', 'string'),
     ('PreviousName_10CONDATE', 'string'),
     ('PreviousName_10CompanyName', 'string'),
     ('ConfStmtNextDueDate', 'string'),
     ('ConfStmtLastMadeUpDate', 'string')]
    """
    quick QA checks for null values, min and max length of each field values 
    """
    df = df_ch.agg(*[(F.count(F.when(F.isnull(c), c)).alias(c + '_null'))for c in df_ch.columns],
                       *[(F.max(F.length(c))).alias(c + '_max')for c in df_ch.columns],
                       *[(F.min(F.length(c))).alias(c + '_min')for c in df_ch.columns])
    
    # max and min length of each field values
    df.show(truncate=False,vertical=True)
    -RECORD 0----------------------------------------
     CompanyName_null                       | 0      
     CompanyNumber_null                     | 0      
     RegAddressCareOf_null                  | 837999 
     RegAddressPOBox_null                   | 847719 
     RegAddressAddressLine1_null            | 6289   
     RegAddressAddressLine2_null            | 318447 
     RegAddressPostTown_null                | 17085  
     RegAddressCounty_null                  | 542944 
     RegAddressCountry_null                 | 258467 
     RegAddressPostCode_null                | 9978   
     CompanyCategory_null                   | 0      
     CompanyStatus_null                     | 0      
     CountryOfOrigin_null                   | 0      
     DissolutionDate_null                   | 849999 
     IncorporationDate_null                 | 0      
     AccountsAccountRefDay_null             | 18072  
     AccountsAccountRefMonth_null           | 18072  
     AccountsNextDueDate_null               | 20591  
     AccountsLastMadeUpDate_null            | 236837 
     AccountsAccountCategory_null           | 240774 
     ReturnsNextDueDate_null                | 21452  
     ReturnsLastMadeUpDate_null             | 470009 
     MortgagesNumMortCharges_null           | 0      
     MortgagesNumMortOutstanding_null       | 0      
     MortgagesNumMortPartSatisfied_null     | 0      
     MortgagesNumMortSatisfied_null         | 0      
     SICCodeSicText_1_null                  | 0      
     SICCodeSicText_2_null                  | 736926 
     SICCodeSicText_3_null                  | 804878 
     SICCodeSicText_4_null                  | 830027 
     LimitedPartnershipsNumGenPartners_null | 0      
     LimitedPartnershipsNumLimPartners_null | 0      
     URI_null                               | 0      
     PreviousName_1CONDATE_null             | 764037 
     PreviousName_1CompanyName_null         | 764037 
     PreviousName_2CONDATE_null             | 835087 
     PreviousName_2CompanyName_null         | 835087 
     PreviousName_3CONDATE_null             | 846750 
     PreviousName_3CompanyName_null         | 846750 
     PreviousName_4CONDATE_null             | 849197 
     PreviousName_4CompanyName_null         | 849197 
     PreviousName_5CONDATE_null             | 849741 
     PreviousName_5CompanyName_null         | 849741 
     PreviousName_6CONDATE_null             | 849926 
     PreviousName_6CompanyName_null         | 849926 
     PreviousName_7CONDATE_null             | 849975 
     PreviousName_7CompanyName_null         | 849975 
     PreviousName_8CONDATE_null             | 849987 
     PreviousName_8CompanyName_null         | 849987 
     PreviousName_9CONDATE_null             | 849992 
     PreviousName_9CompanyName_null         | 849992 
     PreviousName_10CONDATE_null            | 849994 
     PreviousName_10CompanyName_null        | 849994 
     ConfStmtNextDueDate_null               | 8659   
     ConfStmtLastMadeUpDate_null            | 177697 
     CompanyName_max                        | 160    
     CompanyNumber_max                      | 8      
     RegAddressCareOf_max                   | 65     
     RegAddressPOBox_max                    | 10     
     RegAddressAddressLine1_max             | 99     
     RegAddressAddressLine2_max             | 56     
     RegAddressPostTown_max                 | 41     
     RegAddressCounty_max                   | 50     
     RegAddressCountry_max                  | 24     
     RegAddressPostCode_max                 | 14     
     CompanyCategory_max                    | 89     
     CompanyStatus_max                      | 48     
     CountryOfOrigin_max                    | 23     
     DissolutionDate_max                    | null   
     IncorporationDate_max                  | 10     
     AccountsAccountRefDay_max              | 2      
     AccountsAccountRefMonth_max            | 2      
     AccountsNextDueDate_max                | 10     
     AccountsLastMadeUpDate_max             | 10     
     AccountsAccountCategory_max            | 27     
     ReturnsNextDueDate_max                 | 10     
     ReturnsLastMadeUpDate_max              | 10     
     MortgagesNumMortCharges_max            | 4      
     MortgagesNumMortOutstanding_max        | 4      
     MortgagesNumMortPartSatisfied_max      | 2      
     MortgagesNumMortSatisfied_max          | 4      
     SICCodeSicText_1_max                   | 166    
     SICCodeSicText_2_max                   | 166    
     SICCodeSicText_3_max                   | 166    
     SICCodeSicText_4_max                   | 166    
     LimitedPartnershipsNumGenPartners_max  | 2      
     LimitedPartnershipsNumLimPartners_max  | 3      
     URI_max                                | 47     
     PreviousName_1CONDATE_max              | 10     
     PreviousName_1CompanyName_max          | 114    
     PreviousName_2CONDATE_max              | 10     
     PreviousName_2CompanyName_max          | 91     
     PreviousName_3CONDATE_max              | 10     
     PreviousName_3CompanyName_max          | 88     
     PreviousName_4CONDATE_max              | 10     
     PreviousName_4CompanyName_max          | 72     
     PreviousName_5CONDATE_max              | 10     
     PreviousName_5CompanyName_max          | 58     
     PreviousName_6CONDATE_max              | 10     
     PreviousName_6CompanyName_max          | 49     
     PreviousName_7CONDATE_max              | 10     
     PreviousName_7CompanyName_max          | 42     
     PreviousName_8CONDATE_max              | 10     
     PreviousName_8CompanyName_max          | 35     
     PreviousName_9CONDATE_max              | 10     
     PreviousName_9CompanyName_max          | 42     
     PreviousName_10CONDATE_max             | 10     
     PreviousName_10CompanyName_max         | 32     
     ConfStmtNextDueDate_max                | 10     
     ConfStmtLastMadeUpDate_max             | 10     
     CompanyName_min                        | 1      
     CompanyNumber_min                      | 8      
     RegAddressCareOf_min                   | 1      
     RegAddressPOBox_min                    | 1      
     RegAddressAddressLine1_min             | 1      
     RegAddressAddressLine2_min             | 1      
     RegAddressPostTown_min                 | 1      
     RegAddressCounty_min                   | 1      
     RegAddressCountry_min                  | 4      
     RegAddressPostCode_min                 | 1      
     CompanyCategory_min                    | 16     
     CompanyStatus_min                      | 6      
     CountryOfOrigin_min                    | 1      
     DissolutionDate_min                    | null   
     IncorporationDate_min                  | 10     
     AccountsAccountRefDay_min              | 1      
     AccountsAccountRefMonth_min            | 1      
     AccountsNextDueDate_min                | 10     
     AccountsLastMadeUpDate_min             | 10     
     AccountsAccountCategory_min            | 4      
     ReturnsNextDueDate_min                 | 10     
     ReturnsLastMadeUpDate_min              | 10     
     MortgagesNumMortCharges_min            | 1      
     MortgagesNumMortOutstanding_min        | 1      
     MortgagesNumMortPartSatisfied_min      | 1      
     MortgagesNumMortSatisfied_min          | 1      
     SICCodeSicText_1_min                   | 11     
     SICCodeSicText_2_min                   | 11     
     SICCodeSicText_3_min                   | 11     
     SICCodeSicText_4_min                   | 11     
     LimitedPartnershipsNumGenPartners_min  | 1      
     LimitedPartnershipsNumLimPartners_min  | 1      
     URI_min                                | 47     
     PreviousName_1CONDATE_min              | 10     
     PreviousName_1CompanyName_min          | 3      
     PreviousName_2CONDATE_min              | 10     
     PreviousName_2CompanyName_min          | 4      
     PreviousName_3CONDATE_min              | 10     
     PreviousName_3CompanyName_min          | 6      
     PreviousName_4CONDATE_min              | 10     
     PreviousName_4CompanyName_min          | 8      
     PreviousName_5CONDATE_min              | 10     
     PreviousName_5CompanyName_min          | 7      
     PreviousName_6CONDATE_min              | 10     
     PreviousName_6CompanyName_min          | 8      
     PreviousName_7CONDATE_min              | 10     
     PreviousName_7CompanyName_min          | 10     
     PreviousName_8CONDATE_min              | 10     
     PreviousName_8CompanyName_min          | 11     
     PreviousName_9CONDATE_min              | 10     
     PreviousName_9CompanyName_min          | 16     
     PreviousName_10CONDATE_min             | 10     
     PreviousName_10CompanyName_min         | 13     
     ConfStmtNextDueDate_min                | 10     
     ConfStmtLastMadeUpDate_min             | 10     
    Diagnostic GroupBy started(limit 20 records): 202104061544
    +-------------------------------------------+----------+-----+----------+
    |CompanyName                                |total_rows|count|proportion|
    +-------------------------------------------+----------+-----+----------+
    |! LIMITED                                  |849999    |1    |0.0       |
    |! LTD                                      |849999    |1    |0.0       |
    |!? LTD                                     |849999    |1    |0.0       |
    |!BIG IMPACT GRAPHICS LIMITED               |849999    |1    |0.0       |
    |!L PRODUCTIONS LIMITED                     |849999    |1    |0.0       |
    |!NKED LTD                                  |849999    |1    |0.0       |
    |!NNOV8 LIMITED                             |849999    |1    |0.0       |
    |!NSPIRED INVESTMENTS LTD                   |849999    |1    |0.0       |
    |!NSPIRED LTD                               |849999    |1    |0.0       |
    |!NVERTD DESIGNS LIMITED                    |849999    |1    |0.0       |
    |!OBAC LIMITED                              |849999    |1    |0.0       |
    |!OBAC UK LIMITED                           |849999    |1    |0.0       |
    |!YOZO FASS LIMITED                         |849999    |1    |0.0       |
    |" BORA " 2 LTD                             |849999    |1    |0.0       |
    |"1 C O LIMITED"                            |849999    |1    |0.0       |
    |"1ST RATE" PSYCHOLOGY SERVICES LTD         |849999    |1    |0.0       |
    |"2 ECOUTE" LIMITED                         |849999    |1    |0.0       |
    |"20-20 VOICE" CANCER                       |849999    |1    |0.0       |
    |"243 RUGBY ROAD MANAGEMENT COMPANY LIMITED"|849999    |1    |0.0       |
    |"309" WEST END LANE MANAGEMENT LIMITED     |849999    |1    |0.0       |
    +-------------------------------------------+----------+-----+----------+
    
    None
    +-------------+----------+-----+----------+
    |CompanyNumber|total_rows|count|proportion|
    +-------------+----------+-----+----------+
    |00000118     |849999    |1    |0.0       |
    |00000133     |849999    |1    |0.0       |
    |00000258     |849999    |1    |0.0       |
    |00000529     |849999    |1    |0.0       |
    |00000687     |849999    |1    |0.0       |
    |00001122     |849999    |1    |0.0       |
    |00001140     |849999    |1    |0.0       |
    |00002119     |849999    |1    |0.0       |
    |00002537     |849999    |1    |0.0       |
    |00003002     |849999    |1    |0.0       |
    |00004190     |849999    |1    |0.0       |
    |00004606     |849999    |1    |0.0       |
    |00005188     |849999    |1    |0.0       |
    |00005295     |849999    |1    |0.0       |
    |00005335     |849999    |1    |0.0       |
    |00005728     |849999    |1    |0.0       |
    |00006278     |849999    |1    |0.0       |
    |00006470     |849999    |1    |0.0       |
    |00006480     |849999    |1    |0.0       |
    |00007241     |849999    |1    |0.0       |
    +-------------+----------+-----+----------+
    
    None
    +-----------------------------------------------------------------------------------------+----------+------+----------+
    |CompanyCategory                                                                          |total_rows|count |proportion|
    +-----------------------------------------------------------------------------------------+----------+------+----------+
    |Charitable Incorporated Organisation                                                     |849999    |3493  |0.0       |
    |Community Interest Company                                                               |849999    |3626  |0.0       |
    |Converted/Closed                                                                         |849999    |1     |0.0       |
    |Industrial and Provident Society                                                         |849999    |47    |0.0       |
    |Investment Company with Variable Capital                                                 |849999    |51    |0.0       |
    |Investment Company with Variable Capital (Securities)                                    |849999    |1     |0.0       |
    |Investment Company with Variable Capital(Umbrella)                                       |849999    |19    |0.0       |
    |Limited Liability Partnership                                                            |849999    |9144  |0.01      |
    |Limited Partnership                                                                      |849999    |11051 |0.01      |
    |Old Public Company                                                                       |849999    |13    |0.0       |
    |Other Company Type                                                                       |849999    |1     |0.0       |
    |Other company type                                                                       |849999    |2304  |0.0       |
    |PRI/LBG/NSC (Private, Limited by guarantee, no share capital, use of 'Limited' exemption)|849999    |5842  |0.01      |
    |PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital)                    |849999    |25038 |0.03      |
    |PRIV LTD SECT. 30 (Private limited company, section 30 of the Companies Act)             |849999    |2     |0.0       |
    |Private Limited Company                                                                  |849999    |784661|0.92      |
    |Private Unlimited                                                                        |849999    |19    |0.0       |
    |Private Unlimited Company                                                                |849999    |691   |0.0       |
    |Protected Cell Company                                                                   |849999    |1     |0.0       |
    |Public Limited Company                                                                   |849999    |1118  |0.0       |
    +-----------------------------------------------------------------------------------------+----------+------+----------+
    
    None
    +------------------------------------------------+----------+------+----------+
    |CompanyStatus                                   |total_rows|count |proportion|
    +------------------------------------------------+----------+------+----------+
    |ADMINISTRATION ORDER                            |849999    |28    |0.0       |
    |ADMINISTRATIVE RECEIVER                         |849999    |17    |0.0       |
    |Active                                          |849999    |808781|0.95      |
    |Active - Proposal to Strike off                 |849999    |23946 |0.03      |
    |In Administration                               |849999    |508   |0.0       |
    |In Administration/Administrative Receiver       |849999    |43    |0.0       |
    |In Administration/Receiver Manager              |849999    |5     |0.0       |
    |Liquidation                                     |849999    |16241 |0.02      |
    |Live but Receiver Manager on at least one charge|849999    |225   |0.0       |
    |RECEIVER MANAGER / ADMINISTRATIVE RECEIVER      |849999    |5     |0.0       |
    |RECEIVERSHIP                                    |849999    |35    |0.0       |
    |Voluntary Arrangement                           |849999    |165   |0.0       |
    +------------------------------------------------+----------+------+----------+
    
    None
    +------------------+----------+-----+----------+
    |RegAddressPostCode|total_rows|count|proportion|
    +------------------+----------+-----+----------+
    |null              |849999    |9978 |0.01      |
    |0161              |849999    |1    |0.0       |
    |02 806            |849999    |1    |0.0       |
    |04640             |849999    |1    |0.0       |
    |049 318           |849999    |1    |0.0       |
    |08540             |849999    |1    |0.0       |
    |0L16 1QE          |849999    |1    |0.0       |
    |1                 |849999    |1    |0.0       |
    |10 004            |849999    |1    |0.0       |
    |100041            |849999    |1    |0.0       |
    |10010             |849999    |1    |0.0       |
    |10018             |849999    |1    |0.0       |
    |1010              |849999    |1    |0.0       |
    |1018 GE           |849999    |1    |0.0       |
    |1050              |849999    |1    |0.0       |
    |110042            |849999    |1    |0.0       |
    |1160              |849999    |1    |0.0       |
    |12 0AF            |849999    |1    |0.0       |
    |121006            |849999    |1    |0.0       |
    |13900             |849999    |1    |0.0       |
    +------------------+----------+-----+----------+
    
    None
    +---------------------------+----------+------+----------+
    |AccountsAccountCategory    |total_rows|count |proportion|
    +---------------------------+----------+------+----------+
    |null                       |849999    |240774|0.28      |
    |ACCOUNTS TYPE NOT AVAILABLE|849999    |866   |0.0       |
    |AUDIT EXEMPTION SUBSIDIARY |849999    |2253  |0.0       |
    |AUDITED ABRIDGED           |849999    |249   |0.0       |
    |DORMANT                    |849999    |104986|0.12      |
    |FILING EXEMPTION SUBSIDIARY|849999    |70    |0.0       |
    |FULL                       |849999    |17019 |0.02      |
    |GROUP                      |849999    |4046  |0.0       |
    |INITIAL                    |849999    |1     |0.0       |
    |MEDIUM                     |849999    |76    |0.0       |
    |NO ACCOUNTS FILED          |849999    |236789|0.28      |
    |PARTIAL EXEMPTION          |849999    |4     |0.0       |
    |SMALL                      |849999    |11822 |0.01      |
    |TOTAL EXEMPTION FULL       |849999    |201577|0.24      |
    |TOTAL EXEMPTION SMALL      |849999    |3789  |0.0       |
    |UNAUDITED ABRIDGED         |849999    |25678 |0.03      |
    +---------------------------+----------+------+----------+
    
    None
    +------------------------------------------------------------------------+----------+-----+----------+
    |SICCodeSicText_1                                                        |total_rows|count|proportion|
    +------------------------------------------------------------------------+----------+-----+----------+
    |0111 - Grow cereals & other crops                                       |849999    |2    |0.0       |
    |01110 - Growing of cereals (except rice), leguminous crops and oil seeds|849999    |672  |0.0       |
    |0112 - Grow vegetables & nursery products                               |849999    |1    |0.0       |
    |01120 - Growing of rice                                                 |849999    |13   |0.0       |
    |01130 - Growing of vegetables and melons, roots and tubers              |849999    |193  |0.0       |
    |01140 - Growing of sugar cane                                           |849999    |7    |0.0       |
    |01150 - Growing of tobacco                                              |849999    |7    |0.0       |
    |01160 - Growing of fibre crops                                          |849999    |19   |0.0       |
    |01190 - Growing of other non-perennial crops                            |849999    |52   |0.0       |
    |01210 - Growing of grapes                                               |849999    |20   |0.0       |
    |01220 - Growing of tropical and subtropical fruits                      |849999    |9    |0.0       |
    |01230 - Growing of citrus fruits                                        |849999    |2    |0.0       |
    |01240 - Growing of pome fruits and stone fruits                         |849999    |36   |0.0       |
    |01250 - Growing of other tree and bush fruits and nuts                  |849999    |63   |0.0       |
    |01260 - Growing of oleaginous fruits                                    |849999    |5    |0.0       |
    |01270 - Growing of beverage crops                                       |849999    |7    |0.0       |
    |01280 - Growing of spices, aromatic, drug and pharmaceutical crops      |849999    |50   |0.0       |
    |01290 - Growing of other perennial crops                                |849999    |44   |0.0       |
    |0130 - Crops combined with animals, mixed farms                         |849999    |1    |0.0       |
    |01300 - Plant propagation                                               |849999    |92   |0.0       |
    +------------------------------------------------------------------------+----------+-----+----------+
    
    None
    Diagnostic Group By completed:  0:00:11.038049
    
    #save dataframe as HIVE parquet table
    df_ch.coalesce(1).write.format("parquet").mode("overwrite").saveAsTable("ch_20210404")

    Identify data quality issues

    issue_col = "qa_issues"
    df = df_ch.withColumn(issue_col,F.lit(""))
        
    postcode_cols = [
      "RegAddressPostCode"
    ]
        
    for col in postcode_cols:
      df = df.withColumn(issue_col, dc.add_to_string_col(
          df[issue_col], f"check_postcode({col})",
          dc.check_postcode(df[col])))
      df.cache().count()
    
    date_cols = [    
      'DissolutionDate',
      'IncorporationDate',
      'AccountsNextDueDate',
      'AccountsLastMadeUpDate',
      'ReturnsNextDueDate',
      'ReturnsLastMadeUpDate',
      'ConfStmtNextDueDate',
      'ConfStmtLastMadeUpDate'
    ]
    
    for col in date_cols:
        df = df.withColumn(issue_col, dc.add_to_string_col(
          df[issue_col], f"check_date({col})",
          dc.check_date(df[col],"dd/MM/yyyy")))
        df.cache().count()
    
    cols = [
      'CompanyName',
      'CompanyNumber',
      'RegAddressAddressLine1',
      'RegAddressPostCode',
      'CompanyCategory',
      'CompanyStatus',
      'CountryOfOrigin',
      'IncorporationDate',
      'MortgagesNumMortCharges',
      'MortgagesNumMortOutstanding',
      'MortgagesNumMortPartSatisfied',
      'MortgagesNumMortSatisfied',
      'SICCodeSicText_1',
      'LimitedPartnershipsNumGenPartners',
      'LimitedPartnershipsNumLimPartners',
      'ConfStmtNextDueDate'
    ]   
    
    for col in cols:
        df = df.withColumn(issue_col, dc.add_to_string_col(
          df[issue_col], f"check_not_null({col})",
          dc.check_not_null(df[col])))
        df.cache().count()
    
    df_issues = df.filter(F.col('qa_issues') != '')

    Result

    print(df_issues.count())
    df_issues.show(5,truncate=False,vertical=True)
    
    13343
    -RECORD 0------------------------------------------------------------------------------------------------------------------------------------------
     CompanyName                       | "20-20 VOICE" CANCER                                                                                          
     CompanyNumber                     | CE000952                                                                                                      
     RegAddressCareOf                  | null                                                                                                          
     RegAddressPOBox                   | null                                                                                                          
     RegAddressAddressLine1            | null                                                                                                          
     RegAddressAddressLine2            | null                                                                                                          
     RegAddressPostTown                | null                                                                                                          
     RegAddressCounty                  | null                                                                                                          
     RegAddressCountry                 | null                                                                                                          
     RegAddressPostCode                | null                                                                                                          
     CompanyCategory                   | Charitable Incorporated Organisation                                                                          
     CompanyStatus                     | Active                                                                                                        
     CountryOfOrigin                   | United Kingdom                                                                                                
     DissolutionDate                   | null                                                                                                          
     IncorporationDate                 | 17/04/2014                                                                                                    
     AccountsAccountRefDay             | null                                                                                                          
     AccountsAccountRefMonth           | null                                                                                                          
     AccountsNextDueDate               | null                                                                                                          
     AccountsLastMadeUpDate            | null                                                                                                          
     AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
     ReturnsNextDueDate                | null                                                                                                          
     ReturnsLastMadeUpDate             | null                                                                                                          
     MortgagesNumMortCharges           | 0                                                                                                             
     MortgagesNumMortOutstanding       | 0                                                                                                             
     MortgagesNumMortPartSatisfied     | 0                                                                                                             
     MortgagesNumMortSatisfied         | 0                                                                                                             
     SICCodeSicText_1                  | None Supplied                                                                                                 
     SICCodeSicText_2                  | null                                                                                                          
     SICCodeSicText_3                  | null                                                                                                          
     SICCodeSicText_4                  | null                                                                                                          
     LimitedPartnershipsNumGenPartners | 0                                                                                                             
     LimitedPartnershipsNumLimPartners | 0                                                                                                             
     URI                               | http://business.data.gov.uk/id/company/CE000952                                                               
     PreviousName_1CONDATE             | null                                                                                                          
     PreviousName_1CompanyName         | null                                                                                                          
     PreviousName_2CONDATE             | null                                                                                                          
     PreviousName_2CompanyName         | null                                                                                                          
     PreviousName_3CONDATE             | null                                                                                                          
     PreviousName_3CompanyName         | null                                                                                                          
     PreviousName_4CONDATE             | null                                                                                                          
     PreviousName_4CompanyName         | null                                                                                                          
     PreviousName_5CONDATE             | null                                                                                                          
     PreviousName_5CompanyName         | null                                                                                                          
     PreviousName_6CONDATE             | null                                                                                                          
     PreviousName_6CompanyName         | null                                                                                                          
     PreviousName_7CONDATE             | null                                                                                                          
     PreviousName_7CompanyName         | null                                                                                                          
     PreviousName_8CONDATE             | null                                                                                                          
     PreviousName_8CompanyName         | null                                                                                                          
     PreviousName_9CONDATE             | null                                                                                                          
     PreviousName_9CompanyName         | null                                                                                                          
     PreviousName_10CONDATE            | null                                                                                                          
     PreviousName_10CompanyName        | null                                                                                                          
     ConfStmtNextDueDate               | null                                                                                                          
     ConfStmtLastMadeUpDate            | null                                                                                                          
     qa_issues                         | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate) 
    -RECORD 1------------------------------------------------------------------------------------------------------------------------------------------
     CompanyName                       | "ALTAI CASHMERE" LLC                                                                                          
     CompanyNumber                     | FC027187                                                                                                      
     RegAddressCareOf                  | null                                                                                                          
     RegAddressPOBox                   | null                                                                                                          
     RegAddressAddressLine1            | BUILDING LEFT TO THE "TUUL"                                                                                   
     RegAddressAddressLine2            | DRY CLEANING                                                                                                  
     RegAddressPostTown                | KHAN-UUL DISTRICT                                                                                             
     RegAddressCounty                  | ULAANBAATAR, MONGOLIA                                                                                         
     RegAddressCountry                 | MONGOLIA                                                                                                      
     RegAddressPostCode                | null                                                                                                          
     CompanyCategory                   | Other company type                                                                                            
     CompanyStatus                     | Active                                                                                                        
     CountryOfOrigin                   | MONGOLIA                                                                                                      
     DissolutionDate                   | null                                                                                                          
     IncorporationDate                 | 14/06/2006                                                                                                    
     AccountsAccountRefDay             | 30                                                                                                            
     AccountsAccountRefMonth           | 6                                                                                                             
     AccountsNextDueDate               | null                                                                                                          
     AccountsLastMadeUpDate            | null                                                                                                          
     AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
     ReturnsNextDueDate                | null                                                                                                          
     ReturnsLastMadeUpDate             | null                                                                                                          
     MortgagesNumMortCharges           | 0                                                                                                             
     MortgagesNumMortOutstanding       | 0                                                                                                             
     MortgagesNumMortPartSatisfied     | 0                                                                                                             
     MortgagesNumMortSatisfied         | 0                                                                                                             
     SICCodeSicText_1                  | None Supplied                                                                                                 
     SICCodeSicText_2                  | null                                                                                                          
     SICCodeSicText_3                  | null                                                                                                          
     SICCodeSicText_4                  | null                                                                                                          
     LimitedPartnershipsNumGenPartners | 0                                                                                                             
     LimitedPartnershipsNumLimPartners | 0                                                                                                             
     URI                               | http://business.data.gov.uk/id/company/FC027187                                                               
     PreviousName_1CONDATE             | null                                                                                                          
     PreviousName_1CompanyName         | null                                                                                                          
     PreviousName_2CONDATE             | null                                                                                                          
     PreviousName_2CompanyName         | null                                                                                                          
     PreviousName_3CONDATE             | null                                                                                                          
     PreviousName_3CompanyName         | null                                                                                                          
     PreviousName_4CONDATE             | null                                                                                                          
     PreviousName_4CompanyName         | null                                                                                                          
     PreviousName_5CONDATE             | null                                                                                                          
     PreviousName_5CompanyName         | null                                                                                                          
     PreviousName_6CONDATE             | null                                                                                                          
     PreviousName_6CompanyName         | null                                                                                                          
     PreviousName_7CONDATE             | null                                                                                                          
     PreviousName_7CompanyName         | null                                                                                                          
     PreviousName_8CONDATE             | null                                                                                                          
     PreviousName_8CompanyName         | null                                                                                                          
     PreviousName_9CONDATE             | null                                                                                                          
     PreviousName_9CompanyName         | null                                                                                                          
     PreviousName_10CONDATE            | null                                                                                                          
     PreviousName_10CompanyName        | null                                                                                                          
     ConfStmtNextDueDate               | 26/12/2016                                                                                                    
     ConfStmtLastMadeUpDate            | null                                                                                                          
     qa_issues                         | check_not_null(RegAddressPostCode)                                                                            
    -RECORD 2------------------------------------------------------------------------------------------------------------------------------------------
     CompanyName                       | "ANDI'S GIFT"                                                                                                 
     CompanyNumber                     | CE012989                                                                                                      
     RegAddressCareOf                  | null                                                                                                          
     RegAddressPOBox                   | null                                                                                                          
     RegAddressAddressLine1            | null                                                                                                          
     RegAddressAddressLine2            | null                                                                                                          
     RegAddressPostTown                | null                                                                                                          
     RegAddressCounty                  | null                                                                                                          
     RegAddressCountry                 | null                                                                                                          
     RegAddressPostCode                | null                                                                                                          
     CompanyCategory                   | Charitable Incorporated Organisation                                                                          
     CompanyStatus                     | Active                                                                                                        
     CountryOfOrigin                   | United Kingdom                                                                                                
     DissolutionDate                   | null                                                                                                          
     IncorporationDate                 | 22/01/2018                                                                                                    
     AccountsAccountRefDay             | null                                                                                                          
     AccountsAccountRefMonth           | null                                                                                                          
     AccountsNextDueDate               | null                                                                                                          
     AccountsLastMadeUpDate            | null                                                                                                          
     AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
     ReturnsNextDueDate                | null                                                                                                          
     ReturnsLastMadeUpDate             | null                                                                                                          
     MortgagesNumMortCharges           | 0                                                                                                             
     MortgagesNumMortOutstanding       | 0                                                                                                             
     MortgagesNumMortPartSatisfied     | 0                                                                                                             
     MortgagesNumMortSatisfied         | 0                                                                                                             
     SICCodeSicText_1                  | None Supplied                                                                                                 
     SICCodeSicText_2                  | null                                                                                                          
     SICCodeSicText_3                  | null                                                                                                          
     SICCodeSicText_4                  | null                                                                                                          
     LimitedPartnershipsNumGenPartners | 0                                                                                                             
     LimitedPartnershipsNumLimPartners | 0                                                                                                             
     URI                               | http://business.data.gov.uk/id/company/CE012989                                                               
     PreviousName_1CONDATE             | null                                                                                                          
     PreviousName_1CompanyName         | null                                                                                                          
     PreviousName_2CONDATE             | null                                                                                                          
     PreviousName_2CompanyName         | null                                                                                                          
     PreviousName_3CONDATE             | null                                                                                                          
     PreviousName_3CompanyName         | null                                                                                                          
     PreviousName_4CONDATE             | null                                                                                                          
     PreviousName_4CompanyName         | null                                                                                                          
     PreviousName_5CONDATE             | null                                                                                                          
     PreviousName_5CompanyName         | null                                                                                                          
     PreviousName_6CONDATE             | null                                                                                                          
     PreviousName_6CompanyName         | null                                                                                                          
     PreviousName_7CONDATE             | null                                                                                                          
     PreviousName_7CompanyName         | null                                                                                                          
     PreviousName_8CONDATE             | null                                                                                                          
     PreviousName_8CompanyName         | null                                                                                                          
     PreviousName_9CONDATE             | null                                                                                                          
     PreviousName_9CompanyName         | null                                                                                                          
     PreviousName_10CONDATE            | null                                                                                                          
     PreviousName_10CompanyName        | null                                                                                                          
     ConfStmtNextDueDate               | null                                                                                                          
     ConfStmtLastMadeUpDate            | null                                                                                                          
     qa_issues                         | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate) 
    -RECORD 3------------------------------------------------------------------------------------------------------------------------------------------
     CompanyName                       | "ARTHUR BALFOUR",CONSERVATIVE WORKING MEN'S CLUB LIMITED                                                      
     CompanyNumber                     | IP10067R                                                                                                      
     RegAddressCareOf                  | null                                                                                                          
     RegAddressPOBox                   | null                                                                                                          
     RegAddressAddressLine1            | null                                                                                                          
     RegAddressAddressLine2            | null                                                                                                          
     RegAddressPostTown                | null                                                                                                          
     RegAddressCounty                  | null                                                                                                          
     RegAddressCountry                 | null                                                                                                          
     RegAddressPostCode                | null                                                                                                          
     CompanyCategory                   | Registered Society                                                                                            
     CompanyStatus                     | Active                                                                                                        
     CountryOfOrigin                   | United Kingdom                                                                                                
     DissolutionDate                   | null                                                                                                          
     IncorporationDate                 | 01/01/1981                                                                                                    
     AccountsAccountRefDay             | null                                                                                                          
     AccountsAccountRefMonth           | null                                                                                                          
     AccountsNextDueDate               | null                                                                                                          
     AccountsLastMadeUpDate            | null                                                                                                          
     AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
     ReturnsNextDueDate                | null                                                                                                          
     ReturnsLastMadeUpDate             | null                                                                                                          
     MortgagesNumMortCharges           | 0                                                                                                             
     MortgagesNumMortOutstanding       | 0                                                                                                             
     MortgagesNumMortPartSatisfied     | 0                                                                                                             
     MortgagesNumMortSatisfied         | 0                                                                                                             
     SICCodeSicText_1                  | None Supplied                                                                                                 
     SICCodeSicText_2                  | null                                                                                                          
     SICCodeSicText_3                  | null                                                                                                          
     SICCodeSicText_4                  | null                                                                                                          
     LimitedPartnershipsNumGenPartners | 0                                                                                                             
     LimitedPartnershipsNumLimPartners | 0                                                                                                             
     URI                               | http://business.data.gov.uk/id/company/IP10067R                                                               
     PreviousName_1CONDATE             | null                                                                                                          
     PreviousName_1CompanyName         | null                                                                                                          
     PreviousName_2CONDATE             | null                                                                                                          
     PreviousName_2CompanyName         | null                                                                                                          
     PreviousName_3CONDATE             | null                                                                                                          
     PreviousName_3CompanyName         | null                                                                                                          
     PreviousName_4CONDATE             | null                                                                                                          
     PreviousName_4CompanyName         | null                                                                                                          
     PreviousName_5CONDATE             | null                                                                                                          
     PreviousName_5CompanyName         | null                                                                                                          
     PreviousName_6CONDATE             | null                                                                                                          
     PreviousName_6CompanyName         | null                                                                                                          
     PreviousName_7CONDATE             | null                                                                                                          
     PreviousName_7CompanyName         | null                                                                                                          
     PreviousName_8CONDATE             | null                                                                                                          
     PreviousName_8CompanyName         | null                                                                                                          
     PreviousName_9CONDATE             | null                                                                                                          
     PreviousName_9CompanyName         | null                                                                                                          
     PreviousName_10CONDATE            | null                                                                                                          
     PreviousName_10CompanyName        | null                                                                                                          
     ConfStmtNextDueDate               | 15/01/2017                                                                                                    
     ConfStmtLastMadeUpDate            | null                                                                                                          
     qa_issues                         | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode)                                     
    -RECORD 4------------------------------------------------------------------------------------------------------------------------------------------
     CompanyName                       | "AUTOCONTEX" AUTOMOBIL CONTAINER EXPRESS GMBH                                                                 
     CompanyNumber                     | FC018262                                                                                                      
     RegAddressCareOf                  | null                                                                                                          
     RegAddressPOBox                   | null                                                                                                          
     RegAddressAddressLine1            | BRANCH REGISTRATION                                                                                           
     RegAddressAddressLine2            | REFER TO PARENT REGISTRY                                                                                      
     RegAddressPostTown                | null                                                                                                          
     RegAddressCounty                  | null                                                                                                          
     RegAddressCountry                 | GERMANY                                                                                                       
     RegAddressPostCode                | null                                                                                                          
     CompanyCategory                   | Other company type                                                                                            
     CompanyStatus                     | Active                                                                                                        
     CountryOfOrigin                   | WEST GERMANY                                                                                                  
     DissolutionDate                   | null                                                                                                          
     IncorporationDate                 | 01/11/1994                                                                                                    
     AccountsAccountRefDay             | 30                                                                                                            
     AccountsAccountRefMonth           | 11                                                                                                            
     AccountsNextDueDate               | null                                                                                                          
     AccountsLastMadeUpDate            | 31/12/2002                                                                                                    
     AccountsAccountCategory           | FULL                                                                                                          
     ReturnsNextDueDate                | null                                                                                                          
     ReturnsLastMadeUpDate             | null                                                                                                          
     MortgagesNumMortCharges           | 2                                                                                                             
     MortgagesNumMortOutstanding       | 2                                                                                                             
     MortgagesNumMortPartSatisfied     | 0                                                                                                             
     MortgagesNumMortSatisfied         | 0                                                                                                             
     SICCodeSicText_1                  | None Supplied                                                                                                 
     SICCodeSicText_2                  | null                                                                                                          
     SICCodeSicText_3                  | null                                                                                                          
     SICCodeSicText_4                  | null                                                                                                          
     LimitedPartnershipsNumGenPartners | 0                                                                                                             
     LimitedPartnershipsNumLimPartners | 0                                                                                                             
     URI                               | http://business.data.gov.uk/id/company/FC018262                                                               
     PreviousName_1CONDATE             | null                                                                                                          
     PreviousName_1CompanyName         | null                                                                                                          
     PreviousName_2CONDATE             | null                                                                                                          
     PreviousName_2CompanyName         | null                                                                                                          
     PreviousName_3CONDATE             | null                                                                                                          
     PreviousName_3CompanyName         | null                                                                                                          
     PreviousName_4CONDATE             | null                                                                                                          
     PreviousName_4CompanyName         | null                                                                                                          
     PreviousName_5CONDATE             | null                                                                                                          
     PreviousName_5CompanyName         | null                                                                                                          
     PreviousName_6CONDATE             | null                                                                                                          
     PreviousName_6CompanyName         | null                                                                                                          
     PreviousName_7CONDATE             | null                                                                                                          
     PreviousName_7CompanyName         | null                                                                                                          
     PreviousName_8CONDATE             | null                                                                                                          
     PreviousName_8CompanyName         | null                                                                                                          
     PreviousName_9CONDATE             | null                                                                                                          
     PreviousName_9CompanyName         | null                                                                                                          
     PreviousName_10CONDATE            | null                                                                                                          
     PreviousName_10CompanyName        | null                                                                                                          
     ConfStmtNextDueDate               | 01/12/2016                                                                                                    
     ConfStmtLastMadeUpDate            | null                                                                                                          
     qa_issues                         | check_not_null(RegAddressPostCode)                                                                            
    only showing top 5 rows
    
    #save issues as a HIVE table for later processing
    df_issues.coalesce(1).write.format("parquet").mode("overwrite").saveAsTable("ch_20210404_issues")
    
    stmt_issues = f"""
    SELECT qa_issues,count(*) as no_rows
    from ch_20210404_issues group by 1 order by no_rows"""
    spark.sql(stmt_issues).show(truncate=False,vertical=True)
    
    -RECORD 0------------------------------------------------------------------------------------------------------------------
     qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(ConfStmtNextDueDate)                                    
     no_rows   | 1                                                                                                             
    -RECORD 1------------------------------------------------------------------------------------------------------------------
     qa_issues | check_not_null(RegAddressAddressLine1)                                                                        
     no_rows   | 6                                                                                                             
    -RECORD 2------------------------------------------------------------------------------------------------------------------
     qa_issues | check_postcode(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)                                        
     no_rows   | 96                                                                                                            
    -RECORD 3------------------------------------------------------------------------------------------------------------------
     qa_issues | check_postcode(RegAddressPostCode)                                                                            
     no_rows   | 248                                                                                                           
    -RECORD 4------------------------------------------------------------------------------------------------------------------
     qa_issues | check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)                                        
     no_rows   | 966                                                                                                           
    -RECORD 5------------------------------------------------------------------------------------------------------------------
     qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode)                                     
     no_rows   | 1700                                                                                                          
    -RECORD 6------------------------------------------------------------------------------------------------------------------
     qa_issues | check_not_null(RegAddressPostCode)                                                                            
     no_rows   | 2730                                                                                                          
    -RECORD 7------------------------------------------------------------------------------------------------------------------
     qa_issues | check_not_null(ConfStmtNextDueDate)                                                                           
     no_rows   | 3014                                                                                                          
    -RECORD 8------------------------------------------------------------------------------------------------------------------
     qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate) 
     no_rows   | 4582                                                                                                          
    stmt_issues = f"""
    SELECT 
    distinct explode(split(qa_issues, ',')) AS qa_history_issues,
    substring_index(qa_history_issues, "(", 1) AS qa_check,
    replace(substring_index(qa_history_issues, "(", -1),')','') AS field_name
    from ch_20210404_issues order by 2,3"""
    df_grp = spark.sql(stmt_issues)
    df_grp.show(truncate=False)
    
    +--------------------------------------+--------------+----------------------+
    |qa_history_issues                     |qa_check      |field_name            |
    +--------------------------------------+--------------+----------------------+
    |check_not_null(ConfStmtNextDueDate)   |check_not_null|ConfStmtNextDueDate   |
    |check_not_null(RegAddressAddressLine1)|check_not_null|RegAddressAddressLine1|
    |check_not_null(RegAddressPostCode)    |check_not_null|RegAddressPostCode    |
    |check_postcode(RegAddressPostCode)    |check_postcode|RegAddressPostCode    |
    +--------------------------------------+--------------+----------------------+