Data Quality check

We will use basic company data (http://download.companieshouse.gov.uk/en_output.html) for this article. A PySpark script is written for carrying out minimal data quality checks on the dataset from the Companies House.

The downloadable data snapshot containing basic company data of live companies on the Companies House register. This snapshot is provided as ZIP files containing data in CSV format and is split into multiple files.

Data Structure

First, look at the data and see the structure of the dataset. Perform basic checks to identify duplicates, null, missing values, etc.

A list of the data fields contained in the snapshot can be found here PDF

ch_file = '[DIR_PATH]/BasicCompanyData-2021-04-01-part1_6.csv'
df_ch = spark.read.option("header",True).option("multiline",True).option("escape", "\"").csv(ch_file)
df_ch.dtypes
[('CompanyName', 'string'),
 ('CompanyNumber', 'string'),
 ('RegAddressCareOf', 'string'),
 ('RegAddressPOBox', 'string'),
 ('RegAddressAddressLine1', 'string'),
 ('RegAddressAddressLine2', 'string'),
 ('RegAddressPostTown', 'string'),
 ('RegAddressCounty', 'string'),
 ('RegAddressCountry', 'string'),
 ('RegAddressPostCode', 'string'),
 ('CompanyCategory', 'string'),
 ('CompanyStatus', 'string'),
 ('CountryOfOrigin', 'string'),
 ('DissolutionDate', 'string'),
 ('IncorporationDate', 'string'),
 ('AccountsAccountRefDay', 'string'),
 ('AccountsAccountRefMonth', 'string'),
 ('AccountsNextDueDate', 'string'),
 ('AccountsLastMadeUpDate', 'string'),
 ('AccountsAccountCategory', 'string'),
 ('ReturnsNextDueDate', 'string'),
 ('ReturnsLastMadeUpDate', 'string'),
 ('MortgagesNumMortCharges', 'string'),
 ('MortgagesNumMortOutstanding', 'string'),
 ('MortgagesNumMortPartSatisfied', 'string'),
 ('MortgagesNumMortSatisfied', 'string'),
 ('SICCodeSicText_1', 'string'),
 ('SICCodeSicText_2', 'string'),
 ('SICCodeSicText_3', 'string'),
 ('SICCodeSicText_4', 'string'),
 ('LimitedPartnershipsNumGenPartners', 'string'),
 ('LimitedPartnershipsNumLimPartners', 'string'),
 ('URI', 'string'),
 ('PreviousName_1CONDATE', 'string'),
 ('PreviousName_1CompanyName', 'string'),
 ('PreviousName_2CONDATE', 'string'),
 ('PreviousName_2CompanyName', 'string'),
 ('PreviousName_3CONDATE', 'string'),
 ('PreviousName_3CompanyName', 'string'),
 ('PreviousName_4CONDATE', 'string'),
 ('PreviousName_4CompanyName', 'string'),
 ('PreviousName_5CONDATE', 'string'),
 ('PreviousName_5CompanyName', 'string'),
 ('PreviousName_6CONDATE', 'string'),
 ('PreviousName_6CompanyName', 'string'),
 ('PreviousName_7CONDATE', 'string'),
 ('PreviousName_7CompanyName', 'string'),
 ('PreviousName_8CONDATE', 'string'),
 ('PreviousName_8CompanyName', 'string'),
 ('PreviousName_9CONDATE', 'string'),
 ('PreviousName_9CompanyName', 'string'),
 ('PreviousName_10CONDATE', 'string'),
 ('PreviousName_10CompanyName', 'string'),
 ('ConfStmtNextDueDate', 'string'),
 ('ConfStmtLastMadeUpDate', 'string')]
"""
quick QA checks for null values, min and max length of each field values 
"""
df = df_ch.agg(*[(F.count(F.when(F.isnull(c), c)).alias(c + '_null'))for c in df_ch.columns],
                   *[(F.max(F.length(c))).alias(c + '_max')for c in df_ch.columns],
                   *[(F.min(F.length(c))).alias(c + '_min')for c in df_ch.columns])
# max and min length of each field values
df.show(truncate=False,vertical=True)
-RECORD 0----------------------------------------
 CompanyName_null                       | 0      
 CompanyNumber_null                     | 0      
 RegAddressCareOf_null                  | 837999 
 RegAddressPOBox_null                   | 847719 
 RegAddressAddressLine1_null            | 6289   
 RegAddressAddressLine2_null            | 318447 
 RegAddressPostTown_null                | 17085  
 RegAddressCounty_null                  | 542944 
 RegAddressCountry_null                 | 258467 
 RegAddressPostCode_null                | 9978   
 CompanyCategory_null                   | 0      
 CompanyStatus_null                     | 0      
 CountryOfOrigin_null                   | 0      
 DissolutionDate_null                   | 849999 
 IncorporationDate_null                 | 0      
 AccountsAccountRefDay_null             | 18072  
 AccountsAccountRefMonth_null           | 18072  
 AccountsNextDueDate_null               | 20591  
 AccountsLastMadeUpDate_null            | 236837 
 AccountsAccountCategory_null           | 240774 
 ReturnsNextDueDate_null                | 21452  
 ReturnsLastMadeUpDate_null             | 470009 
 MortgagesNumMortCharges_null           | 0      
 MortgagesNumMortOutstanding_null       | 0      
 MortgagesNumMortPartSatisfied_null     | 0      
 MortgagesNumMortSatisfied_null         | 0      
 SICCodeSicText_1_null                  | 0      
 SICCodeSicText_2_null                  | 736926 
 SICCodeSicText_3_null                  | 804878 
 SICCodeSicText_4_null                  | 830027 
 LimitedPartnershipsNumGenPartners_null | 0      
 LimitedPartnershipsNumLimPartners_null | 0      
 URI_null                               | 0      
 PreviousName_1CONDATE_null             | 764037 
 PreviousName_1CompanyName_null         | 764037 
 PreviousName_2CONDATE_null             | 835087 
 PreviousName_2CompanyName_null         | 835087 
 PreviousName_3CONDATE_null             | 846750 
 PreviousName_3CompanyName_null         | 846750 
 PreviousName_4CONDATE_null             | 849197 
 PreviousName_4CompanyName_null         | 849197 
 PreviousName_5CONDATE_null             | 849741 
 PreviousName_5CompanyName_null         | 849741 
 PreviousName_6CONDATE_null             | 849926 
 PreviousName_6CompanyName_null         | 849926 
 PreviousName_7CONDATE_null             | 849975 
 PreviousName_7CompanyName_null         | 849975 
 PreviousName_8CONDATE_null             | 849987 
 PreviousName_8CompanyName_null         | 849987 
 PreviousName_9CONDATE_null             | 849992 
 PreviousName_9CompanyName_null         | 849992 
 PreviousName_10CONDATE_null            | 849994 
 PreviousName_10CompanyName_null        | 849994 
 ConfStmtNextDueDate_null               | 8659   
 ConfStmtLastMadeUpDate_null            | 177697 
 CompanyName_max                        | 160    
 CompanyNumber_max                      | 8      
 RegAddressCareOf_max                   | 65     
 RegAddressPOBox_max                    | 10     
 RegAddressAddressLine1_max             | 99     
 RegAddressAddressLine2_max             | 56     
 RegAddressPostTown_max                 | 41     
 RegAddressCounty_max                   | 50     
 RegAddressCountry_max                  | 24     
 RegAddressPostCode_max                 | 14     
 CompanyCategory_max                    | 89     
 CompanyStatus_max                      | 48     
 CountryOfOrigin_max                    | 23     
 DissolutionDate_max                    | null   
 IncorporationDate_max                  | 10     
 AccountsAccountRefDay_max              | 2      
 AccountsAccountRefMonth_max            | 2      
 AccountsNextDueDate_max                | 10     
 AccountsLastMadeUpDate_max             | 10     
 AccountsAccountCategory_max            | 27     
 ReturnsNextDueDate_max                 | 10     
 ReturnsLastMadeUpDate_max              | 10     
 MortgagesNumMortCharges_max            | 4      
 MortgagesNumMortOutstanding_max        | 4      
 MortgagesNumMortPartSatisfied_max      | 2      
 MortgagesNumMortSatisfied_max          | 4      
 SICCodeSicText_1_max                   | 166    
 SICCodeSicText_2_max                   | 166    
 SICCodeSicText_3_max                   | 166    
 SICCodeSicText_4_max                   | 166    
 LimitedPartnershipsNumGenPartners_max  | 2      
 LimitedPartnershipsNumLimPartners_max  | 3      
 URI_max                                | 47     
 PreviousName_1CONDATE_max              | 10     
 PreviousName_1CompanyName_max          | 114    
 PreviousName_2CONDATE_max              | 10     
 PreviousName_2CompanyName_max          | 91     
 PreviousName_3CONDATE_max              | 10     
 PreviousName_3CompanyName_max          | 88     
 PreviousName_4CONDATE_max              | 10     
 PreviousName_4CompanyName_max          | 72     
 PreviousName_5CONDATE_max              | 10     
 PreviousName_5CompanyName_max          | 58     
 PreviousName_6CONDATE_max              | 10     
 PreviousName_6CompanyName_max          | 49     
 PreviousName_7CONDATE_max              | 10     
 PreviousName_7CompanyName_max          | 42     
 PreviousName_8CONDATE_max              | 10     
 PreviousName_8CompanyName_max          | 35     
 PreviousName_9CONDATE_max              | 10     
 PreviousName_9CompanyName_max          | 42     
 PreviousName_10CONDATE_max             | 10     
 PreviousName_10CompanyName_max         | 32     
 ConfStmtNextDueDate_max                | 10     
 ConfStmtLastMadeUpDate_max             | 10     
 CompanyName_min                        | 1      
 CompanyNumber_min                      | 8      
 RegAddressCareOf_min                   | 1      
 RegAddressPOBox_min                    | 1      
 RegAddressAddressLine1_min             | 1      
 RegAddressAddressLine2_min             | 1      
 RegAddressPostTown_min                 | 1      
 RegAddressCounty_min                   | 1      
 RegAddressCountry_min                  | 4      
 RegAddressPostCode_min                 | 1      
 CompanyCategory_min                    | 16     
 CompanyStatus_min                      | 6      
 CountryOfOrigin_min                    | 1      
 DissolutionDate_min                    | null   
 IncorporationDate_min                  | 10     
 AccountsAccountRefDay_min              | 1      
 AccountsAccountRefMonth_min            | 1      
 AccountsNextDueDate_min                | 10     
 AccountsLastMadeUpDate_min             | 10     
 AccountsAccountCategory_min            | 4      
 ReturnsNextDueDate_min                 | 10     
 ReturnsLastMadeUpDate_min              | 10     
 MortgagesNumMortCharges_min            | 1      
 MortgagesNumMortOutstanding_min        | 1      
 MortgagesNumMortPartSatisfied_min      | 1      
 MortgagesNumMortSatisfied_min          | 1      
 SICCodeSicText_1_min                   | 11     
 SICCodeSicText_2_min                   | 11     
 SICCodeSicText_3_min                   | 11     
 SICCodeSicText_4_min                   | 11     
 LimitedPartnershipsNumGenPartners_min  | 1      
 LimitedPartnershipsNumLimPartners_min  | 1      
 URI_min                                | 47     
 PreviousName_1CONDATE_min              | 10     
 PreviousName_1CompanyName_min          | 3      
 PreviousName_2CONDATE_min              | 10     
 PreviousName_2CompanyName_min          | 4      
 PreviousName_3CONDATE_min              | 10     
 PreviousName_3CompanyName_min          | 6      
 PreviousName_4CONDATE_min              | 10     
 PreviousName_4CompanyName_min          | 8      
 PreviousName_5CONDATE_min              | 10     
 PreviousName_5CompanyName_min          | 7      
 PreviousName_6CONDATE_min              | 10     
 PreviousName_6CompanyName_min          | 8      
 PreviousName_7CONDATE_min              | 10     
 PreviousName_7CompanyName_min          | 10     
 PreviousName_8CONDATE_min              | 10     
 PreviousName_8CompanyName_min          | 11     
 PreviousName_9CONDATE_min              | 10     
 PreviousName_9CompanyName_min          | 16     
 PreviousName_10CONDATE_min             | 10     
 PreviousName_10CompanyName_min         | 13     
 ConfStmtNextDueDate_min                | 10     
 ConfStmtLastMadeUpDate_min             | 10     
Diagnostic GroupBy started(limit 20 records): 202104061544
+-------------------------------------------+----------+-----+----------+
|CompanyName                                |total_rows|count|proportion|
+-------------------------------------------+----------+-----+----------+
|! LIMITED                                  |849999    |1    |0.0       |
|! LTD                                      |849999    |1    |0.0       |
|!? LTD                                     |849999    |1    |0.0       |
|!BIG IMPACT GRAPHICS LIMITED               |849999    |1    |0.0       |
|!L PRODUCTIONS LIMITED                     |849999    |1    |0.0       |
|!NKED LTD                                  |849999    |1    |0.0       |
|!NNOV8 LIMITED                             |849999    |1    |0.0       |
|!NSPIRED INVESTMENTS LTD                   |849999    |1    |0.0       |
|!NSPIRED LTD                               |849999    |1    |0.0       |
|!NVERTD DESIGNS LIMITED                    |849999    |1    |0.0       |
|!OBAC LIMITED                              |849999    |1    |0.0       |
|!OBAC UK LIMITED                           |849999    |1    |0.0       |
|!YOZO FASS LIMITED                         |849999    |1    |0.0       |
|" BORA " 2 LTD                             |849999    |1    |0.0       |
|"1 C O LIMITED"                            |849999    |1    |0.0       |
|"1ST RATE" PSYCHOLOGY SERVICES LTD         |849999    |1    |0.0       |
|"2 ECOUTE" LIMITED                         |849999    |1    |0.0       |
|"20-20 VOICE" CANCER                       |849999    |1    |0.0       |
|"243 RUGBY ROAD MANAGEMENT COMPANY LIMITED"|849999    |1    |0.0       |
|"309" WEST END LANE MANAGEMENT LIMITED     |849999    |1    |0.0       |
+-------------------------------------------+----------+-----+----------+

None
+-------------+----------+-----+----------+
|CompanyNumber|total_rows|count|proportion|
+-------------+----------+-----+----------+
|00000118     |849999    |1    |0.0       |
|00000133     |849999    |1    |0.0       |
|00000258     |849999    |1    |0.0       |
|00000529     |849999    |1    |0.0       |
|00000687     |849999    |1    |0.0       |
|00001122     |849999    |1    |0.0       |
|00001140     |849999    |1    |0.0       |
|00002119     |849999    |1    |0.0       |
|00002537     |849999    |1    |0.0       |
|00003002     |849999    |1    |0.0       |
|00004190     |849999    |1    |0.0       |
|00004606     |849999    |1    |0.0       |
|00005188     |849999    |1    |0.0       |
|00005295     |849999    |1    |0.0       |
|00005335     |849999    |1    |0.0       |
|00005728     |849999    |1    |0.0       |
|00006278     |849999    |1    |0.0       |
|00006470     |849999    |1    |0.0       |
|00006480     |849999    |1    |0.0       |
|00007241     |849999    |1    |0.0       |
+-------------+----------+-----+----------+

None
+-----------------------------------------------------------------------------------------+----------+------+----------+
|CompanyCategory                                                                          |total_rows|count |proportion|
+-----------------------------------------------------------------------------------------+----------+------+----------+
|Charitable Incorporated Organisation                                                     |849999    |3493  |0.0       |
|Community Interest Company                                                               |849999    |3626  |0.0       |
|Converted/Closed                                                                         |849999    |1     |0.0       |
|Industrial and Provident Society                                                         |849999    |47    |0.0       |
|Investment Company with Variable Capital                                                 |849999    |51    |0.0       |
|Investment Company with Variable Capital (Securities)                                    |849999    |1     |0.0       |
|Investment Company with Variable Capital(Umbrella)                                       |849999    |19    |0.0       |
|Limited Liability Partnership                                                            |849999    |9144  |0.01      |
|Limited Partnership                                                                      |849999    |11051 |0.01      |
|Old Public Company                                                                       |849999    |13    |0.0       |
|Other Company Type                                                                       |849999    |1     |0.0       |
|Other company type                                                                       |849999    |2304  |0.0       |
|PRI/LBG/NSC (Private, Limited by guarantee, no share capital, use of 'Limited' exemption)|849999    |5842  |0.01      |
|PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital)                    |849999    |25038 |0.03      |
|PRIV LTD SECT. 30 (Private limited company, section 30 of the Companies Act)             |849999    |2     |0.0       |
|Private Limited Company                                                                  |849999    |784661|0.92      |
|Private Unlimited                                                                        |849999    |19    |0.0       |
|Private Unlimited Company                                                                |849999    |691   |0.0       |
|Protected Cell Company                                                                   |849999    |1     |0.0       |
|Public Limited Company                                                                   |849999    |1118  |0.0       |
+-----------------------------------------------------------------------------------------+----------+------+----------+

None
+------------------------------------------------+----------+------+----------+
|CompanyStatus                                   |total_rows|count |proportion|
+------------------------------------------------+----------+------+----------+
|ADMINISTRATION ORDER                            |849999    |28    |0.0       |
|ADMINISTRATIVE RECEIVER                         |849999    |17    |0.0       |
|Active                                          |849999    |808781|0.95      |
|Active - Proposal to Strike off                 |849999    |23946 |0.03      |
|In Administration                               |849999    |508   |0.0       |
|In Administration/Administrative Receiver       |849999    |43    |0.0       |
|In Administration/Receiver Manager              |849999    |5     |0.0       |
|Liquidation                                     |849999    |16241 |0.02      |
|Live but Receiver Manager on at least one charge|849999    |225   |0.0       |
|RECEIVER MANAGER / ADMINISTRATIVE RECEIVER      |849999    |5     |0.0       |
|RECEIVERSHIP                                    |849999    |35    |0.0       |
|Voluntary Arrangement                           |849999    |165   |0.0       |
+------------------------------------------------+----------+------+----------+

None
+------------------+----------+-----+----------+
|RegAddressPostCode|total_rows|count|proportion|
+------------------+----------+-----+----------+
|null              |849999    |9978 |0.01      |
|0161              |849999    |1    |0.0       |
|02 806            |849999    |1    |0.0       |
|04640             |849999    |1    |0.0       |
|049 318           |849999    |1    |0.0       |
|08540             |849999    |1    |0.0       |
|0L16 1QE          |849999    |1    |0.0       |
|1                 |849999    |1    |0.0       |
|10 004            |849999    |1    |0.0       |
|100041            |849999    |1    |0.0       |
|10010             |849999    |1    |0.0       |
|10018             |849999    |1    |0.0       |
|1010              |849999    |1    |0.0       |
|1018 GE           |849999    |1    |0.0       |
|1050              |849999    |1    |0.0       |
|110042            |849999    |1    |0.0       |
|1160              |849999    |1    |0.0       |
|12 0AF            |849999    |1    |0.0       |
|121006            |849999    |1    |0.0       |
|13900             |849999    |1    |0.0       |
+------------------+----------+-----+----------+

None
+---------------------------+----------+------+----------+
|AccountsAccountCategory    |total_rows|count |proportion|
+---------------------------+----------+------+----------+
|null                       |849999    |240774|0.28      |
|ACCOUNTS TYPE NOT AVAILABLE|849999    |866   |0.0       |
|AUDIT EXEMPTION SUBSIDIARY |849999    |2253  |0.0       |
|AUDITED ABRIDGED           |849999    |249   |0.0       |
|DORMANT                    |849999    |104986|0.12      |
|FILING EXEMPTION SUBSIDIARY|849999    |70    |0.0       |
|FULL                       |849999    |17019 |0.02      |
|GROUP                      |849999    |4046  |0.0       |
|INITIAL                    |849999    |1     |0.0       |
|MEDIUM                     |849999    |76    |0.0       |
|NO ACCOUNTS FILED          |849999    |236789|0.28      |
|PARTIAL EXEMPTION          |849999    |4     |0.0       |
|SMALL                      |849999    |11822 |0.01      |
|TOTAL EXEMPTION FULL       |849999    |201577|0.24      |
|TOTAL EXEMPTION SMALL      |849999    |3789  |0.0       |
|UNAUDITED ABRIDGED         |849999    |25678 |0.03      |
+---------------------------+----------+------+----------+

None
+------------------------------------------------------------------------+----------+-----+----------+
|SICCodeSicText_1                                                        |total_rows|count|proportion|
+------------------------------------------------------------------------+----------+-----+----------+
|0111 - Grow cereals & other crops                                       |849999    |2    |0.0       |
|01110 - Growing of cereals (except rice), leguminous crops and oil seeds|849999    |672  |0.0       |
|0112 - Grow vegetables & nursery products                               |849999    |1    |0.0       |
|01120 - Growing of rice                                                 |849999    |13   |0.0       |
|01130 - Growing of vegetables and melons, roots and tubers              |849999    |193  |0.0       |
|01140 - Growing of sugar cane                                           |849999    |7    |0.0       |
|01150 - Growing of tobacco                                              |849999    |7    |0.0       |
|01160 - Growing of fibre crops                                          |849999    |19   |0.0       |
|01190 - Growing of other non-perennial crops                            |849999    |52   |0.0       |
|01210 - Growing of grapes                                               |849999    |20   |0.0       |
|01220 - Growing of tropical and subtropical fruits                      |849999    |9    |0.0       |
|01230 - Growing of citrus fruits                                        |849999    |2    |0.0       |
|01240 - Growing of pome fruits and stone fruits                         |849999    |36   |0.0       |
|01250 - Growing of other tree and bush fruits and nuts                  |849999    |63   |0.0       |
|01260 - Growing of oleaginous fruits                                    |849999    |5    |0.0       |
|01270 - Growing of beverage crops                                       |849999    |7    |0.0       |
|01280 - Growing of spices, aromatic, drug and pharmaceutical crops      |849999    |50   |0.0       |
|01290 - Growing of other perennial crops                                |849999    |44   |0.0       |
|0130 - Crops combined with animals, mixed farms                         |849999    |1    |0.0       |
|01300 - Plant propagation                                               |849999    |92   |0.0       |
+------------------------------------------------------------------------+----------+-----+----------+

None
Diagnostic Group By completed:  0:00:11.038049
#save dataframe as HIVE parquet table
df_ch.coalesce(1).write.format("parquet").mode("overwrite").saveAsTable("ch_20210404")

Identify data quality issues

issue_col = "qa_issues"
df = df_ch.withColumn(issue_col,F.lit(""))
    
postcode_cols = [
  "RegAddressPostCode"
]
    
for col in postcode_cols:
  df = df.withColumn(issue_col, dc.add_to_string_col(
      df[issue_col], f"check_postcode({col})",
      dc.check_postcode(df[col])))
  df.cache().count()

date_cols = [    
  'DissolutionDate',
  'IncorporationDate',
  'AccountsNextDueDate',
  'AccountsLastMadeUpDate',
  'ReturnsNextDueDate',
  'ReturnsLastMadeUpDate',
  'ConfStmtNextDueDate',
  'ConfStmtLastMadeUpDate'
]

for col in date_cols:
    df = df.withColumn(issue_col, dc.add_to_string_col(
      df[issue_col], f"check_date({col})",
      dc.check_date(df[col],"dd/MM/yyyy")))
    df.cache().count()

cols = [
  'CompanyName',
  'CompanyNumber',
  'RegAddressAddressLine1',
  'RegAddressPostCode',
  'CompanyCategory',
  'CompanyStatus',
  'CountryOfOrigin',
  'IncorporationDate',
  'MortgagesNumMortCharges',
  'MortgagesNumMortOutstanding',
  'MortgagesNumMortPartSatisfied',
  'MortgagesNumMortSatisfied',
  'SICCodeSicText_1',
  'LimitedPartnershipsNumGenPartners',
  'LimitedPartnershipsNumLimPartners',
  'ConfStmtNextDueDate'
]   

for col in cols:
    df = df.withColumn(issue_col, dc.add_to_string_col(
      df[issue_col], f"check_not_null({col})",
      dc.check_not_null(df[col])))
    df.cache().count()

df_issues = df.filter(F.col('qa_issues') != '')

Result

print(df_issues.count())
df_issues.show(5,truncate=False,vertical=True)

13343
-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------
 CompanyName                       | "20-20 VOICE" CANCER                                                                                          
 CompanyNumber                     | CE000952                                                                                                      
 RegAddressCareOf                  | null                                                                                                          
 RegAddressPOBox                   | null                                                                                                          
 RegAddressAddressLine1            | null                                                                                                          
 RegAddressAddressLine2            | null                                                                                                          
 RegAddressPostTown                | null                                                                                                          
 RegAddressCounty                  | null                                                                                                          
 RegAddressCountry                 | null                                                                                                          
 RegAddressPostCode                | null                                                                                                          
 CompanyCategory                   | Charitable Incorporated Organisation                                                                          
 CompanyStatus                     | Active                                                                                                        
 CountryOfOrigin                   | United Kingdom                                                                                                
 DissolutionDate                   | null                                                                                                          
 IncorporationDate                 | 17/04/2014                                                                                                    
 AccountsAccountRefDay             | null                                                                                                          
 AccountsAccountRefMonth           | null                                                                                                          
 AccountsNextDueDate               | null                                                                                                          
 AccountsLastMadeUpDate            | null                                                                                                          
 AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
 ReturnsNextDueDate                | null                                                                                                          
 ReturnsLastMadeUpDate             | null                                                                                                          
 MortgagesNumMortCharges           | 0                                                                                                             
 MortgagesNumMortOutstanding       | 0                                                                                                             
 MortgagesNumMortPartSatisfied     | 0                                                                                                             
 MortgagesNumMortSatisfied         | 0                                                                                                             
 SICCodeSicText_1                  | None Supplied                                                                                                 
 SICCodeSicText_2                  | null                                                                                                          
 SICCodeSicText_3                  | null                                                                                                          
 SICCodeSicText_4                  | null                                                                                                          
 LimitedPartnershipsNumGenPartners | 0                                                                                                             
 LimitedPartnershipsNumLimPartners | 0                                                                                                             
 URI                               | http://business.data.gov.uk/id/company/CE000952                                                               
 PreviousName_1CONDATE             | null                                                                                                          
 PreviousName_1CompanyName         | null                                                                                                          
 PreviousName_2CONDATE             | null                                                                                                          
 PreviousName_2CompanyName         | null                                                                                                          
 PreviousName_3CONDATE             | null                                                                                                          
 PreviousName_3CompanyName         | null                                                                                                          
 PreviousName_4CONDATE             | null                                                                                                          
 PreviousName_4CompanyName         | null                                                                                                          
 PreviousName_5CONDATE             | null                                                                                                          
 PreviousName_5CompanyName         | null                                                                                                          
 PreviousName_6CONDATE             | null                                                                                                          
 PreviousName_6CompanyName         | null                                                                                                          
 PreviousName_7CONDATE             | null                                                                                                          
 PreviousName_7CompanyName         | null                                                                                                          
 PreviousName_8CONDATE             | null                                                                                                          
 PreviousName_8CompanyName         | null                                                                                                          
 PreviousName_9CONDATE             | null                                                                                                          
 PreviousName_9CompanyName         | null                                                                                                          
 PreviousName_10CONDATE            | null                                                                                                          
 PreviousName_10CompanyName        | null                                                                                                          
 ConfStmtNextDueDate               | null                                                                                                          
 ConfStmtLastMadeUpDate            | null                                                                                                          
 qa_issues                         | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate) 
-RECORD 1------------------------------------------------------------------------------------------------------------------------------------------
 CompanyName                       | "ALTAI CASHMERE" LLC                                                                                          
 CompanyNumber                     | FC027187                                                                                                      
 RegAddressCareOf                  | null                                                                                                          
 RegAddressPOBox                   | null                                                                                                          
 RegAddressAddressLine1            | BUILDING LEFT TO THE "TUUL"                                                                                   
 RegAddressAddressLine2            | DRY CLEANING                                                                                                  
 RegAddressPostTown                | KHAN-UUL DISTRICT                                                                                             
 RegAddressCounty                  | ULAANBAATAR, MONGOLIA                                                                                         
 RegAddressCountry                 | MONGOLIA                                                                                                      
 RegAddressPostCode                | null                                                                                                          
 CompanyCategory                   | Other company type                                                                                            
 CompanyStatus                     | Active                                                                                                        
 CountryOfOrigin                   | MONGOLIA                                                                                                      
 DissolutionDate                   | null                                                                                                          
 IncorporationDate                 | 14/06/2006                                                                                                    
 AccountsAccountRefDay             | 30                                                                                                            
 AccountsAccountRefMonth           | 6                                                                                                             
 AccountsNextDueDate               | null                                                                                                          
 AccountsLastMadeUpDate            | null                                                                                                          
 AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
 ReturnsNextDueDate                | null                                                                                                          
 ReturnsLastMadeUpDate             | null                                                                                                          
 MortgagesNumMortCharges           | 0                                                                                                             
 MortgagesNumMortOutstanding       | 0                                                                                                             
 MortgagesNumMortPartSatisfied     | 0                                                                                                             
 MortgagesNumMortSatisfied         | 0                                                                                                             
 SICCodeSicText_1                  | None Supplied                                                                                                 
 SICCodeSicText_2                  | null                                                                                                          
 SICCodeSicText_3                  | null                                                                                                          
 SICCodeSicText_4                  | null                                                                                                          
 LimitedPartnershipsNumGenPartners | 0                                                                                                             
 LimitedPartnershipsNumLimPartners | 0                                                                                                             
 URI                               | http://business.data.gov.uk/id/company/FC027187                                                               
 PreviousName_1CONDATE             | null                                                                                                          
 PreviousName_1CompanyName         | null                                                                                                          
 PreviousName_2CONDATE             | null                                                                                                          
 PreviousName_2CompanyName         | null                                                                                                          
 PreviousName_3CONDATE             | null                                                                                                          
 PreviousName_3CompanyName         | null                                                                                                          
 PreviousName_4CONDATE             | null                                                                                                          
 PreviousName_4CompanyName         | null                                                                                                          
 PreviousName_5CONDATE             | null                                                                                                          
 PreviousName_5CompanyName         | null                                                                                                          
 PreviousName_6CONDATE             | null                                                                                                          
 PreviousName_6CompanyName         | null                                                                                                          
 PreviousName_7CONDATE             | null                                                                                                          
 PreviousName_7CompanyName         | null                                                                                                          
 PreviousName_8CONDATE             | null                                                                                                          
 PreviousName_8CompanyName         | null                                                                                                          
 PreviousName_9CONDATE             | null                                                                                                          
 PreviousName_9CompanyName         | null                                                                                                          
 PreviousName_10CONDATE            | null                                                                                                          
 PreviousName_10CompanyName        | null                                                                                                          
 ConfStmtNextDueDate               | 26/12/2016                                                                                                    
 ConfStmtLastMadeUpDate            | null                                                                                                          
 qa_issues                         | check_not_null(RegAddressPostCode)                                                                            
-RECORD 2------------------------------------------------------------------------------------------------------------------------------------------
 CompanyName                       | "ANDI'S GIFT"                                                                                                 
 CompanyNumber                     | CE012989                                                                                                      
 RegAddressCareOf                  | null                                                                                                          
 RegAddressPOBox                   | null                                                                                                          
 RegAddressAddressLine1            | null                                                                                                          
 RegAddressAddressLine2            | null                                                                                                          
 RegAddressPostTown                | null                                                                                                          
 RegAddressCounty                  | null                                                                                                          
 RegAddressCountry                 | null                                                                                                          
 RegAddressPostCode                | null                                                                                                          
 CompanyCategory                   | Charitable Incorporated Organisation                                                                          
 CompanyStatus                     | Active                                                                                                        
 CountryOfOrigin                   | United Kingdom                                                                                                
 DissolutionDate                   | null                                                                                                          
 IncorporationDate                 | 22/01/2018                                                                                                    
 AccountsAccountRefDay             | null                                                                                                          
 AccountsAccountRefMonth           | null                                                                                                          
 AccountsNextDueDate               | null                                                                                                          
 AccountsLastMadeUpDate            | null                                                                                                          
 AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
 ReturnsNextDueDate                | null                                                                                                          
 ReturnsLastMadeUpDate             | null                                                                                                          
 MortgagesNumMortCharges           | 0                                                                                                             
 MortgagesNumMortOutstanding       | 0                                                                                                             
 MortgagesNumMortPartSatisfied     | 0                                                                                                             
 MortgagesNumMortSatisfied         | 0                                                                                                             
 SICCodeSicText_1                  | None Supplied                                                                                                 
 SICCodeSicText_2                  | null                                                                                                          
 SICCodeSicText_3                  | null                                                                                                          
 SICCodeSicText_4                  | null                                                                                                          
 LimitedPartnershipsNumGenPartners | 0                                                                                                             
 LimitedPartnershipsNumLimPartners | 0                                                                                                             
 URI                               | http://business.data.gov.uk/id/company/CE012989                                                               
 PreviousName_1CONDATE             | null                                                                                                          
 PreviousName_1CompanyName         | null                                                                                                          
 PreviousName_2CONDATE             | null                                                                                                          
 PreviousName_2CompanyName         | null                                                                                                          
 PreviousName_3CONDATE             | null                                                                                                          
 PreviousName_3CompanyName         | null                                                                                                          
 PreviousName_4CONDATE             | null                                                                                                          
 PreviousName_4CompanyName         | null                                                                                                          
 PreviousName_5CONDATE             | null                                                                                                          
 PreviousName_5CompanyName         | null                                                                                                          
 PreviousName_6CONDATE             | null                                                                                                          
 PreviousName_6CompanyName         | null                                                                                                          
 PreviousName_7CONDATE             | null                                                                                                          
 PreviousName_7CompanyName         | null                                                                                                          
 PreviousName_8CONDATE             | null                                                                                                          
 PreviousName_8CompanyName         | null                                                                                                          
 PreviousName_9CONDATE             | null                                                                                                          
 PreviousName_9CompanyName         | null                                                                                                          
 PreviousName_10CONDATE            | null                                                                                                          
 PreviousName_10CompanyName        | null                                                                                                          
 ConfStmtNextDueDate               | null                                                                                                          
 ConfStmtLastMadeUpDate            | null                                                                                                          
 qa_issues                         | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate) 
-RECORD 3------------------------------------------------------------------------------------------------------------------------------------------
 CompanyName                       | "ARTHUR BALFOUR",CONSERVATIVE WORKING MEN'S CLUB LIMITED                                                      
 CompanyNumber                     | IP10067R                                                                                                      
 RegAddressCareOf                  | null                                                                                                          
 RegAddressPOBox                   | null                                                                                                          
 RegAddressAddressLine1            | null                                                                                                          
 RegAddressAddressLine2            | null                                                                                                          
 RegAddressPostTown                | null                                                                                                          
 RegAddressCounty                  | null                                                                                                          
 RegAddressCountry                 | null                                                                                                          
 RegAddressPostCode                | null                                                                                                          
 CompanyCategory                   | Registered Society                                                                                            
 CompanyStatus                     | Active                                                                                                        
 CountryOfOrigin                   | United Kingdom                                                                                                
 DissolutionDate                   | null                                                                                                          
 IncorporationDate                 | 01/01/1981                                                                                                    
 AccountsAccountRefDay             | null                                                                                                          
 AccountsAccountRefMonth           | null                                                                                                          
 AccountsNextDueDate               | null                                                                                                          
 AccountsLastMadeUpDate            | null                                                                                                          
 AccountsAccountCategory           | NO ACCOUNTS FILED                                                                                             
 ReturnsNextDueDate                | null                                                                                                          
 ReturnsLastMadeUpDate             | null                                                                                                          
 MortgagesNumMortCharges           | 0                                                                                                             
 MortgagesNumMortOutstanding       | 0                                                                                                             
 MortgagesNumMortPartSatisfied     | 0                                                                                                             
 MortgagesNumMortSatisfied         | 0                                                                                                             
 SICCodeSicText_1                  | None Supplied                                                                                                 
 SICCodeSicText_2                  | null                                                                                                          
 SICCodeSicText_3                  | null                                                                                                          
 SICCodeSicText_4                  | null                                                                                                          
 LimitedPartnershipsNumGenPartners | 0                                                                                                             
 LimitedPartnershipsNumLimPartners | 0                                                                                                             
 URI                               | http://business.data.gov.uk/id/company/IP10067R                                                               
 PreviousName_1CONDATE             | null                                                                                                          
 PreviousName_1CompanyName         | null                                                                                                          
 PreviousName_2CONDATE             | null                                                                                                          
 PreviousName_2CompanyName         | null                                                                                                          
 PreviousName_3CONDATE             | null                                                                                                          
 PreviousName_3CompanyName         | null                                                                                                          
 PreviousName_4CONDATE             | null                                                                                                          
 PreviousName_4CompanyName         | null                                                                                                          
 PreviousName_5CONDATE             | null                                                                                                          
 PreviousName_5CompanyName         | null                                                                                                          
 PreviousName_6CONDATE             | null                                                                                                          
 PreviousName_6CompanyName         | null                                                                                                          
 PreviousName_7CONDATE             | null                                                                                                          
 PreviousName_7CompanyName         | null                                                                                                          
 PreviousName_8CONDATE             | null                                                                                                          
 PreviousName_8CompanyName         | null                                                                                                          
 PreviousName_9CONDATE             | null                                                                                                          
 PreviousName_9CompanyName         | null                                                                                                          
 PreviousName_10CONDATE            | null                                                                                                          
 PreviousName_10CompanyName        | null                                                                                                          
 ConfStmtNextDueDate               | 15/01/2017                                                                                                    
 ConfStmtLastMadeUpDate            | null                                                                                                          
 qa_issues                         | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode)                                     
-RECORD 4------------------------------------------------------------------------------------------------------------------------------------------
 CompanyName                       | "AUTOCONTEX" AUTOMOBIL CONTAINER EXPRESS GMBH                                                                 
 CompanyNumber                     | FC018262                                                                                                      
 RegAddressCareOf                  | null                                                                                                          
 RegAddressPOBox                   | null                                                                                                          
 RegAddressAddressLine1            | BRANCH REGISTRATION                                                                                           
 RegAddressAddressLine2            | REFER TO PARENT REGISTRY                                                                                      
 RegAddressPostTown                | null                                                                                                          
 RegAddressCounty                  | null                                                                                                          
 RegAddressCountry                 | GERMANY                                                                                                       
 RegAddressPostCode                | null                                                                                                          
 CompanyCategory                   | Other company type                                                                                            
 CompanyStatus                     | Active                                                                                                        
 CountryOfOrigin                   | WEST GERMANY                                                                                                  
 DissolutionDate                   | null                                                                                                          
 IncorporationDate                 | 01/11/1994                                                                                                    
 AccountsAccountRefDay             | 30                                                                                                            
 AccountsAccountRefMonth           | 11                                                                                                            
 AccountsNextDueDate               | null                                                                                                          
 AccountsLastMadeUpDate            | 31/12/2002                                                                                                    
 AccountsAccountCategory           | FULL                                                                                                          
 ReturnsNextDueDate                | null                                                                                                          
 ReturnsLastMadeUpDate             | null                                                                                                          
 MortgagesNumMortCharges           | 2                                                                                                             
 MortgagesNumMortOutstanding       | 2                                                                                                             
 MortgagesNumMortPartSatisfied     | 0                                                                                                             
 MortgagesNumMortSatisfied         | 0                                                                                                             
 SICCodeSicText_1                  | None Supplied                                                                                                 
 SICCodeSicText_2                  | null                                                                                                          
 SICCodeSicText_3                  | null                                                                                                          
 SICCodeSicText_4                  | null                                                                                                          
 LimitedPartnershipsNumGenPartners | 0                                                                                                             
 LimitedPartnershipsNumLimPartners | 0                                                                                                             
 URI                               | http://business.data.gov.uk/id/company/FC018262                                                               
 PreviousName_1CONDATE             | null                                                                                                          
 PreviousName_1CompanyName         | null                                                                                                          
 PreviousName_2CONDATE             | null                                                                                                          
 PreviousName_2CompanyName         | null                                                                                                          
 PreviousName_3CONDATE             | null                                                                                                          
 PreviousName_3CompanyName         | null                                                                                                          
 PreviousName_4CONDATE             | null                                                                                                          
 PreviousName_4CompanyName         | null                                                                                                          
 PreviousName_5CONDATE             | null                                                                                                          
 PreviousName_5CompanyName         | null                                                                                                          
 PreviousName_6CONDATE             | null                                                                                                          
 PreviousName_6CompanyName         | null                                                                                                          
 PreviousName_7CONDATE             | null                                                                                                          
 PreviousName_7CompanyName         | null                                                                                                          
 PreviousName_8CONDATE             | null                                                                                                          
 PreviousName_8CompanyName         | null                                                                                                          
 PreviousName_9CONDATE             | null                                                                                                          
 PreviousName_9CompanyName         | null                                                                                                          
 PreviousName_10CONDATE            | null                                                                                                          
 PreviousName_10CompanyName        | null                                                                                                          
 ConfStmtNextDueDate               | 01/12/2016                                                                                                    
 ConfStmtLastMadeUpDate            | null                                                                                                          
 qa_issues                         | check_not_null(RegAddressPostCode)                                                                            
only showing top 5 rows
#save issues as a HIVE table for later processing
df_issues.coalesce(1).write.format("parquet").mode("overwrite").saveAsTable("ch_20210404_issues")

stmt_issues = f"""
SELECT qa_issues,count(*) as no_rows
from ch_20210404_issues group by 1 order by no_rows"""
spark.sql(stmt_issues).show(truncate=False,vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------
 qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(ConfStmtNextDueDate)                                    
 no_rows   | 1                                                                                                             
-RECORD 1------------------------------------------------------------------------------------------------------------------
 qa_issues | check_not_null(RegAddressAddressLine1)                                                                        
 no_rows   | 6                                                                                                             
-RECORD 2------------------------------------------------------------------------------------------------------------------
 qa_issues | check_postcode(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)                                        
 no_rows   | 96                                                                                                            
-RECORD 3------------------------------------------------------------------------------------------------------------------
 qa_issues | check_postcode(RegAddressPostCode)                                                                            
 no_rows   | 248                                                                                                           
-RECORD 4------------------------------------------------------------------------------------------------------------------
 qa_issues | check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)                                        
 no_rows   | 966                                                                                                           
-RECORD 5------------------------------------------------------------------------------------------------------------------
 qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode)                                     
 no_rows   | 1700                                                                                                          
-RECORD 6------------------------------------------------------------------------------------------------------------------
 qa_issues | check_not_null(RegAddressPostCode)                                                                            
 no_rows   | 2730                                                                                                          
-RECORD 7------------------------------------------------------------------------------------------------------------------
 qa_issues | check_not_null(ConfStmtNextDueDate)                                                                           
 no_rows   | 3014                                                                                                          
-RECORD 8------------------------------------------------------------------------------------------------------------------
 qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate) 
 no_rows   | 4582                                                                                                          
stmt_issues = f"""
SELECT 
distinct explode(split(qa_issues, ',')) AS qa_history_issues,
substring_index(qa_history_issues, "(", 1) AS qa_check,
replace(substring_index(qa_history_issues, "(", -1),')','') AS field_name
from ch_20210404_issues order by 2,3"""
df_grp = spark.sql(stmt_issues)
df_grp.show(truncate=False)

+--------------------------------------+--------------+----------------------+
|qa_history_issues                     |qa_check      |field_name            |
+--------------------------------------+--------------+----------------------+
|check_not_null(ConfStmtNextDueDate)   |check_not_null|ConfStmtNextDueDate   |
|check_not_null(RegAddressAddressLine1)|check_not_null|RegAddressAddressLine1|
|check_not_null(RegAddressPostCode)    |check_not_null|RegAddressPostCode    |
|check_postcode(RegAddressPostCode)    |check_postcode|RegAddressPostCode    |
+--------------------------------------+--------------+----------------------+