We will use basic company data (http://download.companieshouse.gov.uk/en_output.html) for this article. A PySpark script is written for carrying out minimal data quality checks on the dataset from the Companies House.
The downloadable data snapshot containing basic company data of live companies on the Companies House register. This snapshot is provided as ZIP files containing data in CSV format and is split into multiple files.
Data Structure
First, look at the data and see the structure of the dataset. Perform basic checks to identify duplicates, null, missing values, etc.
A list of the data fields contained in the snapshot can be found here PDF
ch_file = '[DIR_PATH]/BasicCompanyData-2021-04-01-part1_6.csv'
df_ch = spark.read.option("header",True).option("multiline",True).option("escape", "\"").csv(ch_file)
df_ch.dtypes
[('CompanyName', 'string'),
('CompanyNumber', 'string'),
('RegAddressCareOf', 'string'),
('RegAddressPOBox', 'string'),
('RegAddressAddressLine1', 'string'),
('RegAddressAddressLine2', 'string'),
('RegAddressPostTown', 'string'),
('RegAddressCounty', 'string'),
('RegAddressCountry', 'string'),
('RegAddressPostCode', 'string'),
('CompanyCategory', 'string'),
('CompanyStatus', 'string'),
('CountryOfOrigin', 'string'),
('DissolutionDate', 'string'),
('IncorporationDate', 'string'),
('AccountsAccountRefDay', 'string'),
('AccountsAccountRefMonth', 'string'),
('AccountsNextDueDate', 'string'),
('AccountsLastMadeUpDate', 'string'),
('AccountsAccountCategory', 'string'),
('ReturnsNextDueDate', 'string'),
('ReturnsLastMadeUpDate', 'string'),
('MortgagesNumMortCharges', 'string'),
('MortgagesNumMortOutstanding', 'string'),
('MortgagesNumMortPartSatisfied', 'string'),
('MortgagesNumMortSatisfied', 'string'),
('SICCodeSicText_1', 'string'),
('SICCodeSicText_2', 'string'),
('SICCodeSicText_3', 'string'),
('SICCodeSicText_4', 'string'),
('LimitedPartnershipsNumGenPartners', 'string'),
('LimitedPartnershipsNumLimPartners', 'string'),
('URI', 'string'),
('PreviousName_1CONDATE', 'string'),
('PreviousName_1CompanyName', 'string'),
('PreviousName_2CONDATE', 'string'),
('PreviousName_2CompanyName', 'string'),
('PreviousName_3CONDATE', 'string'),
('PreviousName_3CompanyName', 'string'),
('PreviousName_4CONDATE', 'string'),
('PreviousName_4CompanyName', 'string'),
('PreviousName_5CONDATE', 'string'),
('PreviousName_5CompanyName', 'string'),
('PreviousName_6CONDATE', 'string'),
('PreviousName_6CompanyName', 'string'),
('PreviousName_7CONDATE', 'string'),
('PreviousName_7CompanyName', 'string'),
('PreviousName_8CONDATE', 'string'),
('PreviousName_8CompanyName', 'string'),
('PreviousName_9CONDATE', 'string'),
('PreviousName_9CompanyName', 'string'),
('PreviousName_10CONDATE', 'string'),
('PreviousName_10CompanyName', 'string'),
('ConfStmtNextDueDate', 'string'),
('ConfStmtLastMadeUpDate', 'string')]
"""
quick QA checks for null values, min and max length of each field values
"""
df = df_ch.agg(*[(F.count(F.when(F.isnull(c), c)).alias(c + '_null'))for c in df_ch.columns],
*[(F.max(F.length(c))).alias(c + '_max')for c in df_ch.columns],
*[(F.min(F.length(c))).alias(c + '_min')for c in df_ch.columns])
# max and min length of each field values
df.show(truncate=False,vertical=True)
-RECORD 0----------------------------------------
CompanyName_null | 0
CompanyNumber_null | 0
RegAddressCareOf_null | 837999
RegAddressPOBox_null | 847719
RegAddressAddressLine1_null | 6289
RegAddressAddressLine2_null | 318447
RegAddressPostTown_null | 17085
RegAddressCounty_null | 542944
RegAddressCountry_null | 258467
RegAddressPostCode_null | 9978
CompanyCategory_null | 0
CompanyStatus_null | 0
CountryOfOrigin_null | 0
DissolutionDate_null | 849999
IncorporationDate_null | 0
AccountsAccountRefDay_null | 18072
AccountsAccountRefMonth_null | 18072
AccountsNextDueDate_null | 20591
AccountsLastMadeUpDate_null | 236837
AccountsAccountCategory_null | 240774
ReturnsNextDueDate_null | 21452
ReturnsLastMadeUpDate_null | 470009
MortgagesNumMortCharges_null | 0
MortgagesNumMortOutstanding_null | 0
MortgagesNumMortPartSatisfied_null | 0
MortgagesNumMortSatisfied_null | 0
SICCodeSicText_1_null | 0
SICCodeSicText_2_null | 736926
SICCodeSicText_3_null | 804878
SICCodeSicText_4_null | 830027
LimitedPartnershipsNumGenPartners_null | 0
LimitedPartnershipsNumLimPartners_null | 0
URI_null | 0
PreviousName_1CONDATE_null | 764037
PreviousName_1CompanyName_null | 764037
PreviousName_2CONDATE_null | 835087
PreviousName_2CompanyName_null | 835087
PreviousName_3CONDATE_null | 846750
PreviousName_3CompanyName_null | 846750
PreviousName_4CONDATE_null | 849197
PreviousName_4CompanyName_null | 849197
PreviousName_5CONDATE_null | 849741
PreviousName_5CompanyName_null | 849741
PreviousName_6CONDATE_null | 849926
PreviousName_6CompanyName_null | 849926
PreviousName_7CONDATE_null | 849975
PreviousName_7CompanyName_null | 849975
PreviousName_8CONDATE_null | 849987
PreviousName_8CompanyName_null | 849987
PreviousName_9CONDATE_null | 849992
PreviousName_9CompanyName_null | 849992
PreviousName_10CONDATE_null | 849994
PreviousName_10CompanyName_null | 849994
ConfStmtNextDueDate_null | 8659
ConfStmtLastMadeUpDate_null | 177697
CompanyName_max | 160
CompanyNumber_max | 8
RegAddressCareOf_max | 65
RegAddressPOBox_max | 10
RegAddressAddressLine1_max | 99
RegAddressAddressLine2_max | 56
RegAddressPostTown_max | 41
RegAddressCounty_max | 50
RegAddressCountry_max | 24
RegAddressPostCode_max | 14
CompanyCategory_max | 89
CompanyStatus_max | 48
CountryOfOrigin_max | 23
DissolutionDate_max | null
IncorporationDate_max | 10
AccountsAccountRefDay_max | 2
AccountsAccountRefMonth_max | 2
AccountsNextDueDate_max | 10
AccountsLastMadeUpDate_max | 10
AccountsAccountCategory_max | 27
ReturnsNextDueDate_max | 10
ReturnsLastMadeUpDate_max | 10
MortgagesNumMortCharges_max | 4
MortgagesNumMortOutstanding_max | 4
MortgagesNumMortPartSatisfied_max | 2
MortgagesNumMortSatisfied_max | 4
SICCodeSicText_1_max | 166
SICCodeSicText_2_max | 166
SICCodeSicText_3_max | 166
SICCodeSicText_4_max | 166
LimitedPartnershipsNumGenPartners_max | 2
LimitedPartnershipsNumLimPartners_max | 3
URI_max | 47
PreviousName_1CONDATE_max | 10
PreviousName_1CompanyName_max | 114
PreviousName_2CONDATE_max | 10
PreviousName_2CompanyName_max | 91
PreviousName_3CONDATE_max | 10
PreviousName_3CompanyName_max | 88
PreviousName_4CONDATE_max | 10
PreviousName_4CompanyName_max | 72
PreviousName_5CONDATE_max | 10
PreviousName_5CompanyName_max | 58
PreviousName_6CONDATE_max | 10
PreviousName_6CompanyName_max | 49
PreviousName_7CONDATE_max | 10
PreviousName_7CompanyName_max | 42
PreviousName_8CONDATE_max | 10
PreviousName_8CompanyName_max | 35
PreviousName_9CONDATE_max | 10
PreviousName_9CompanyName_max | 42
PreviousName_10CONDATE_max | 10
PreviousName_10CompanyName_max | 32
ConfStmtNextDueDate_max | 10
ConfStmtLastMadeUpDate_max | 10
CompanyName_min | 1
CompanyNumber_min | 8
RegAddressCareOf_min | 1
RegAddressPOBox_min | 1
RegAddressAddressLine1_min | 1
RegAddressAddressLine2_min | 1
RegAddressPostTown_min | 1
RegAddressCounty_min | 1
RegAddressCountry_min | 4
RegAddressPostCode_min | 1
CompanyCategory_min | 16
CompanyStatus_min | 6
CountryOfOrigin_min | 1
DissolutionDate_min | null
IncorporationDate_min | 10
AccountsAccountRefDay_min | 1
AccountsAccountRefMonth_min | 1
AccountsNextDueDate_min | 10
AccountsLastMadeUpDate_min | 10
AccountsAccountCategory_min | 4
ReturnsNextDueDate_min | 10
ReturnsLastMadeUpDate_min | 10
MortgagesNumMortCharges_min | 1
MortgagesNumMortOutstanding_min | 1
MortgagesNumMortPartSatisfied_min | 1
MortgagesNumMortSatisfied_min | 1
SICCodeSicText_1_min | 11
SICCodeSicText_2_min | 11
SICCodeSicText_3_min | 11
SICCodeSicText_4_min | 11
LimitedPartnershipsNumGenPartners_min | 1
LimitedPartnershipsNumLimPartners_min | 1
URI_min | 47
PreviousName_1CONDATE_min | 10
PreviousName_1CompanyName_min | 3
PreviousName_2CONDATE_min | 10
PreviousName_2CompanyName_min | 4
PreviousName_3CONDATE_min | 10
PreviousName_3CompanyName_min | 6
PreviousName_4CONDATE_min | 10
PreviousName_4CompanyName_min | 8
PreviousName_5CONDATE_min | 10
PreviousName_5CompanyName_min | 7
PreviousName_6CONDATE_min | 10
PreviousName_6CompanyName_min | 8
PreviousName_7CONDATE_min | 10
PreviousName_7CompanyName_min | 10
PreviousName_8CONDATE_min | 10
PreviousName_8CompanyName_min | 11
PreviousName_9CONDATE_min | 10
PreviousName_9CompanyName_min | 16
PreviousName_10CONDATE_min | 10
PreviousName_10CompanyName_min | 13
ConfStmtNextDueDate_min | 10
ConfStmtLastMadeUpDate_min | 10
Diagnostic GroupBy started(limit 20 records): 202104061544
+-------------------------------------------+----------+-----+----------+
|CompanyName |total_rows|count|proportion|
+-------------------------------------------+----------+-----+----------+
|! LIMITED |849999 |1 |0.0 |
|! LTD |849999 |1 |0.0 |
|!? LTD |849999 |1 |0.0 |
|!BIG IMPACT GRAPHICS LIMITED |849999 |1 |0.0 |
|!L PRODUCTIONS LIMITED |849999 |1 |0.0 |
|!NKED LTD |849999 |1 |0.0 |
|!NNOV8 LIMITED |849999 |1 |0.0 |
|!NSPIRED INVESTMENTS LTD |849999 |1 |0.0 |
|!NSPIRED LTD |849999 |1 |0.0 |
|!NVERTD DESIGNS LIMITED |849999 |1 |0.0 |
|!OBAC LIMITED |849999 |1 |0.0 |
|!OBAC UK LIMITED |849999 |1 |0.0 |
|!YOZO FASS LIMITED |849999 |1 |0.0 |
|" BORA " 2 LTD |849999 |1 |0.0 |
|"1 C O LIMITED" |849999 |1 |0.0 |
|"1ST RATE" PSYCHOLOGY SERVICES LTD |849999 |1 |0.0 |
|"2 ECOUTE" LIMITED |849999 |1 |0.0 |
|"20-20 VOICE" CANCER |849999 |1 |0.0 |
|"243 RUGBY ROAD MANAGEMENT COMPANY LIMITED"|849999 |1 |0.0 |
|"309" WEST END LANE MANAGEMENT LIMITED |849999 |1 |0.0 |
+-------------------------------------------+----------+-----+----------+
None
+-------------+----------+-----+----------+
|CompanyNumber|total_rows|count|proportion|
+-------------+----------+-----+----------+
|00000118 |849999 |1 |0.0 |
|00000133 |849999 |1 |0.0 |
|00000258 |849999 |1 |0.0 |
|00000529 |849999 |1 |0.0 |
|00000687 |849999 |1 |0.0 |
|00001122 |849999 |1 |0.0 |
|00001140 |849999 |1 |0.0 |
|00002119 |849999 |1 |0.0 |
|00002537 |849999 |1 |0.0 |
|00003002 |849999 |1 |0.0 |
|00004190 |849999 |1 |0.0 |
|00004606 |849999 |1 |0.0 |
|00005188 |849999 |1 |0.0 |
|00005295 |849999 |1 |0.0 |
|00005335 |849999 |1 |0.0 |
|00005728 |849999 |1 |0.0 |
|00006278 |849999 |1 |0.0 |
|00006470 |849999 |1 |0.0 |
|00006480 |849999 |1 |0.0 |
|00007241 |849999 |1 |0.0 |
+-------------+----------+-----+----------+
None
+-----------------------------------------------------------------------------------------+----------+------+----------+
|CompanyCategory |total_rows|count |proportion|
+-----------------------------------------------------------------------------------------+----------+------+----------+
|Charitable Incorporated Organisation |849999 |3493 |0.0 |
|Community Interest Company |849999 |3626 |0.0 |
|Converted/Closed |849999 |1 |0.0 |
|Industrial and Provident Society |849999 |47 |0.0 |
|Investment Company with Variable Capital |849999 |51 |0.0 |
|Investment Company with Variable Capital (Securities) |849999 |1 |0.0 |
|Investment Company with Variable Capital(Umbrella) |849999 |19 |0.0 |
|Limited Liability Partnership |849999 |9144 |0.01 |
|Limited Partnership |849999 |11051 |0.01 |
|Old Public Company |849999 |13 |0.0 |
|Other Company Type |849999 |1 |0.0 |
|Other company type |849999 |2304 |0.0 |
|PRI/LBG/NSC (Private, Limited by guarantee, no share capital, use of 'Limited' exemption)|849999 |5842 |0.01 |
|PRI/LTD BY GUAR/NSC (Private, limited by guarantee, no share capital) |849999 |25038 |0.03 |
|PRIV LTD SECT. 30 (Private limited company, section 30 of the Companies Act) |849999 |2 |0.0 |
|Private Limited Company |849999 |784661|0.92 |
|Private Unlimited |849999 |19 |0.0 |
|Private Unlimited Company |849999 |691 |0.0 |
|Protected Cell Company |849999 |1 |0.0 |
|Public Limited Company |849999 |1118 |0.0 |
+-----------------------------------------------------------------------------------------+----------+------+----------+
None
+------------------------------------------------+----------+------+----------+
|CompanyStatus |total_rows|count |proportion|
+------------------------------------------------+----------+------+----------+
|ADMINISTRATION ORDER |849999 |28 |0.0 |
|ADMINISTRATIVE RECEIVER |849999 |17 |0.0 |
|Active |849999 |808781|0.95 |
|Active - Proposal to Strike off |849999 |23946 |0.03 |
|In Administration |849999 |508 |0.0 |
|In Administration/Administrative Receiver |849999 |43 |0.0 |
|In Administration/Receiver Manager |849999 |5 |0.0 |
|Liquidation |849999 |16241 |0.02 |
|Live but Receiver Manager on at least one charge|849999 |225 |0.0 |
|RECEIVER MANAGER / ADMINISTRATIVE RECEIVER |849999 |5 |0.0 |
|RECEIVERSHIP |849999 |35 |0.0 |
|Voluntary Arrangement |849999 |165 |0.0 |
+------------------------------------------------+----------+------+----------+
None
+------------------+----------+-----+----------+
|RegAddressPostCode|total_rows|count|proportion|
+------------------+----------+-----+----------+
|null |849999 |9978 |0.01 |
|0161 |849999 |1 |0.0 |
|02 806 |849999 |1 |0.0 |
|04640 |849999 |1 |0.0 |
|049 318 |849999 |1 |0.0 |
|08540 |849999 |1 |0.0 |
|0L16 1QE |849999 |1 |0.0 |
|1 |849999 |1 |0.0 |
|10 004 |849999 |1 |0.0 |
|100041 |849999 |1 |0.0 |
|10010 |849999 |1 |0.0 |
|10018 |849999 |1 |0.0 |
|1010 |849999 |1 |0.0 |
|1018 GE |849999 |1 |0.0 |
|1050 |849999 |1 |0.0 |
|110042 |849999 |1 |0.0 |
|1160 |849999 |1 |0.0 |
|12 0AF |849999 |1 |0.0 |
|121006 |849999 |1 |0.0 |
|13900 |849999 |1 |0.0 |
+------------------+----------+-----+----------+
None
+---------------------------+----------+------+----------+
|AccountsAccountCategory |total_rows|count |proportion|
+---------------------------+----------+------+----------+
|null |849999 |240774|0.28 |
|ACCOUNTS TYPE NOT AVAILABLE|849999 |866 |0.0 |
|AUDIT EXEMPTION SUBSIDIARY |849999 |2253 |0.0 |
|AUDITED ABRIDGED |849999 |249 |0.0 |
|DORMANT |849999 |104986|0.12 |
|FILING EXEMPTION SUBSIDIARY|849999 |70 |0.0 |
|FULL |849999 |17019 |0.02 |
|GROUP |849999 |4046 |0.0 |
|INITIAL |849999 |1 |0.0 |
|MEDIUM |849999 |76 |0.0 |
|NO ACCOUNTS FILED |849999 |236789|0.28 |
|PARTIAL EXEMPTION |849999 |4 |0.0 |
|SMALL |849999 |11822 |0.01 |
|TOTAL EXEMPTION FULL |849999 |201577|0.24 |
|TOTAL EXEMPTION SMALL |849999 |3789 |0.0 |
|UNAUDITED ABRIDGED |849999 |25678 |0.03 |
+---------------------------+----------+------+----------+
None
+------------------------------------------------------------------------+----------+-----+----------+
|SICCodeSicText_1 |total_rows|count|proportion|
+------------------------------------------------------------------------+----------+-----+----------+
|0111 - Grow cereals & other crops |849999 |2 |0.0 |
|01110 - Growing of cereals (except rice), leguminous crops and oil seeds|849999 |672 |0.0 |
|0112 - Grow vegetables & nursery products |849999 |1 |0.0 |
|01120 - Growing of rice |849999 |13 |0.0 |
|01130 - Growing of vegetables and melons, roots and tubers |849999 |193 |0.0 |
|01140 - Growing of sugar cane |849999 |7 |0.0 |
|01150 - Growing of tobacco |849999 |7 |0.0 |
|01160 - Growing of fibre crops |849999 |19 |0.0 |
|01190 - Growing of other non-perennial crops |849999 |52 |0.0 |
|01210 - Growing of grapes |849999 |20 |0.0 |
|01220 - Growing of tropical and subtropical fruits |849999 |9 |0.0 |
|01230 - Growing of citrus fruits |849999 |2 |0.0 |
|01240 - Growing of pome fruits and stone fruits |849999 |36 |0.0 |
|01250 - Growing of other tree and bush fruits and nuts |849999 |63 |0.0 |
|01260 - Growing of oleaginous fruits |849999 |5 |0.0 |
|01270 - Growing of beverage crops |849999 |7 |0.0 |
|01280 - Growing of spices, aromatic, drug and pharmaceutical crops |849999 |50 |0.0 |
|01290 - Growing of other perennial crops |849999 |44 |0.0 |
|0130 - Crops combined with animals, mixed farms |849999 |1 |0.0 |
|01300 - Plant propagation |849999 |92 |0.0 |
+------------------------------------------------------------------------+----------+-----+----------+
None
Diagnostic Group By completed: 0:00:11.038049
#save dataframe as HIVE parquet table
df_ch.coalesce(1).write.format("parquet").mode("overwrite").saveAsTable("ch_20210404")
Identify data quality issues
issue_col = "qa_issues"
df = df_ch.withColumn(issue_col,F.lit(""))
postcode_cols = [
"RegAddressPostCode"
]
for col in postcode_cols:
df = df.withColumn(issue_col, dc.add_to_string_col(
df[issue_col], f"check_postcode({col})",
dc.check_postcode(df[col])))
df.cache().count()
date_cols = [
'DissolutionDate',
'IncorporationDate',
'AccountsNextDueDate',
'AccountsLastMadeUpDate',
'ReturnsNextDueDate',
'ReturnsLastMadeUpDate',
'ConfStmtNextDueDate',
'ConfStmtLastMadeUpDate'
]
for col in date_cols:
df = df.withColumn(issue_col, dc.add_to_string_col(
df[issue_col], f"check_date({col})",
dc.check_date(df[col],"dd/MM/yyyy")))
df.cache().count()
cols = [
'CompanyName',
'CompanyNumber',
'RegAddressAddressLine1',
'RegAddressPostCode',
'CompanyCategory',
'CompanyStatus',
'CountryOfOrigin',
'IncorporationDate',
'MortgagesNumMortCharges',
'MortgagesNumMortOutstanding',
'MortgagesNumMortPartSatisfied',
'MortgagesNumMortSatisfied',
'SICCodeSicText_1',
'LimitedPartnershipsNumGenPartners',
'LimitedPartnershipsNumLimPartners',
'ConfStmtNextDueDate'
]
for col in cols:
df = df.withColumn(issue_col, dc.add_to_string_col(
df[issue_col], f"check_not_null({col})",
dc.check_not_null(df[col])))
df.cache().count()
df_issues = df.filter(F.col('qa_issues') != '')
Result
print(df_issues.count())
df_issues.show(5,truncate=False,vertical=True)
13343
-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------
CompanyName | "20-20 VOICE" CANCER
CompanyNumber | CE000952
RegAddressCareOf | null
RegAddressPOBox | null
RegAddressAddressLine1 | null
RegAddressAddressLine2 | null
RegAddressPostTown | null
RegAddressCounty | null
RegAddressCountry | null
RegAddressPostCode | null
CompanyCategory | Charitable Incorporated Organisation
CompanyStatus | Active
CountryOfOrigin | United Kingdom
DissolutionDate | null
IncorporationDate | 17/04/2014
AccountsAccountRefDay | null
AccountsAccountRefMonth | null
AccountsNextDueDate | null
AccountsLastMadeUpDate | null
AccountsAccountCategory | NO ACCOUNTS FILED
ReturnsNextDueDate | null
ReturnsLastMadeUpDate | null
MortgagesNumMortCharges | 0
MortgagesNumMortOutstanding | 0
MortgagesNumMortPartSatisfied | 0
MortgagesNumMortSatisfied | 0
SICCodeSicText_1 | None Supplied
SICCodeSicText_2 | null
SICCodeSicText_3 | null
SICCodeSicText_4 | null
LimitedPartnershipsNumGenPartners | 0
LimitedPartnershipsNumLimPartners | 0
URI | http://business.data.gov.uk/id/company/CE000952
PreviousName_1CONDATE | null
PreviousName_1CompanyName | null
PreviousName_2CONDATE | null
PreviousName_2CompanyName | null
PreviousName_3CONDATE | null
PreviousName_3CompanyName | null
PreviousName_4CONDATE | null
PreviousName_4CompanyName | null
PreviousName_5CONDATE | null
PreviousName_5CompanyName | null
PreviousName_6CONDATE | null
PreviousName_6CompanyName | null
PreviousName_7CONDATE | null
PreviousName_7CompanyName | null
PreviousName_8CONDATE | null
PreviousName_8CompanyName | null
PreviousName_9CONDATE | null
PreviousName_9CompanyName | null
PreviousName_10CONDATE | null
PreviousName_10CompanyName | null
ConfStmtNextDueDate | null
ConfStmtLastMadeUpDate | null
qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)
-RECORD 1------------------------------------------------------------------------------------------------------------------------------------------
CompanyName | "ALTAI CASHMERE" LLC
CompanyNumber | FC027187
RegAddressCareOf | null
RegAddressPOBox | null
RegAddressAddressLine1 | BUILDING LEFT TO THE "TUUL"
RegAddressAddressLine2 | DRY CLEANING
RegAddressPostTown | KHAN-UUL DISTRICT
RegAddressCounty | ULAANBAATAR, MONGOLIA
RegAddressCountry | MONGOLIA
RegAddressPostCode | null
CompanyCategory | Other company type
CompanyStatus | Active
CountryOfOrigin | MONGOLIA
DissolutionDate | null
IncorporationDate | 14/06/2006
AccountsAccountRefDay | 30
AccountsAccountRefMonth | 6
AccountsNextDueDate | null
AccountsLastMadeUpDate | null
AccountsAccountCategory | NO ACCOUNTS FILED
ReturnsNextDueDate | null
ReturnsLastMadeUpDate | null
MortgagesNumMortCharges | 0
MortgagesNumMortOutstanding | 0
MortgagesNumMortPartSatisfied | 0
MortgagesNumMortSatisfied | 0
SICCodeSicText_1 | None Supplied
SICCodeSicText_2 | null
SICCodeSicText_3 | null
SICCodeSicText_4 | null
LimitedPartnershipsNumGenPartners | 0
LimitedPartnershipsNumLimPartners | 0
URI | http://business.data.gov.uk/id/company/FC027187
PreviousName_1CONDATE | null
PreviousName_1CompanyName | null
PreviousName_2CONDATE | null
PreviousName_2CompanyName | null
PreviousName_3CONDATE | null
PreviousName_3CompanyName | null
PreviousName_4CONDATE | null
PreviousName_4CompanyName | null
PreviousName_5CONDATE | null
PreviousName_5CompanyName | null
PreviousName_6CONDATE | null
PreviousName_6CompanyName | null
PreviousName_7CONDATE | null
PreviousName_7CompanyName | null
PreviousName_8CONDATE | null
PreviousName_8CompanyName | null
PreviousName_9CONDATE | null
PreviousName_9CompanyName | null
PreviousName_10CONDATE | null
PreviousName_10CompanyName | null
ConfStmtNextDueDate | 26/12/2016
ConfStmtLastMadeUpDate | null
qa_issues | check_not_null(RegAddressPostCode)
-RECORD 2------------------------------------------------------------------------------------------------------------------------------------------
CompanyName | "ANDI'S GIFT"
CompanyNumber | CE012989
RegAddressCareOf | null
RegAddressPOBox | null
RegAddressAddressLine1 | null
RegAddressAddressLine2 | null
RegAddressPostTown | null
RegAddressCounty | null
RegAddressCountry | null
RegAddressPostCode | null
CompanyCategory | Charitable Incorporated Organisation
CompanyStatus | Active
CountryOfOrigin | United Kingdom
DissolutionDate | null
IncorporationDate | 22/01/2018
AccountsAccountRefDay | null
AccountsAccountRefMonth | null
AccountsNextDueDate | null
AccountsLastMadeUpDate | null
AccountsAccountCategory | NO ACCOUNTS FILED
ReturnsNextDueDate | null
ReturnsLastMadeUpDate | null
MortgagesNumMortCharges | 0
MortgagesNumMortOutstanding | 0
MortgagesNumMortPartSatisfied | 0
MortgagesNumMortSatisfied | 0
SICCodeSicText_1 | None Supplied
SICCodeSicText_2 | null
SICCodeSicText_3 | null
SICCodeSicText_4 | null
LimitedPartnershipsNumGenPartners | 0
LimitedPartnershipsNumLimPartners | 0
URI | http://business.data.gov.uk/id/company/CE012989
PreviousName_1CONDATE | null
PreviousName_1CompanyName | null
PreviousName_2CONDATE | null
PreviousName_2CompanyName | null
PreviousName_3CONDATE | null
PreviousName_3CompanyName | null
PreviousName_4CONDATE | null
PreviousName_4CompanyName | null
PreviousName_5CONDATE | null
PreviousName_5CompanyName | null
PreviousName_6CONDATE | null
PreviousName_6CompanyName | null
PreviousName_7CONDATE | null
PreviousName_7CompanyName | null
PreviousName_8CONDATE | null
PreviousName_8CompanyName | null
PreviousName_9CONDATE | null
PreviousName_9CompanyName | null
PreviousName_10CONDATE | null
PreviousName_10CompanyName | null
ConfStmtNextDueDate | null
ConfStmtLastMadeUpDate | null
qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)
-RECORD 3------------------------------------------------------------------------------------------------------------------------------------------
CompanyName | "ARTHUR BALFOUR",CONSERVATIVE WORKING MEN'S CLUB LIMITED
CompanyNumber | IP10067R
RegAddressCareOf | null
RegAddressPOBox | null
RegAddressAddressLine1 | null
RegAddressAddressLine2 | null
RegAddressPostTown | null
RegAddressCounty | null
RegAddressCountry | null
RegAddressPostCode | null
CompanyCategory | Registered Society
CompanyStatus | Active
CountryOfOrigin | United Kingdom
DissolutionDate | null
IncorporationDate | 01/01/1981
AccountsAccountRefDay | null
AccountsAccountRefMonth | null
AccountsNextDueDate | null
AccountsLastMadeUpDate | null
AccountsAccountCategory | NO ACCOUNTS FILED
ReturnsNextDueDate | null
ReturnsLastMadeUpDate | null
MortgagesNumMortCharges | 0
MortgagesNumMortOutstanding | 0
MortgagesNumMortPartSatisfied | 0
MortgagesNumMortSatisfied | 0
SICCodeSicText_1 | None Supplied
SICCodeSicText_2 | null
SICCodeSicText_3 | null
SICCodeSicText_4 | null
LimitedPartnershipsNumGenPartners | 0
LimitedPartnershipsNumLimPartners | 0
URI | http://business.data.gov.uk/id/company/IP10067R
PreviousName_1CONDATE | null
PreviousName_1CompanyName | null
PreviousName_2CONDATE | null
PreviousName_2CompanyName | null
PreviousName_3CONDATE | null
PreviousName_3CompanyName | null
PreviousName_4CONDATE | null
PreviousName_4CompanyName | null
PreviousName_5CONDATE | null
PreviousName_5CompanyName | null
PreviousName_6CONDATE | null
PreviousName_6CompanyName | null
PreviousName_7CONDATE | null
PreviousName_7CompanyName | null
PreviousName_8CONDATE | null
PreviousName_8CompanyName | null
PreviousName_9CONDATE | null
PreviousName_9CompanyName | null
PreviousName_10CONDATE | null
PreviousName_10CompanyName | null
ConfStmtNextDueDate | 15/01/2017
ConfStmtLastMadeUpDate | null
qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode)
-RECORD 4------------------------------------------------------------------------------------------------------------------------------------------
CompanyName | "AUTOCONTEX" AUTOMOBIL CONTAINER EXPRESS GMBH
CompanyNumber | FC018262
RegAddressCareOf | null
RegAddressPOBox | null
RegAddressAddressLine1 | BRANCH REGISTRATION
RegAddressAddressLine2 | REFER TO PARENT REGISTRY
RegAddressPostTown | null
RegAddressCounty | null
RegAddressCountry | GERMANY
RegAddressPostCode | null
CompanyCategory | Other company type
CompanyStatus | Active
CountryOfOrigin | WEST GERMANY
DissolutionDate | null
IncorporationDate | 01/11/1994
AccountsAccountRefDay | 30
AccountsAccountRefMonth | 11
AccountsNextDueDate | null
AccountsLastMadeUpDate | 31/12/2002
AccountsAccountCategory | FULL
ReturnsNextDueDate | null
ReturnsLastMadeUpDate | null
MortgagesNumMortCharges | 2
MortgagesNumMortOutstanding | 2
MortgagesNumMortPartSatisfied | 0
MortgagesNumMortSatisfied | 0
SICCodeSicText_1 | None Supplied
SICCodeSicText_2 | null
SICCodeSicText_3 | null
SICCodeSicText_4 | null
LimitedPartnershipsNumGenPartners | 0
LimitedPartnershipsNumLimPartners | 0
URI | http://business.data.gov.uk/id/company/FC018262
PreviousName_1CONDATE | null
PreviousName_1CompanyName | null
PreviousName_2CONDATE | null
PreviousName_2CompanyName | null
PreviousName_3CONDATE | null
PreviousName_3CompanyName | null
PreviousName_4CONDATE | null
PreviousName_4CompanyName | null
PreviousName_5CONDATE | null
PreviousName_5CompanyName | null
PreviousName_6CONDATE | null
PreviousName_6CompanyName | null
PreviousName_7CONDATE | null
PreviousName_7CompanyName | null
PreviousName_8CONDATE | null
PreviousName_8CompanyName | null
PreviousName_9CONDATE | null
PreviousName_9CompanyName | null
PreviousName_10CONDATE | null
PreviousName_10CompanyName | null
ConfStmtNextDueDate | 01/12/2016
ConfStmtLastMadeUpDate | null
qa_issues | check_not_null(RegAddressPostCode)
only showing top 5 rows
#save issues as a HIVE table for later processing
df_issues.coalesce(1).write.format("parquet").mode("overwrite").saveAsTable("ch_20210404_issues")
stmt_issues = f"""
SELECT qa_issues,count(*) as no_rows
from ch_20210404_issues group by 1 order by no_rows"""
spark.sql(stmt_issues).show(truncate=False,vertical=True)
-RECORD 0------------------------------------------------------------------------------------------------------------------
qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(ConfStmtNextDueDate)
no_rows | 1
-RECORD 1------------------------------------------------------------------------------------------------------------------
qa_issues | check_not_null(RegAddressAddressLine1)
no_rows | 6
-RECORD 2------------------------------------------------------------------------------------------------------------------
qa_issues | check_postcode(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)
no_rows | 96
-RECORD 3------------------------------------------------------------------------------------------------------------------
qa_issues | check_postcode(RegAddressPostCode)
no_rows | 248
-RECORD 4------------------------------------------------------------------------------------------------------------------
qa_issues | check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)
no_rows | 966
-RECORD 5------------------------------------------------------------------------------------------------------------------
qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode)
no_rows | 1700
-RECORD 6------------------------------------------------------------------------------------------------------------------
qa_issues | check_not_null(RegAddressPostCode)
no_rows | 2730
-RECORD 7------------------------------------------------------------------------------------------------------------------
qa_issues | check_not_null(ConfStmtNextDueDate)
no_rows | 3014
-RECORD 8------------------------------------------------------------------------------------------------------------------
qa_issues | check_not_null(RegAddressAddressLine1),check_not_null(RegAddressPostCode),check_not_null(ConfStmtNextDueDate)
no_rows | 4582
stmt_issues = f"""
SELECT
distinct explode(split(qa_issues, ',')) AS qa_history_issues,
substring_index(qa_history_issues, "(", 1) AS qa_check,
replace(substring_index(qa_history_issues, "(", -1),')','') AS field_name
from ch_20210404_issues order by 2,3"""
df_grp = spark.sql(stmt_issues)
df_grp.show(truncate=False)
+--------------------------------------+--------------+----------------------+
|qa_history_issues |qa_check |field_name |
+--------------------------------------+--------------+----------------------+
|check_not_null(ConfStmtNextDueDate) |check_not_null|ConfStmtNextDueDate |
|check_not_null(RegAddressAddressLine1)|check_not_null|RegAddressAddressLine1|
|check_not_null(RegAddressPostCode) |check_not_null|RegAddressPostCode |
|check_postcode(RegAddressPostCode) |check_postcode|RegAddressPostCode |
+--------------------------------------+--------------+----------------------+