Spaces:
Sleeping
Sleeping
| """ | |
| validate.py β post-parse validation of all 27 CSV files. | |
| Checks performed: | |
| 1. All 27 CSV files exist and are non-empty | |
| 2. Row counts are in expected ranges | |
| 3. No drug in drug_interactions references an unknown drugbank_id | |
| 4. All category_ids in drug_categories exist in categories | |
| 5. All interactant_ids in drug_interactants exist in interactants | |
| 6. All polypeptide_ids in interactant_polypeptides exist in polypeptides | |
| 7. All ref_pks in reference_associations exist in references | |
| 8. Primary drug IDs in drug_ids match drugs table | |
| 9. drugbank_id column is never NULL in any table that has one | |
| 10. drug_type values are only 'small molecule' or 'biotech' | |
| 11. ATC code level structure is consistent (l1_code shorter than atc_code) | |
| 12. SNP snp_type values are only 'effect' or 'adverse_reaction' | |
| 13. entity_type in external_identifiers is in the allowed set | |
| 14. drug_interactants role values are in the allowed set | |
| 15. pathway member_type values are in the allowed set | |
| 16. XSD coverage check: counts key tables against expected minimums | |
| Usage: | |
| python validate.py | |
| """ | |
| import csv | |
| import os | |
| import sys | |
| from config import OUTPUT_DIR, SCHEMA | |
| ERRORS = [] | |
| WARNINGS = [] | |
| def err(msg): | |
| ERRORS.append(msg) | |
| print(f" [ERROR] {msg}") | |
| def warn(msg): | |
| WARNINGS.append(msg) | |
| print(f" [WARN] {msg}") | |
| def ok(msg): | |
| print(f" [OK] {msg}") | |
| # ββ helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def csv_path(table): | |
| return os.path.join(OUTPUT_DIR, f"{table}.csv") | |
| def read_csv(table): | |
| path = csv_path(table) | |
| rows = [] | |
| with open(path, newline="", encoding="utf-8") as f: | |
| reader = csv.DictReader(f) | |
| for row in reader: | |
| rows.append(row) | |
| return rows | |
| def count_rows(table): | |
| """Count data rows using csv.reader to handle multi-line quoted fields correctly.""" | |
| path = csv_path(table) | |
| with open(path, newline="", encoding="utf-8") as f: | |
| return sum(1 for _ in csv.reader(f)) - 1 # subtract header | |
| def col_set(rows, col): | |
| return {r[col] for r in rows if r.get(col)} | |
| # ββ check 1: file existence and size βββββββββββββββββββββββββββββββββββββββββ | |
| def check_files_exist(): | |
| print("\n[1] File existence & size") | |
| for table in SCHEMA: | |
| path = csv_path(table) | |
| if not os.path.exists(path): | |
| err(f"{table}.csv missing") | |
| else: | |
| size = os.path.getsize(path) | |
| if size < 10: | |
| warn(f"{table}.csv exists but is very small ({size} bytes)") | |
| else: | |
| ok(f"{table}.csv ({size:,} bytes)") | |
| # ββ check 2: row counts βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| EXPECTED_MINIMUMS = { | |
| "drugs": 19_000, # ~19,842 drugs | |
| "drug_ids": 20_000, # primary + legacy IDs | |
| "drug_attributes": 500_000, # groups/synonyms/organisms/etc. | |
| "drug_properties": 200_000, # calculated + experimental | |
| "external_identifiers": 100_000, | |
| "references": 30_000, # globally deduplicated refs | |
| "reference_associations": 80_000, # general + interactant refs | |
| "products": 400_000, | |
| "drug_interactions": 2_500_000, # directed DDI edges | |
| "interactants": 4_000, | |
| "polypeptides": 5_000, # unique UniProt proteins | |
| "drug_interactants": 30_000, | |
| "categories": 3_000, | |
| "drug_categories": 80_000, | |
| "pathways": 10_000, | |
| "pathway_members": 1_000_000, # pathway drug+enzyme members | |
| } | |
| def check_row_counts(): | |
| print("\n[2] Row count minimums") | |
| counts = {} | |
| for table in SCHEMA: | |
| try: | |
| n = count_rows(table) | |
| counts[table] = n | |
| minimum = EXPECTED_MINIMUMS.get(table, 0) | |
| if n < minimum: | |
| err(f"{table}: {n:,} rows (expected >= {minimum:,})") | |
| else: | |
| ok(f"{table}: {n:,} rows") | |
| except Exception as e: | |
| err(f"{table}: could not count rows β {e}") | |
| return counts | |
| # ββ check 3: DDI referential integrity βββββββββββββββββββββββββββββββββββββββ | |
| def check_ddi_ids(drug_ids_set): | |
| print("\n[3] drug_interactions β referential integrity (sample check)") | |
| try: | |
| rows = read_csv("drug_interactions") | |
| unknown_src = sum(1 for r in rows if r["drugbank_id"] not in drug_ids_set) | |
| unknown_tgt = sum(1 for r in rows if r["interacting_drugbank_id"] not in drug_ids_set) | |
| if unknown_src: | |
| warn(f"drug_interactions: {unknown_src:,} rows with unknown source drugbank_id") | |
| else: | |
| ok("drug_interactions: all source IDs found in drugs table") | |
| # Target IDs can legitimately be absent if the other drug is also in DB | |
| if unknown_tgt: | |
| warn(f"drug_interactions: {unknown_tgt:,} rows with unknown target drugbank_id " | |
| f"(may be withdrawn/experimental drugs not in this export)") | |
| else: | |
| ok("drug_interactions: all target IDs found in drugs table") | |
| except Exception as e: | |
| err(f"check_ddi_ids failed: {e}") | |
| # ββ check 4: category FK βββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def check_category_fk(): | |
| print("\n[4] drug_categories -> categories FK") | |
| try: | |
| cat_ids = col_set(read_csv("categories"), "category_id") | |
| dc_rows = read_csv("drug_categories") | |
| missing = sum(1 for r in dc_rows if r["category_id"] not in cat_ids) | |
| if missing: | |
| err(f"drug_categories: {missing:,} rows with unknown category_id") | |
| else: | |
| ok(f"drug_categories: all {len(dc_rows):,} category_id values found") | |
| except Exception as e: | |
| err(f"check_category_fk failed: {e}") | |
| # ββ check 5: interactant FK βββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def check_interactant_fk(): | |
| print("\n[5] drug_interactants -> interactants FK") | |
| try: | |
| int_ids = col_set(read_csv("interactants"), "interactant_id") | |
| di_rows = read_csv("drug_interactants") | |
| missing = sum(1 for r in di_rows if r["interactant_id"] not in int_ids) | |
| if missing: | |
| err(f"drug_interactants: {missing:,} rows with unknown interactant_id") | |
| else: | |
| ok(f"drug_interactants: all {len(di_rows):,} interactant_id values found") | |
| except Exception as e: | |
| err(f"check_interactant_fk failed: {e}") | |
| # ββ check 6: polypeptide FK βββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def check_polypeptide_fk(): | |
| print("\n[6] interactant_polypeptides -> polypeptides FK") | |
| try: | |
| poly_ids = col_set(read_csv("polypeptides"), "polypeptide_id") | |
| ip_rows = read_csv("interactant_polypeptides") | |
| missing = sum(1 for r in ip_rows if r["polypeptide_id"] not in poly_ids) | |
| if missing: | |
| err(f"interactant_polypeptides: {missing:,} rows with unknown polypeptide_id") | |
| else: | |
| ok(f"interactant_polypeptides: all {len(ip_rows):,} polypeptide_id values found") | |
| except Exception as e: | |
| err(f"check_polypeptide_fk failed: {e}") | |
| # ββ check 7: reference_associations FK βββββββββββββββββββββββββββββββββββββββ | |
| def check_ref_fk(): | |
| print("\n[7] reference_associations -> references FK") | |
| try: | |
| ref_pks = col_set(read_csv("references"), "ref_pk") | |
| ra_rows = read_csv("reference_associations") | |
| missing = sum(1 for r in ra_rows if r["ref_pk"] not in ref_pks) | |
| if missing: | |
| err(f"reference_associations: {missing:,} rows with unknown ref_pk") | |
| else: | |
| ok(f"reference_associations: all {len(ra_rows):,} ref_pk values found") | |
| except Exception as e: | |
| err(f"check_ref_fk failed: {e}") | |
| # ββ check 8: drug_ids primary coverage βββββββββββββββββββββββββββββββββββββββ | |
| def check_drug_ids_coverage(drug_ids_set): | |
| print("\n[8] drug_ids β primary ID coverage") | |
| try: | |
| di_rows = read_csv("drug_ids") | |
| primary_in_di = {r["legacy_id"] for r in di_rows | |
| if r.get("is_primary", "").lower() == "true"} | |
| missing = drug_ids_set - primary_in_di | |
| if missing: | |
| warn(f"drug_ids: {len(missing):,} primary drug IDs not found in drug_ids table") | |
| else: | |
| ok(f"drug_ids: all {len(drug_ids_set):,} primary IDs represented") | |
| except Exception as e: | |
| err(f"check_drug_ids_coverage failed: {e}") | |
| # ββ check 9: NULL drugbank_id βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def check_no_null_ids(): | |
| print("\n[9] NULL drugbank_id check") | |
| tables_with_did = [t for t in SCHEMA if "drugbank_id" in SCHEMA[t]] | |
| for table in tables_with_did: | |
| try: | |
| rows = read_csv(table) | |
| nulls = sum(1 for r in rows if not r.get("drugbank_id")) | |
| if nulls: | |
| err(f"{table}: {nulls:,} rows with NULL/empty drugbank_id") | |
| else: | |
| ok(f"{table}: no NULL drugbank_id ({len(rows):,} rows)") | |
| except Exception as e: | |
| err(f"{table} NULL check failed: {e}") | |
| # ββ check 10: drug_type values ββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def check_drug_type(): | |
| print("\n[10] drugs β drug_type values") | |
| try: | |
| rows = read_csv("drugs") | |
| types = {r["drug_type"] for r in rows} | |
| valid = {"small molecule", "biotech"} | |
| bad = types - valid | |
| if bad: | |
| err(f"drugs: unexpected drug_type values: {bad}") | |
| else: | |
| ok(f"drugs: drug_type values = {types}") | |
| except Exception as e: | |
| err(f"check_drug_type failed: {e}") | |
| # ββ check 11: ATC code structure ββββββββββββββββββββββββββββββββββββββββββββββ | |
| def check_atc_codes(): | |
| print("\n[11] atc_codes β level hierarchy consistency") | |
| try: | |
| rows = read_csv("atc_codes") | |
| bad = 0 | |
| for r in rows: | |
| code = r.get("atc_code") or "" | |
| l1c = r.get("l1_code") or "" | |
| l4c = r.get("l4_code") or "" | |
| # l1 code should be shorter than full code; l4 should be 1 char | |
| if code and l1c and len(l1c) >= len(code): | |
| bad += 1 | |
| if l4c and len(l4c) > 1: | |
| bad += 1 | |
| if bad: | |
| warn(f"atc_codes: {bad} rows with unexpected level code lengths") | |
| else: | |
| ok(f"atc_codes: level hierarchy looks correct ({len(rows):,} rows)") | |
| except Exception as e: | |
| err(f"check_atc_codes failed: {e}") | |
| # ββ check 12: snp_type values βββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def check_snp_types(): | |
| print("\n[12] drug_snp_data β snp_type values") | |
| try: | |
| rows = read_csv("drug_snp_data") | |
| types = {r["snp_type"] for r in rows} | |
| valid = {"effect", "adverse_reaction"} | |
| bad = types - valid | |
| if bad: | |
| err(f"drug_snp_data: unexpected snp_type values: {bad}") | |
| else: | |
| ok(f"drug_snp_data: snp_type values = {types} ({len(rows):,} rows)") | |
| except Exception as e: | |
| err(f"check_snp_types failed: {e}") | |
| # ββ check 13: external_identifiers entity_type βββββββββββββββββββββββββββββββ | |
| def check_entity_types(): | |
| print("\n[13] external_identifiers β entity_type values") | |
| try: | |
| rows = read_csv("external_identifiers") | |
| types = {r["entity_type"] for r in rows} | |
| valid = {"drug", "drug_link", "polypeptide", "salt"} | |
| bad = types - valid | |
| if bad: | |
| err(f"external_identifiers: unexpected entity_type values: {bad}") | |
| else: | |
| ok(f"external_identifiers: entity_type values = {types} ({len(rows):,} rows)") | |
| except Exception as e: | |
| err(f"check_entity_types failed: {e}") | |
| # ββ check 14: drug_interactants role values βββββββββββββββββββββββββββββββββββ | |
| def check_interactant_roles(): | |
| print("\n[14] drug_interactants β role values") | |
| try: | |
| rows = read_csv("drug_interactants") | |
| roles = {r["role"] for r in rows} | |
| valid = {"target", "enzyme", "carrier", "transporter"} | |
| bad = roles - valid | |
| if bad: | |
| err(f"drug_interactants: unexpected role values: {bad}") | |
| else: | |
| ok(f"drug_interactants: role values = {roles} ({len(rows):,} rows)") | |
| except Exception as e: | |
| err(f"check_interactant_roles failed: {e}") | |
| # ββ check 15: pathway member_type values ββββββββββββββββββββββββββββββββββββββ | |
| def check_pathway_member_types(): | |
| print("\n[15] pathway_members β member_type values") | |
| try: | |
| rows = read_csv("pathway_members") | |
| types = {r["member_type"] for r in rows} | |
| valid = {"drug", "enzyme"} | |
| bad = types - valid | |
| if bad: | |
| err(f"pathway_members: unexpected member_type values: {bad}") | |
| else: | |
| ok(f"pathway_members: member_type values = {types} ({len(rows):,} rows)") | |
| except Exception as e: | |
| err(f"check_pathway_member_types failed: {e}") | |
| # ββ check 16: XSD coverage summary βββββββββββββββββββββββββββββββββββββββββββ | |
| def check_xsd_coverage(counts): | |
| print("\n[16] XSD coverage summary") | |
| items = [ | |
| ("drugs", "drug entries (XSD: drug-type)"), | |
| ("drug_ids", "drugbank-id elements"), | |
| ("drug_attributes", "multi-valued string attrs (groups/synonyms/etc.)"), | |
| ("drug_properties", "calculated + experimental properties"), | |
| ("external_identifiers", "external IDs + links (drug/polypeptide/salt)"), | |
| ("references", "globally deduplicated references"), | |
| ("reference_associations", "reference context associations"), | |
| ("salts", "salt forms"), | |
| ("products", "marketed products"), | |
| ("drug_commercial_entities", "packagers + manufacturers + brands"), | |
| ("mixtures", "drug mixtures"), | |
| ("prices", "price entries"), | |
| ("categories", "unique MeSH categories"), | |
| ("drug_categories", "drug-category assignments"), | |
| ("dosages", "dosage records"), | |
| ("atc_codes", "ATC code entries"), | |
| ("patents", "patent records"), | |
| ("drug_interactions","directed DDI edges"), | |
| ("drug_snp_data", "SNP pharmacogenomics records"), | |
| ("pathways", "unique pathways"), | |
| ("pathway_members", "pathway drug/enzyme members"), | |
| ("reactions", "metabolic reactions"), | |
| ("interactants", "unique binding entities (BE-IDs)"), | |
| ("drug_interactants","drugβprotein interaction records"), | |
| ("polypeptides", "unique UniProt polypeptides"), | |
| ("interactant_polypeptides", "interactantβpolypeptide links"), | |
| ("polypeptide_attributes", "polypeptide synonyms/Pfam/GO"), | |
| ] | |
| for table, desc in items: | |
| n = counts.get(table, "?") | |
| print(f" {n:>10,} {desc}") | |
| # ββ main ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def main(): | |
| print("=" * 65) | |
| print("DrugBank CSV Validation Report") | |
| print("=" * 65) | |
| check_files_exist() | |
| counts = check_row_counts() | |
| # Load drug IDs set (used in multiple checks) | |
| try: | |
| drugs_rows = read_csv("drugs") | |
| drug_ids_set = {r["drugbank_id"] for r in drugs_rows if r.get("drugbank_id")} | |
| except Exception as e: | |
| err(f"Could not load drugs.csv: {e}") | |
| drug_ids_set = set() | |
| check_ddi_ids(drug_ids_set) | |
| check_category_fk() | |
| check_interactant_fk() | |
| check_polypeptide_fk() | |
| check_ref_fk() | |
| check_drug_ids_coverage(drug_ids_set) | |
| check_no_null_ids() | |
| check_drug_type() | |
| check_atc_codes() | |
| check_snp_types() | |
| check_entity_types() | |
| check_interactant_roles() | |
| check_pathway_member_types() | |
| check_xsd_coverage(counts) | |
| print("\n" + "=" * 65) | |
| print(f"Validation complete: {len(ERRORS)} error(s), {len(WARNINGS)} warning(s)") | |
| if ERRORS: | |
| print("\nERRORS:") | |
| for e in ERRORS: | |
| print(f" [X] {e}") | |
| if WARNINGS: | |
| print("\nWARNINGS:") | |
| for w in WARNINGS: | |
| print(f" [!] {w}") | |
| if not ERRORS and not WARNINGS: | |
| print(" [OK] All checks passed β data looks clean!") | |
| print("=" * 65) | |
| return len(ERRORS) | |
| if __name__ == "__main__": | |
| sys.exit(main()) | |