""" validate.py — post-parse validation of all 27 CSV files. Checks performed: 1. All 27 CSV files exist and are non-empty 2. Row counts are in expected ranges 3. No drug in drug_interactions references an unknown drugbank_id 4. All category_ids in drug_categories exist in categories 5. All interactant_ids in drug_interactants exist in interactants 6. All polypeptide_ids in interactant_polypeptides exist in polypeptides 7. All ref_pks in reference_associations exist in references 8. Primary drug IDs in drug_ids match drugs table 9. drugbank_id column is never NULL in any table that has one 10. drug_type values are only 'small molecule' or 'biotech' 11. ATC code level structure is consistent (l1_code shorter than atc_code) 12. SNP snp_type values are only 'effect' or 'adverse_reaction' 13. entity_type in external_identifiers is in the allowed set 14. drug_interactants role values are in the allowed set 15. pathway member_type values are in the allowed set 16. XSD coverage check: counts key tables against expected minimums Usage: python validate.py """ import csv import os import sys from config import OUTPUT_DIR, SCHEMA ERRORS = [] WARNINGS = [] def err(msg): ERRORS.append(msg) print(f" [ERROR] {msg}") def warn(msg): WARNINGS.append(msg) print(f" [WARN] {msg}") def ok(msg): print(f" [OK] {msg}") # ── helpers ─────────────────────────────────────────────────────────────────── def csv_path(table): return os.path.join(OUTPUT_DIR, f"{table}.csv") def read_csv(table): path = csv_path(table) rows = [] with open(path, newline="", encoding="utf-8") as f: reader = csv.DictReader(f) for row in reader: rows.append(row) return rows def count_rows(table): """Count data rows using csv.reader to handle multi-line quoted fields correctly.""" path = csv_path(table) with open(path, newline="", encoding="utf-8") as f: return sum(1 for _ in csv.reader(f)) - 1 # subtract header def col_set(rows, col): return {r[col] for r in rows if r.get(col)} # ── check 1: file existence and size ───────────────────────────────────────── def check_files_exist(): print("\n[1] File existence & size") for table in SCHEMA: path = csv_path(table) if not os.path.exists(path): err(f"{table}.csv missing") else: size = os.path.getsize(path) if size < 10: warn(f"{table}.csv exists but is very small ({size} bytes)") else: ok(f"{table}.csv ({size:,} bytes)") # ── check 2: row counts ─────────────────────────────────────────────────────── EXPECTED_MINIMUMS = { "drugs": 19_000, # ~19,842 drugs "drug_ids": 20_000, # primary + legacy IDs "drug_attributes": 500_000, # groups/synonyms/organisms/etc. "drug_properties": 200_000, # calculated + experimental "external_identifiers": 100_000, "references": 30_000, # globally deduplicated refs "reference_associations": 80_000, # general + interactant refs "products": 400_000, "drug_interactions": 2_500_000, # directed DDI edges "interactants": 4_000, "polypeptides": 5_000, # unique UniProt proteins "drug_interactants": 30_000, "categories": 3_000, "drug_categories": 80_000, "pathways": 10_000, "pathway_members": 1_000_000, # pathway drug+enzyme members } def check_row_counts(): print("\n[2] Row count minimums") counts = {} for table in SCHEMA: try: n = count_rows(table) counts[table] = n minimum = EXPECTED_MINIMUMS.get(table, 0) if n < minimum: err(f"{table}: {n:,} rows (expected >= {minimum:,})") else: ok(f"{table}: {n:,} rows") except Exception as e: err(f"{table}: could not count rows — {e}") return counts # ── check 3: DDI referential integrity ─────────────────────────────────────── def check_ddi_ids(drug_ids_set): print("\n[3] drug_interactions — referential integrity (sample check)") try: rows = read_csv("drug_interactions") unknown_src = sum(1 for r in rows if r["drugbank_id"] not in drug_ids_set) unknown_tgt = sum(1 for r in rows if r["interacting_drugbank_id"] not in drug_ids_set) if unknown_src: warn(f"drug_interactions: {unknown_src:,} rows with unknown source drugbank_id") else: ok("drug_interactions: all source IDs found in drugs table") # Target IDs can legitimately be absent if the other drug is also in DB if unknown_tgt: warn(f"drug_interactions: {unknown_tgt:,} rows with unknown target drugbank_id " f"(may be withdrawn/experimental drugs not in this export)") else: ok("drug_interactions: all target IDs found in drugs table") except Exception as e: err(f"check_ddi_ids failed: {e}") # ── check 4: category FK ───────────────────────────────────────────────────── def check_category_fk(): print("\n[4] drug_categories -> categories FK") try: cat_ids = col_set(read_csv("categories"), "category_id") dc_rows = read_csv("drug_categories") missing = sum(1 for r in dc_rows if r["category_id"] not in cat_ids) if missing: err(f"drug_categories: {missing:,} rows with unknown category_id") else: ok(f"drug_categories: all {len(dc_rows):,} category_id values found") except Exception as e: err(f"check_category_fk failed: {e}") # ── check 5: interactant FK ─────────────────────────────────────────────────── def check_interactant_fk(): print("\n[5] drug_interactants -> interactants FK") try: int_ids = col_set(read_csv("interactants"), "interactant_id") di_rows = read_csv("drug_interactants") missing = sum(1 for r in di_rows if r["interactant_id"] not in int_ids) if missing: err(f"drug_interactants: {missing:,} rows with unknown interactant_id") else: ok(f"drug_interactants: all {len(di_rows):,} interactant_id values found") except Exception as e: err(f"check_interactant_fk failed: {e}") # ── check 6: polypeptide FK ─────────────────────────────────────────────────── def check_polypeptide_fk(): print("\n[6] interactant_polypeptides -> polypeptides FK") try: poly_ids = col_set(read_csv("polypeptides"), "polypeptide_id") ip_rows = read_csv("interactant_polypeptides") missing = sum(1 for r in ip_rows if r["polypeptide_id"] not in poly_ids) if missing: err(f"interactant_polypeptides: {missing:,} rows with unknown polypeptide_id") else: ok(f"interactant_polypeptides: all {len(ip_rows):,} polypeptide_id values found") except Exception as e: err(f"check_polypeptide_fk failed: {e}") # ── check 7: reference_associations FK ─────────────────────────────────────── def check_ref_fk(): print("\n[7] reference_associations -> references FK") try: ref_pks = col_set(read_csv("references"), "ref_pk") ra_rows = read_csv("reference_associations") missing = sum(1 for r in ra_rows if r["ref_pk"] not in ref_pks) if missing: err(f"reference_associations: {missing:,} rows with unknown ref_pk") else: ok(f"reference_associations: all {len(ra_rows):,} ref_pk values found") except Exception as e: err(f"check_ref_fk failed: {e}") # ── check 8: drug_ids primary coverage ─────────────────────────────────────── def check_drug_ids_coverage(drug_ids_set): print("\n[8] drug_ids — primary ID coverage") try: di_rows = read_csv("drug_ids") primary_in_di = {r["legacy_id"] for r in di_rows if r.get("is_primary", "").lower() == "true"} missing = drug_ids_set - primary_in_di if missing: warn(f"drug_ids: {len(missing):,} primary drug IDs not found in drug_ids table") else: ok(f"drug_ids: all {len(drug_ids_set):,} primary IDs represented") except Exception as e: err(f"check_drug_ids_coverage failed: {e}") # ── check 9: NULL drugbank_id ───────────────────────────────────────────────── def check_no_null_ids(): print("\n[9] NULL drugbank_id check") tables_with_did = [t for t in SCHEMA if "drugbank_id" in SCHEMA[t]] for table in tables_with_did: try: rows = read_csv(table) nulls = sum(1 for r in rows if not r.get("drugbank_id")) if nulls: err(f"{table}: {nulls:,} rows with NULL/empty drugbank_id") else: ok(f"{table}: no NULL drugbank_id ({len(rows):,} rows)") except Exception as e: err(f"{table} NULL check failed: {e}") # ── check 10: drug_type values ──────────────────────────────────────────────── def check_drug_type(): print("\n[10] drugs — drug_type values") try: rows = read_csv("drugs") types = {r["drug_type"] for r in rows} valid = {"small molecule", "biotech"} bad = types - valid if bad: err(f"drugs: unexpected drug_type values: {bad}") else: ok(f"drugs: drug_type values = {types}") except Exception as e: err(f"check_drug_type failed: {e}") # ── check 11: ATC code structure ────────────────────────────────────────────── def check_atc_codes(): print("\n[11] atc_codes — level hierarchy consistency") try: rows = read_csv("atc_codes") bad = 0 for r in rows: code = r.get("atc_code") or "" l1c = r.get("l1_code") or "" l4c = r.get("l4_code") or "" # l1 code should be shorter than full code; l4 should be 1 char if code and l1c and len(l1c) >= len(code): bad += 1 if l4c and len(l4c) > 1: bad += 1 if bad: warn(f"atc_codes: {bad} rows with unexpected level code lengths") else: ok(f"atc_codes: level hierarchy looks correct ({len(rows):,} rows)") except Exception as e: err(f"check_atc_codes failed: {e}") # ── check 12: snp_type values ───────────────────────────────────────────────── def check_snp_types(): print("\n[12] drug_snp_data — snp_type values") try: rows = read_csv("drug_snp_data") types = {r["snp_type"] for r in rows} valid = {"effect", "adverse_reaction"} bad = types - valid if bad: err(f"drug_snp_data: unexpected snp_type values: {bad}") else: ok(f"drug_snp_data: snp_type values = {types} ({len(rows):,} rows)") except Exception as e: err(f"check_snp_types failed: {e}") # ── check 13: external_identifiers entity_type ─────────────────────────────── def check_entity_types(): print("\n[13] external_identifiers — entity_type values") try: rows = read_csv("external_identifiers") types = {r["entity_type"] for r in rows} valid = {"drug", "drug_link", "polypeptide", "salt"} bad = types - valid if bad: err(f"external_identifiers: unexpected entity_type values: {bad}") else: ok(f"external_identifiers: entity_type values = {types} ({len(rows):,} rows)") except Exception as e: err(f"check_entity_types failed: {e}") # ── check 14: drug_interactants role values ─────────────────────────────────── def check_interactant_roles(): print("\n[14] drug_interactants — role values") try: rows = read_csv("drug_interactants") roles = {r["role"] for r in rows} valid = {"target", "enzyme", "carrier", "transporter"} bad = roles - valid if bad: err(f"drug_interactants: unexpected role values: {bad}") else: ok(f"drug_interactants: role values = {roles} ({len(rows):,} rows)") except Exception as e: err(f"check_interactant_roles failed: {e}") # ── check 15: pathway member_type values ────────────────────────────────────── def check_pathway_member_types(): print("\n[15] pathway_members — member_type values") try: rows = read_csv("pathway_members") types = {r["member_type"] for r in rows} valid = {"drug", "enzyme"} bad = types - valid if bad: err(f"pathway_members: unexpected member_type values: {bad}") else: ok(f"pathway_members: member_type values = {types} ({len(rows):,} rows)") except Exception as e: err(f"check_pathway_member_types failed: {e}") # ── check 16: XSD coverage summary ─────────────────────────────────────────── def check_xsd_coverage(counts): print("\n[16] XSD coverage summary") items = [ ("drugs", "drug entries (XSD: drug-type)"), ("drug_ids", "drugbank-id elements"), ("drug_attributes", "multi-valued string attrs (groups/synonyms/etc.)"), ("drug_properties", "calculated + experimental properties"), ("external_identifiers", "external IDs + links (drug/polypeptide/salt)"), ("references", "globally deduplicated references"), ("reference_associations", "reference context associations"), ("salts", "salt forms"), ("products", "marketed products"), ("drug_commercial_entities", "packagers + manufacturers + brands"), ("mixtures", "drug mixtures"), ("prices", "price entries"), ("categories", "unique MeSH categories"), ("drug_categories", "drug-category assignments"), ("dosages", "dosage records"), ("atc_codes", "ATC code entries"), ("patents", "patent records"), ("drug_interactions","directed DDI edges"), ("drug_snp_data", "SNP pharmacogenomics records"), ("pathways", "unique pathways"), ("pathway_members", "pathway drug/enzyme members"), ("reactions", "metabolic reactions"), ("interactants", "unique binding entities (BE-IDs)"), ("drug_interactants","drug–protein interaction records"), ("polypeptides", "unique UniProt polypeptides"), ("interactant_polypeptides", "interactant–polypeptide links"), ("polypeptide_attributes", "polypeptide synonyms/Pfam/GO"), ] for table, desc in items: n = counts.get(table, "?") print(f" {n:>10,} {desc}") # ── main ────────────────────────────────────────────────────────────────────── def main(): print("=" * 65) print("DrugBank CSV Validation Report") print("=" * 65) check_files_exist() counts = check_row_counts() # Load drug IDs set (used in multiple checks) try: drugs_rows = read_csv("drugs") drug_ids_set = {r["drugbank_id"] for r in drugs_rows if r.get("drugbank_id")} except Exception as e: err(f"Could not load drugs.csv: {e}") drug_ids_set = set() check_ddi_ids(drug_ids_set) check_category_fk() check_interactant_fk() check_polypeptide_fk() check_ref_fk() check_drug_ids_coverage(drug_ids_set) check_no_null_ids() check_drug_type() check_atc_codes() check_snp_types() check_entity_types() check_interactant_roles() check_pathway_member_types() check_xsd_coverage(counts) print("\n" + "=" * 65) print(f"Validation complete: {len(ERRORS)} error(s), {len(WARNINGS)} warning(s)") if ERRORS: print("\nERRORS:") for e in ERRORS: print(f" [X] {e}") if WARNINGS: print("\nWARNINGS:") for w in WARNINGS: print(f" [!] {w}") if not ERRORS and not WARNINGS: print(" [OK] All checks passed — data looks clean!") print("=" * 65) return len(ERRORS) if __name__ == "__main__": sys.exit(main())