ddi-checker / parser /validate.py
marwadeeb's picture
added data parsing stage
a062f28
"""
validate.py β€” post-parse validation of all 27 CSV files.
Checks performed:
1. All 27 CSV files exist and are non-empty
2. Row counts are in expected ranges
3. No drug in drug_interactions references an unknown drugbank_id
4. All category_ids in drug_categories exist in categories
5. All interactant_ids in drug_interactants exist in interactants
6. All polypeptide_ids in interactant_polypeptides exist in polypeptides
7. All ref_pks in reference_associations exist in references
8. Primary drug IDs in drug_ids match drugs table
9. drugbank_id column is never NULL in any table that has one
10. drug_type values are only 'small molecule' or 'biotech'
11. ATC code level structure is consistent (l1_code shorter than atc_code)
12. SNP snp_type values are only 'effect' or 'adverse_reaction'
13. entity_type in external_identifiers is in the allowed set
14. drug_interactants role values are in the allowed set
15. pathway member_type values are in the allowed set
16. XSD coverage check: counts key tables against expected minimums
Usage:
python validate.py
"""
import csv
import os
import sys
from config import OUTPUT_DIR, SCHEMA
ERRORS = []
WARNINGS = []
def err(msg):
ERRORS.append(msg)
print(f" [ERROR] {msg}")
def warn(msg):
WARNINGS.append(msg)
print(f" [WARN] {msg}")
def ok(msg):
print(f" [OK] {msg}")
# ── helpers ───────────────────────────────────────────────────────────────────
def csv_path(table):
return os.path.join(OUTPUT_DIR, f"{table}.csv")
def read_csv(table):
path = csv_path(table)
rows = []
with open(path, newline="", encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
rows.append(row)
return rows
def count_rows(table):
"""Count data rows using csv.reader to handle multi-line quoted fields correctly."""
path = csv_path(table)
with open(path, newline="", encoding="utf-8") as f:
return sum(1 for _ in csv.reader(f)) - 1 # subtract header
def col_set(rows, col):
return {r[col] for r in rows if r.get(col)}
# ── check 1: file existence and size ─────────────────────────────────────────
def check_files_exist():
print("\n[1] File existence & size")
for table in SCHEMA:
path = csv_path(table)
if not os.path.exists(path):
err(f"{table}.csv missing")
else:
size = os.path.getsize(path)
if size < 10:
warn(f"{table}.csv exists but is very small ({size} bytes)")
else:
ok(f"{table}.csv ({size:,} bytes)")
# ── check 2: row counts ───────────────────────────────────────────────────────
EXPECTED_MINIMUMS = {
"drugs": 19_000, # ~19,842 drugs
"drug_ids": 20_000, # primary + legacy IDs
"drug_attributes": 500_000, # groups/synonyms/organisms/etc.
"drug_properties": 200_000, # calculated + experimental
"external_identifiers": 100_000,
"references": 30_000, # globally deduplicated refs
"reference_associations": 80_000, # general + interactant refs
"products": 400_000,
"drug_interactions": 2_500_000, # directed DDI edges
"interactants": 4_000,
"polypeptides": 5_000, # unique UniProt proteins
"drug_interactants": 30_000,
"categories": 3_000,
"drug_categories": 80_000,
"pathways": 10_000,
"pathway_members": 1_000_000, # pathway drug+enzyme members
}
def check_row_counts():
print("\n[2] Row count minimums")
counts = {}
for table in SCHEMA:
try:
n = count_rows(table)
counts[table] = n
minimum = EXPECTED_MINIMUMS.get(table, 0)
if n < minimum:
err(f"{table}: {n:,} rows (expected >= {minimum:,})")
else:
ok(f"{table}: {n:,} rows")
except Exception as e:
err(f"{table}: could not count rows β€” {e}")
return counts
# ── check 3: DDI referential integrity ───────────────────────────────────────
def check_ddi_ids(drug_ids_set):
print("\n[3] drug_interactions β€” referential integrity (sample check)")
try:
rows = read_csv("drug_interactions")
unknown_src = sum(1 for r in rows if r["drugbank_id"] not in drug_ids_set)
unknown_tgt = sum(1 for r in rows if r["interacting_drugbank_id"] not in drug_ids_set)
if unknown_src:
warn(f"drug_interactions: {unknown_src:,} rows with unknown source drugbank_id")
else:
ok("drug_interactions: all source IDs found in drugs table")
# Target IDs can legitimately be absent if the other drug is also in DB
if unknown_tgt:
warn(f"drug_interactions: {unknown_tgt:,} rows with unknown target drugbank_id "
f"(may be withdrawn/experimental drugs not in this export)")
else:
ok("drug_interactions: all target IDs found in drugs table")
except Exception as e:
err(f"check_ddi_ids failed: {e}")
# ── check 4: category FK ─────────────────────────────────────────────────────
def check_category_fk():
print("\n[4] drug_categories -> categories FK")
try:
cat_ids = col_set(read_csv("categories"), "category_id")
dc_rows = read_csv("drug_categories")
missing = sum(1 for r in dc_rows if r["category_id"] not in cat_ids)
if missing:
err(f"drug_categories: {missing:,} rows with unknown category_id")
else:
ok(f"drug_categories: all {len(dc_rows):,} category_id values found")
except Exception as e:
err(f"check_category_fk failed: {e}")
# ── check 5: interactant FK ───────────────────────────────────────────────────
def check_interactant_fk():
print("\n[5] drug_interactants -> interactants FK")
try:
int_ids = col_set(read_csv("interactants"), "interactant_id")
di_rows = read_csv("drug_interactants")
missing = sum(1 for r in di_rows if r["interactant_id"] not in int_ids)
if missing:
err(f"drug_interactants: {missing:,} rows with unknown interactant_id")
else:
ok(f"drug_interactants: all {len(di_rows):,} interactant_id values found")
except Exception as e:
err(f"check_interactant_fk failed: {e}")
# ── check 6: polypeptide FK ───────────────────────────────────────────────────
def check_polypeptide_fk():
print("\n[6] interactant_polypeptides -> polypeptides FK")
try:
poly_ids = col_set(read_csv("polypeptides"), "polypeptide_id")
ip_rows = read_csv("interactant_polypeptides")
missing = sum(1 for r in ip_rows if r["polypeptide_id"] not in poly_ids)
if missing:
err(f"interactant_polypeptides: {missing:,} rows with unknown polypeptide_id")
else:
ok(f"interactant_polypeptides: all {len(ip_rows):,} polypeptide_id values found")
except Exception as e:
err(f"check_polypeptide_fk failed: {e}")
# ── check 7: reference_associations FK ───────────────────────────────────────
def check_ref_fk():
print("\n[7] reference_associations -> references FK")
try:
ref_pks = col_set(read_csv("references"), "ref_pk")
ra_rows = read_csv("reference_associations")
missing = sum(1 for r in ra_rows if r["ref_pk"] not in ref_pks)
if missing:
err(f"reference_associations: {missing:,} rows with unknown ref_pk")
else:
ok(f"reference_associations: all {len(ra_rows):,} ref_pk values found")
except Exception as e:
err(f"check_ref_fk failed: {e}")
# ── check 8: drug_ids primary coverage ───────────────────────────────────────
def check_drug_ids_coverage(drug_ids_set):
print("\n[8] drug_ids β€” primary ID coverage")
try:
di_rows = read_csv("drug_ids")
primary_in_di = {r["legacy_id"] for r in di_rows
if r.get("is_primary", "").lower() == "true"}
missing = drug_ids_set - primary_in_di
if missing:
warn(f"drug_ids: {len(missing):,} primary drug IDs not found in drug_ids table")
else:
ok(f"drug_ids: all {len(drug_ids_set):,} primary IDs represented")
except Exception as e:
err(f"check_drug_ids_coverage failed: {e}")
# ── check 9: NULL drugbank_id ─────────────────────────────────────────────────
def check_no_null_ids():
print("\n[9] NULL drugbank_id check")
tables_with_did = [t for t in SCHEMA if "drugbank_id" in SCHEMA[t]]
for table in tables_with_did:
try:
rows = read_csv(table)
nulls = sum(1 for r in rows if not r.get("drugbank_id"))
if nulls:
err(f"{table}: {nulls:,} rows with NULL/empty drugbank_id")
else:
ok(f"{table}: no NULL drugbank_id ({len(rows):,} rows)")
except Exception as e:
err(f"{table} NULL check failed: {e}")
# ── check 10: drug_type values ────────────────────────────────────────────────
def check_drug_type():
print("\n[10] drugs β€” drug_type values")
try:
rows = read_csv("drugs")
types = {r["drug_type"] for r in rows}
valid = {"small molecule", "biotech"}
bad = types - valid
if bad:
err(f"drugs: unexpected drug_type values: {bad}")
else:
ok(f"drugs: drug_type values = {types}")
except Exception as e:
err(f"check_drug_type failed: {e}")
# ── check 11: ATC code structure ──────────────────────────────────────────────
def check_atc_codes():
print("\n[11] atc_codes β€” level hierarchy consistency")
try:
rows = read_csv("atc_codes")
bad = 0
for r in rows:
code = r.get("atc_code") or ""
l1c = r.get("l1_code") or ""
l4c = r.get("l4_code") or ""
# l1 code should be shorter than full code; l4 should be 1 char
if code and l1c and len(l1c) >= len(code):
bad += 1
if l4c and len(l4c) > 1:
bad += 1
if bad:
warn(f"atc_codes: {bad} rows with unexpected level code lengths")
else:
ok(f"atc_codes: level hierarchy looks correct ({len(rows):,} rows)")
except Exception as e:
err(f"check_atc_codes failed: {e}")
# ── check 12: snp_type values ─────────────────────────────────────────────────
def check_snp_types():
print("\n[12] drug_snp_data β€” snp_type values")
try:
rows = read_csv("drug_snp_data")
types = {r["snp_type"] for r in rows}
valid = {"effect", "adverse_reaction"}
bad = types - valid
if bad:
err(f"drug_snp_data: unexpected snp_type values: {bad}")
else:
ok(f"drug_snp_data: snp_type values = {types} ({len(rows):,} rows)")
except Exception as e:
err(f"check_snp_types failed: {e}")
# ── check 13: external_identifiers entity_type ───────────────────────────────
def check_entity_types():
print("\n[13] external_identifiers β€” entity_type values")
try:
rows = read_csv("external_identifiers")
types = {r["entity_type"] for r in rows}
valid = {"drug", "drug_link", "polypeptide", "salt"}
bad = types - valid
if bad:
err(f"external_identifiers: unexpected entity_type values: {bad}")
else:
ok(f"external_identifiers: entity_type values = {types} ({len(rows):,} rows)")
except Exception as e:
err(f"check_entity_types failed: {e}")
# ── check 14: drug_interactants role values ───────────────────────────────────
def check_interactant_roles():
print("\n[14] drug_interactants β€” role values")
try:
rows = read_csv("drug_interactants")
roles = {r["role"] for r in rows}
valid = {"target", "enzyme", "carrier", "transporter"}
bad = roles - valid
if bad:
err(f"drug_interactants: unexpected role values: {bad}")
else:
ok(f"drug_interactants: role values = {roles} ({len(rows):,} rows)")
except Exception as e:
err(f"check_interactant_roles failed: {e}")
# ── check 15: pathway member_type values ──────────────────────────────────────
def check_pathway_member_types():
print("\n[15] pathway_members β€” member_type values")
try:
rows = read_csv("pathway_members")
types = {r["member_type"] for r in rows}
valid = {"drug", "enzyme"}
bad = types - valid
if bad:
err(f"pathway_members: unexpected member_type values: {bad}")
else:
ok(f"pathway_members: member_type values = {types} ({len(rows):,} rows)")
except Exception as e:
err(f"check_pathway_member_types failed: {e}")
# ── check 16: XSD coverage summary ───────────────────────────────────────────
def check_xsd_coverage(counts):
print("\n[16] XSD coverage summary")
items = [
("drugs", "drug entries (XSD: drug-type)"),
("drug_ids", "drugbank-id elements"),
("drug_attributes", "multi-valued string attrs (groups/synonyms/etc.)"),
("drug_properties", "calculated + experimental properties"),
("external_identifiers", "external IDs + links (drug/polypeptide/salt)"),
("references", "globally deduplicated references"),
("reference_associations", "reference context associations"),
("salts", "salt forms"),
("products", "marketed products"),
("drug_commercial_entities", "packagers + manufacturers + brands"),
("mixtures", "drug mixtures"),
("prices", "price entries"),
("categories", "unique MeSH categories"),
("drug_categories", "drug-category assignments"),
("dosages", "dosage records"),
("atc_codes", "ATC code entries"),
("patents", "patent records"),
("drug_interactions","directed DDI edges"),
("drug_snp_data", "SNP pharmacogenomics records"),
("pathways", "unique pathways"),
("pathway_members", "pathway drug/enzyme members"),
("reactions", "metabolic reactions"),
("interactants", "unique binding entities (BE-IDs)"),
("drug_interactants","drug–protein interaction records"),
("polypeptides", "unique UniProt polypeptides"),
("interactant_polypeptides", "interactant–polypeptide links"),
("polypeptide_attributes", "polypeptide synonyms/Pfam/GO"),
]
for table, desc in items:
n = counts.get(table, "?")
print(f" {n:>10,} {desc}")
# ── main ──────────────────────────────────────────────────────────────────────
def main():
print("=" * 65)
print("DrugBank CSV Validation Report")
print("=" * 65)
check_files_exist()
counts = check_row_counts()
# Load drug IDs set (used in multiple checks)
try:
drugs_rows = read_csv("drugs")
drug_ids_set = {r["drugbank_id"] for r in drugs_rows if r.get("drugbank_id")}
except Exception as e:
err(f"Could not load drugs.csv: {e}")
drug_ids_set = set()
check_ddi_ids(drug_ids_set)
check_category_fk()
check_interactant_fk()
check_polypeptide_fk()
check_ref_fk()
check_drug_ids_coverage(drug_ids_set)
check_no_null_ids()
check_drug_type()
check_atc_codes()
check_snp_types()
check_entity_types()
check_interactant_roles()
check_pathway_member_types()
check_xsd_coverage(counts)
print("\n" + "=" * 65)
print(f"Validation complete: {len(ERRORS)} error(s), {len(WARNINGS)} warning(s)")
if ERRORS:
print("\nERRORS:")
for e in ERRORS:
print(f" [X] {e}")
if WARNINGS:
print("\nWARNINGS:")
for w in WARNINGS:
print(f" [!] {w}")
if not ERRORS and not WARNINGS:
print(" [OK] All checks passed β€” data looks clean!")
print("=" * 65)
return len(ERRORS)
if __name__ == "__main__":
sys.exit(main())