Spaces:

marwadeeb
/

ddi-checker

Sleeping

App Files Files Community

ddi-checker / parser /validate.py

marwadeeb

added data parsing stage

a062f28 2 months ago

raw

history blame contribute delete

18.1 kB

	"""
	validate.py — post-parse validation of all 27 CSV files.

	Checks performed:
	1. All 27 CSV files exist and are non-empty
	2. Row counts meet the expected minimums
	3. No drug in drug_interactions references an unknown drugbank_id
	4. All category_ids in drug_categories exist in categories
	5. All interactant_ids in drug_interactants exist in interactants
	6. All polypeptide_ids in interactant_polypeptides exist in polypeptides
	7. All ref_pks in reference_associations exist in references
	8. Primary drug IDs in drug_ids match drugs table
	9. drugbank_id column is never NULL in any table that has one
	10. drug_type values are only 'small molecule' or 'biotech'
	11. ATC code level structure is consistent (l1_code shorter than atc_code)
	12. SNP snp_type values are only 'effect' or 'adverse_reaction'
	13. entity_type in external_identifiers is in the allowed set
	14. drug_interactants role values are in the allowed set
	15. pathway member_type values are in the allowed set
	16. XSD coverage summary: prints the row count of each table

	Usage:
	python validate.py
	"""
	import csv
	import os
	import sys
	from config import OUTPUT_DIR, SCHEMA

	# Messages accumulated by err() and warn() during the run; main() prints them
	# again in its closing summary and returns the number of errors.
	ERRORS = []
	WARNINGS = []


	def err(msg):
	    """Record *msg* in ERRORS and echo it to stdout with an [ERROR] tag."""
	    ERRORS.append(msg)
	    line = f" [ERROR] {msg}"
	    print(line)


	def warn(msg):
	    """Record *msg* in WARNINGS and echo it to stdout with a [WARN] tag."""
	    WARNINGS.append(msg)
	    line = f" [WARN] {msg}"
	    print(line)


	def ok(msg):
	    """Echo *msg* to stdout with an [OK] tag; nothing is recorded."""
	    line = f" [OK] {msg}"
	    print(line)


	# ── helpers ───────────────────────────────────────────────────────────────────

	def csv_path(table):
	    """Return the path of ``<table>.csv`` inside OUTPUT_DIR."""
	    filename = f"{table}.csv"
	    return os.path.join(OUTPUT_DIR, filename)


	def read_csv(table):
	    """Load every data row of the table's CSV file into a list of dicts.

	    Rows come from csv.DictReader, so each dict is keyed by the header names.
	    The whole file is held in memory.
	    """
	    with open(csv_path(table), newline="", encoding="utf-8") as handle:
	        return list(csv.DictReader(handle))


	def count_rows(table):
	    """Return the number of data rows in the table's CSV file.

	    Records are counted with csv.reader rather than by physical line, so a
	    quoted field that spans several lines still counts as one row. One record
	    is subtracted for the header, which means a completely empty file
	    yields -1.
	    """
	    total = 0
	    with open(csv_path(table), newline="", encoding="utf-8") as handle:
	        for _ in csv.reader(handle):
	            total += 1
	    return total - 1  # subtract header


	def col_set(rows, col):
	    """Collect the distinct non-empty values of column *col* across *rows*.

	    Rows that lack the column, or hold a falsy value for it, are skipped.
	    """
	    values = set()
	    for row in rows:
	        if row.get(col):
	            values.add(row[col])
	    return values


	# ── check 1: file existence and size ─────────────────────────────────────────

	def check_files_exist():
	    """Check 1: report whether each table in SCHEMA has a CSV file, and its size.

	    A missing file is an error; a file smaller than 10 bytes only produces a
	    warning.
	    """
	    print("\n[1] File existence & size")
	    for table in SCHEMA:
	        path = csv_path(table)
	        if not os.path.exists(path):
	            err(f"{table}.csv missing")
	            continue
	        size = os.path.getsize(path)
	        if size >= 10:
	            ok(f"{table}.csv ({size:,} bytes)")
	        else:
	            warn(f"{table}.csv exists but is very small ({size} bytes)")


	# ── check 2: row counts ───────────────────────────────────────────────────────

	# Minimum number of data rows expected per table. check_row_counts() reports
	# an error when a table has fewer rows; tables not listed here default to 0.
	EXPECTED_MINIMUMS = {
	"drugs": 19_000, # ~19,842 drugs
	"drug_ids": 20_000, # primary + legacy IDs
	"drug_attributes": 500_000, # groups/synonyms/organisms/etc.
	"drug_properties": 200_000, # calculated + experimental
	"external_identifiers": 100_000,
	"references": 30_000, # globally deduplicated refs
	"reference_associations": 80_000, # general + interactant refs
	"products": 400_000,
	"drug_interactions": 2_500_000, # directed DDI edges
	"interactants": 4_000,
	"polypeptides": 5_000, # unique UniProt proteins
	"drug_interactants": 30_000,
	"categories": 3_000,
	"drug_categories": 80_000,
	"pathways": 10_000,
	"pathway_members": 1_000_000, # pathway drug+enzyme members
	}

	def check_row_counts():
	    """Check 2: count the rows of every table in SCHEMA against EXPECTED_MINIMUMS.

	    Returns a dict mapping table name to row count. A table whose rows could
	    not be counted is reported as an error and left out of the dict.
	    """
	    print("\n[2] Row count minimums")
	    counts = {}
	    for table in SCHEMA:
	        try:
	            n = counts[table] = count_rows(table)
	            floor = EXPECTED_MINIMUMS.get(table, 0)
	            if n >= floor:
	                ok(f"{table}: {n:,} rows")
	            else:
	                err(f"{table}: {n:,} rows (expected >= {floor:,})")
	        except Exception as e:
	            err(f"{table}: could not count rows — {e}")
	    return counts


	# ── check 3: DDI referential integrity ───────────────────────────────────────

	def check_ddi_ids(drug_ids_set):
	    """Check 3: both ends of every drug_interactions row should be known drugs.

	    *drug_ids_set* is the set of drugbank_id values to test against. Every row
	    of drug_interactions is examined. Unknown source or target IDs are
	    reported as warnings; only a failure to run the check is an error.
	    """
	    print("\n[3] drug_interactions — referential integrity (sample check)")
	    try:
	        unknown_src = 0
	        unknown_tgt = 0
	        for row in read_csv("drug_interactions"):
	            if row["drugbank_id"] not in drug_ids_set:
	                unknown_src += 1
	            if row["interacting_drugbank_id"] not in drug_ids_set:
	                unknown_tgt += 1

	        if not unknown_src:
	            ok("drug_interactions: all source IDs found in drugs table")
	        else:
	            warn(f"drug_interactions: {unknown_src:,} rows with unknown source drugbank_id")

	        # A target ID can legitimately be unknown when the other drug is not
	        # part of this export, which is what the warning text points out.
	        if not unknown_tgt:
	            ok("drug_interactions: all target IDs found in drugs table")
	        else:
	            warn(f"drug_interactions: {unknown_tgt:,} rows with unknown target drugbank_id "
	                 f"(may be withdrawn/experimental drugs not in this export)")
	    except Exception as e:
	        err(f"check_ddi_ids failed: {e}")


	# ── check 4: category FK ─────────────────────────────────────────────────────

	def check_category_fk():
	    """Check 4: every category_id in drug_categories must exist in categories."""
	    print("\n[4] drug_categories -> categories FK")
	    try:
	        known = col_set(read_csv("categories"), "category_id")
	        links = read_csv("drug_categories")
	        orphans = 0
	        for link in links:
	            if link["category_id"] not in known:
	                orphans += 1
	        if not orphans:
	            ok(f"drug_categories: all {len(links):,} category_id values found")
	        else:
	            err(f"drug_categories: {orphans:,} rows with unknown category_id")
	    except Exception as e:
	        err(f"check_category_fk failed: {e}")


	# ── check 5: interactant FK ───────────────────────────────────────────────────

	def check_interactant_fk():
	    """Check 5: every interactant_id in drug_interactants must exist in interactants."""
	    print("\n[5] drug_interactants -> interactants FK")
	    try:
	        known_ids = col_set(read_csv("interactants"), "interactant_id")
	        links = read_csv("drug_interactants")
	        dangling = [row for row in links if row["interactant_id"] not in known_ids]
	        if dangling:
	            err(f"drug_interactants: {len(dangling):,} rows with unknown interactant_id")
	        else:
	            ok(f"drug_interactants: all {len(links):,} interactant_id values found")
	    except Exception as e:
	        err(f"check_interactant_fk failed: {e}")


	# ── check 6: polypeptide FK ───────────────────────────────────────────────────

	def check_polypeptide_fk():
	    """Check 6: every polypeptide_id in interactant_polypeptides must exist in polypeptides."""
	    print("\n[6] interactant_polypeptides -> polypeptides FK")
	    try:
	        known_ids = col_set(read_csv("polypeptides"), "polypeptide_id")
	        links = read_csv("interactant_polypeptides")
	        unresolved = 0
	        for link in links:
	            if link["polypeptide_id"] in known_ids:
	                continue
	            unresolved += 1
	        if unresolved:
	            err(f"interactant_polypeptides: {unresolved:,} rows with unknown polypeptide_id")
	        else:
	            ok(f"interactant_polypeptides: all {len(links):,} polypeptide_id values found")
	    except Exception as e:
	        err(f"check_polypeptide_fk failed: {e}")


	# ── check 7: reference_associations FK ───────────────────────────────────────

	def check_ref_fk():
	    """Check 7: every ref_pk in reference_associations must exist in references."""
	    print("\n[7] reference_associations -> references FK")
	    try:
	        known_pks = col_set(read_csv("references"), "ref_pk")
	        assoc_rows = read_csv("reference_associations")
	        unmatched = len([row for row in assoc_rows if row["ref_pk"] not in known_pks])
	        if not unmatched:
	            ok(f"reference_associations: all {len(assoc_rows):,} ref_pk values found")
	        else:
	            err(f"reference_associations: {unmatched:,} rows with unknown ref_pk")
	    except Exception as e:
	        err(f"check_ref_fk failed: {e}")


	# ── check 8: drug_ids primary coverage ───────────────────────────────────────

	def check_drug_ids_coverage(drug_ids_set):
	    """Check 8: every ID in *drug_ids_set* should appear as a primary row in drug_ids.

	    A drug_ids row counts as primary when its is_primary field reads "true"
	    (case-insensitive); its legacy_id value is the ID that gets compared.
	    Uncovered IDs are reported as a warning.
	    """
	    print("\n[8] drug_ids — primary ID coverage")
	    try:
	        primary_in_di = set()
	        for row in read_csv("drug_ids"):
	            if row.get("is_primary", "").lower() == "true":
	                primary_in_di.add(row["legacy_id"])
	        uncovered = drug_ids_set - primary_in_di
	        if not uncovered:
	            ok(f"drug_ids: all {len(drug_ids_set):,} primary IDs represented")
	        else:
	            warn(f"drug_ids: {len(uncovered):,} primary drug IDs not found in drug_ids table")
	    except Exception as e:
	        err(f"check_drug_ids_coverage failed: {e}")


	# ── check 9: NULL drugbank_id ─────────────────────────────────────────────────

	def check_no_null_ids():
	    """Check 9: no row may have an empty drugbank_id in any table that declares one.

	    Only tables whose SCHEMA entry contains "drugbank_id" are inspected. Each
	    table is handled independently, so a failure on one does not stop the rest.
	    """
	    print("\n[9] NULL drugbank_id check")
	    candidates = [name for name in SCHEMA if "drugbank_id" in SCHEMA[name]]
	    for table in candidates:
	        try:
	            rows = read_csv(table)
	            nulls = 0
	            for row in rows:
	                if not row.get("drugbank_id"):
	                    nulls += 1
	            if not nulls:
	                ok(f"{table}: no NULL drugbank_id ({len(rows):,} rows)")
	            else:
	                err(f"{table}: {nulls:,} rows with NULL/empty drugbank_id")
	        except Exception as e:
	            err(f"{table} NULL check failed: {e}")


	# ── check 10: drug_type values ────────────────────────────────────────────────

	def check_drug_type():
	    """Check 10: drugs.drug_type may only be 'small molecule' or 'biotech'."""
	    print("\n[10] drugs — drug_type values")
	    try:
	        types = set()
	        for row in read_csv("drugs"):
	            types.add(row["drug_type"])
	        bad = types - {"small molecule", "biotech"}
	        if not bad:
	            ok(f"drugs: drug_type values = {types}")
	        else:
	            err(f"drugs: unexpected drug_type values: {bad}")
	    except Exception as e:
	        err(f"check_drug_type failed: {e}")


	# ── check 11: ATC code structure ──────────────────────────────────────────────

	def check_atc_codes():
	    """Check 11: sanity-check the level codes stored alongside each ATC code.

	    A row is flagged when l1_code is not shorter than atc_code (both present),
	    or when l4_code is longer than one character. A row that breaks both rules
	    is counted once, so the reported figure is a true row count. Offending
	    rows produce a warning, not an error.
	    """
	    print("\n[11] atc_codes — level hierarchy consistency")
	    try:
	        rows = read_csv("atc_codes")
	        bad = 0
	        for r in rows:
	            code = r.get("atc_code") or ""
	            l1c = r.get("l1_code") or ""
	            l4c = r.get("l4_code") or ""
	            # l1 code should be shorter than the full code; l4 should be 1 char
	            l1_too_long = bool(code and l1c) and len(l1c) >= len(code)
	            l4_too_long = len(l4c) > 1
	            if l1_too_long or l4_too_long:
	                bad += 1
	        if bad:
	            warn(f"atc_codes: {bad} rows with unexpected level code lengths")
	        else:
	            ok(f"atc_codes: level hierarchy looks correct ({len(rows):,} rows)")
	    except Exception as e:
	        err(f"check_atc_codes failed: {e}")


	# ── check 12: snp_type values ─────────────────────────────────────────────────

	def check_snp_types():
	    """Check 12: drug_snp_data.snp_type may only be 'effect' or 'adverse_reaction'."""
	    print("\n[12] drug_snp_data — snp_type values")
	    try:
	        rows = read_csv("drug_snp_data")
	        types = set()
	        for row in rows:
	            types.add(row["snp_type"])
	        bad = types - {"effect", "adverse_reaction"}
	        if not bad:
	            ok(f"drug_snp_data: snp_type values = {types} ({len(rows):,} rows)")
	        else:
	            err(f"drug_snp_data: unexpected snp_type values: {bad}")
	    except Exception as e:
	        err(f"check_snp_types failed: {e}")


	# ── check 13: external_identifiers entity_type ───────────────────────────────

	def check_entity_types():
	    """Check 13: external_identifiers.entity_type must be one of the allowed values.

	    Allowed: 'drug', 'drug_link', 'polypeptide', 'salt'.
	    """
	    print("\n[13] external_identifiers — entity_type values")
	    try:
	        rows = read_csv("external_identifiers")
	        types = set()
	        for row in rows:
	            types.add(row["entity_type"])
	        bad = types - {"drug", "drug_link", "polypeptide", "salt"}
	        if not bad:
	            ok(f"external_identifiers: entity_type values = {types} ({len(rows):,} rows)")
	        else:
	            err(f"external_identifiers: unexpected entity_type values: {bad}")
	    except Exception as e:
	        err(f"check_entity_types failed: {e}")


	# ── check 14: drug_interactants role values ───────────────────────────────────

	def check_interactant_roles():
	    """Check 14: drug_interactants.role must be one of the allowed values.

	    Allowed: 'target', 'enzyme', 'carrier', 'transporter'.
	    """
	    print("\n[14] drug_interactants — role values")
	    try:
	        rows = read_csv("drug_interactants")
	        roles = set()
	        for row in rows:
	            roles.add(row["role"])
	        bad = roles - {"target", "enzyme", "carrier", "transporter"}
	        if not bad:
	            ok(f"drug_interactants: role values = {roles} ({len(rows):,} rows)")
	        else:
	            err(f"drug_interactants: unexpected role values: {bad}")
	    except Exception as e:
	        err(f"check_interactant_roles failed: {e}")


	# ── check 15: pathway member_type values ──────────────────────────────────────

	def check_pathway_member_types():
	    """Check 15: pathway_members.member_type may only be 'drug' or 'enzyme'."""
	    print("\n[15] pathway_members — member_type values")
	    try:
	        rows = read_csv("pathway_members")
	        types = set()
	        for row in rows:
	            types.add(row["member_type"])
	        bad = types - {"drug", "enzyme"}
	        if not bad:
	            ok(f"pathway_members: member_type values = {types} ({len(rows):,} rows)")
	        else:
	            err(f"pathway_members: unexpected member_type values: {bad}")
	    except Exception as e:
	        err(f"check_pathway_member_types failed: {e}")


	# ── check 16: XSD coverage summary ───────────────────────────────────────────

	def check_xsd_coverage(counts):
	    """Check 16: print a per-table row-count summary.

	    *counts* maps table name to row count. A table with no entry is shown as
	    "?" instead of a number. Nothing is validated here and nothing is returned;
	    the function only prints.
	    """
	    print("\n[16] XSD coverage summary")
	    items = [
	        ("drugs", "drug entries (XSD: drug-type)"),
	        ("drug_ids", "drugbank-id elements"),
	        ("drug_attributes", "multi-valued string attrs (groups/synonyms/etc.)"),
	        ("drug_properties", "calculated + experimental properties"),
	        ("external_identifiers", "external IDs + links (drug/polypeptide/salt)"),
	        ("references", "globally deduplicated references"),
	        ("reference_associations", "reference context associations"),
	        ("salts", "salt forms"),
	        ("products", "marketed products"),
	        ("drug_commercial_entities", "packagers + manufacturers + brands"),
	        ("mixtures", "drug mixtures"),
	        ("prices", "price entries"),
	        ("categories", "unique MeSH categories"),
	        ("drug_categories", "drug-category assignments"),
	        ("dosages", "dosage records"),
	        ("atc_codes", "ATC code entries"),
	        ("patents", "patent records"),
	        ("drug_interactions", "directed DDI edges"),
	        ("drug_snp_data", "SNP pharmacogenomics records"),
	        ("pathways", "unique pathways"),
	        ("pathway_members", "pathway drug/enzyme members"),
	        ("reactions", "metabolic reactions"),
	        ("interactants", "unique binding entities (BE-IDs)"),
	        ("drug_interactants", "drug–protein interaction records"),
	        ("polypeptides", "unique UniProt polypeptides"),
	        ("interactant_polypeptides", "interactant–polypeptide links"),
	        ("polypeptide_attributes", "polypeptide synonyms/Pfam/GO"),
	    ]
	    for table, desc in items:
	        n = counts.get(table)
	        # The ',' format option raises ValueError for a string, so the number
	        # is grouped first and the padding is applied to the resulting text.
	        shown = "?" if n is None else f"{n:,}"
	        print(f" {shown:>10} {desc}")


	# ── main ──────────────────────────────────────────────────────────────────────

	def main():
	    """Run every validation check in order and print the report.

	    Returns the number of errors recorded in ERRORS.
	    """
	    rule = "=" * 65
	    print(rule)
	    print("DrugBank CSV Validation Report")
	    print(rule)

	    check_files_exist()

	    counts = check_row_counts()

	    # The set of known drug IDs is shared by checks 3 and 8. If drugs.csv
	    # cannot be read, those checks run against an empty set.
	    drug_ids_set = set()
	    try:
	        for row in read_csv("drugs"):
	            if row.get("drugbank_id"):
	                drug_ids_set.add(row["drugbank_id"])
	    except Exception as e:
	        err(f"Could not load drugs.csv: {e}")
	        drug_ids_set = set()

	    check_ddi_ids(drug_ids_set)
	    for fk_check in (
	        check_category_fk,
	        check_interactant_fk,
	        check_polypeptide_fk,
	        check_ref_fk,
	    ):
	        fk_check()
	    check_drug_ids_coverage(drug_ids_set)
	    for value_check in (
	        check_no_null_ids,
	        check_drug_type,
	        check_atc_codes,
	        check_snp_types,
	        check_entity_types,
	        check_interactant_roles,
	        check_pathway_member_types,
	    ):
	        value_check()
	    check_xsd_coverage(counts)

	    print("\n" + rule)
	    print(f"Validation complete: {len(ERRORS)} error(s), {len(WARNINGS)} warning(s)")
	    if ERRORS:
	        print("\nERRORS:")
	        for message in ERRORS:
	            print(f" [X] {message}")
	    if WARNINGS:
	        print("\nWARNINGS:")
	        for message in WARNINGS:
	            print(f" [!] {message}")
	    if not (ERRORS or WARNINGS):
	        print(" [OK] All checks passed — data looks clean!")
	    print(rule)

	    return len(ERRORS)


	if __name__ == "__main__":
	    # The exit status is the error count. POSIX keeps only the low 8 bits of
	    # an exit status, so a count that is a multiple of 256 would read as
	    # success; cap it so any failure stays non-zero.
	    sys.exit(min(main(), 255))