File size: 18,073 Bytes
a062f28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
"""
validate.py — post-parse validation of all 27 CSV files.

Checks performed:
  1. All 27 CSV files exist and are non-empty
  2. Row counts are in expected ranges
  3. No drug in drug_interactions references an unknown drugbank_id
  4. All category_ids in drug_categories exist in categories
  5. All interactant_ids in drug_interactants exist in interactants
  6. All polypeptide_ids in interactant_polypeptides exist in polypeptides
  7. All ref_pks in reference_associations exist in references
  8. Primary drug IDs in drug_ids match drugs table
  9. drugbank_id column is never NULL in any table that has one
 10. drug_type values are only 'small molecule' or 'biotech'
 11. ATC code level structure is consistent (l1_code shorter than atc_code)
 12. SNP snp_type values are only 'effect' or 'adverse_reaction'
 13. entity_type in external_identifiers is in the allowed set
 14. drug_interactants role values are in the allowed set
 15. pathway member_type values are in the allowed set
 16. XSD coverage check: counts key tables against expected minimums

Usage:
    python validate.py
"""
import csv
import os
import sys
from config import OUTPUT_DIR, SCHEMA

# Messages collected during the run: err() appends to ERRORS, warn() appends to
# WARNINGS, and main() prints both lists in its final summary.
ERRORS = []
WARNINGS = []


def err(msg):
    """Record *msg* in the module-level ERRORS list, then echo it to stdout."""
    ERRORS.append(msg)
    print("  [ERROR]   {}".format(msg))


def warn(msg):
    """Record *msg* in the module-level WARNINGS list, then echo it to stdout."""
    WARNINGS.append(msg)
    print("  [WARN]    {}".format(msg))


def ok(msg):
    """Print *msg* as a passing check line; nothing is recorded."""
    print("  [OK]      {}".format(msg))


# ── helpers ───────────────────────────────────────────────────────────────────

def csv_path(table):
    """Return the path of ``<table>.csv`` inside OUTPUT_DIR."""
    filename = "{}.csv".format(table)
    return os.path.join(OUTPUT_DIR, filename)


def read_csv(table):
    """Load all data rows of *table*'s CSV file.

    Returns a list with one dict per row, keyed by the header line, as
    produced by ``csv.DictReader``. The file is read as UTF-8.
    """
    with open(csv_path(table), newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))


def count_rows(table):
    """Return the number of data records in *table*'s CSV file.

    Records are counted with ``csv.reader`` rather than by physical line, so
    quoted fields that contain newlines count as a single record. One record
    is subtracted for the header.
    """
    total_records = 0
    with open(csv_path(table), newline="", encoding="utf-8") as handle:
        for _record in csv.reader(handle):
            total_records += 1
    return total_records - 1


def col_set(rows, col):
    """Return the set of truthy values found under key *col* in *rows*.

    Rows that lack the key, or hold an empty/None value for it, are skipped.
    """
    values = set()
    for record in rows:
        if record.get(col):
            values.add(record[col])
    return values


# ── check 1: file existence and size ─────────────────────────────────────────

def check_files_exist():
    """Check 1: every table in SCHEMA has a CSV file of plausible size.

    A missing file is reported as an error; a file smaller than 10 bytes is
    reported as a warning; anything else is reported as OK with its size.
    """
    print("\n[1] File existence & size")
    for table in SCHEMA:
        location = csv_path(table)
        if not os.path.exists(location):
            err(f"{table}.csv missing")
            continue
        n_bytes = os.path.getsize(location)
        if n_bytes >= 10:
            ok(f"{table}.csv  ({n_bytes:,} bytes)")
        else:
            warn(f"{table}.csv exists but is very small ({n_bytes} bytes)")


# ── check 2: row counts ───────────────────────────────────────────────────────

# Minimum number of data rows expected per table, consulted by
# check_row_counts(). Tables that are not listed here default to a minimum of 0
# there, i.e. they are counted but never fail this check on size alone.
EXPECTED_MINIMUMS = {
    "drugs":                   19_000,   # ~19,842 drugs
    "drug_ids":                20_000,   # primary + legacy IDs
    "drug_attributes":        500_000,   # groups/synonyms/organisms/etc.
    "drug_properties":        200_000,   # calculated + experimental
    "external_identifiers":   100_000,
    "references":              30_000,   # globally deduplicated refs
    "reference_associations":  80_000,   # general + interactant refs
    "products":               400_000,
    "drug_interactions":    2_500_000,   # directed DDI edges
    "interactants":             4_000,
    "polypeptides":             5_000,   # unique UniProt proteins
    "drug_interactants":       30_000,
    "categories":               3_000,
    "drug_categories":         80_000,
    "pathways":                10_000,
    "pathway_members":      1_000_000,   # pathway drug+enzyme members
}

def check_row_counts():
    """Check 2: count the data rows of every table and compare to the minimums.

    Returns:
        dict mapping table name -> row count. A table whose file could not be
        counted (missing, unreadable, malformed) is reported as an error and
        is absent from the returned dict.
    """
    print("\n[2] Row count minimums")
    counts = {}
    for table in SCHEMA:
        # Only the counting itself can fail; keep the try body minimal.
        try:
            n = count_rows(table)
        except Exception as e:
            err(f"{table}: could not count rows — {e}")
            continue
        counts[table] = n
        # Tables without an entry in EXPECTED_MINIMUMS have no lower bound.
        minimum = EXPECTED_MINIMUMS.get(table, 0)
        if n < minimum:
            err(f"{table}: {n:,} rows (expected >= {minimum:,})")
        else:
            ok(f"{table}: {n:,} rows")
    return counts


# ── check 3: DDI referential integrity ───────────────────────────────────────

def check_ddi_ids(drug_ids_set):
    """Check 3: both ends of every drug_interactions row are known drug IDs.

    Args:
        drug_ids_set: set of drugbank_id values taken from the drugs table.

    Unknown source or target IDs are reported as warnings, not errors. Any
    failure while reading the file (including a missing column) is reported
    as a single error.
    """
    print("\n[3] drug_interactions — referential integrity (sample check)")
    try:
        unknown_src = 0
        unknown_tgt = 0
        # drug_interactions is by far the largest table, so stream it in one
        # pass instead of materialising every row as a dict in memory.
        with open(csv_path("drug_interactions"), newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                if row["drugbank_id"] not in drug_ids_set:
                    unknown_src += 1
                if row["interacting_drugbank_id"] not in drug_ids_set:
                    unknown_tgt += 1
        if unknown_src:
            warn(f"drug_interactions: {unknown_src:,} rows with unknown source drugbank_id")
        else:
            ok("drug_interactions: all source IDs found in drugs table")
        # Target IDs can legitimately be absent when the interacting drug is
        # not part of this export, hence a warning rather than an error.
        if unknown_tgt:
            warn(f"drug_interactions: {unknown_tgt:,} rows with unknown target drugbank_id "
                 f"(may be withdrawn/experimental drugs not in this export)")
        else:
            ok("drug_interactions: all target IDs found in drugs table")
    except Exception as e:
        err(f"check_ddi_ids failed: {e}")


# ── check 4: category FK ─────────────────────────────────────────────────────

def check_category_fk():
    """Check 4: every category_id in drug_categories exists in categories.

    Orphaned rows are reported as one error; any failure while loading or
    scanning the files is reported as an error too.
    """
    print("\n[4] drug_categories -> categories FK")
    try:
        known_ids = col_set(read_csv("categories"), "category_id")
        links = read_csv("drug_categories")
        orphaned = 0
        for link in links:
            if link["category_id"] not in known_ids:
                orphaned += 1
        if orphaned == 0:
            ok(f"drug_categories: all {len(links):,} category_id values found")
        else:
            err(f"drug_categories: {orphaned:,} rows with unknown category_id")
    except Exception as exc:
        err(f"check_category_fk failed: {exc}")


# ── check 5: interactant FK ───────────────────────────────────────────────────

def check_interactant_fk():
    """Check 5: every interactant_id in drug_interactants exists in interactants.

    Orphaned rows are reported as one error; any failure while loading or
    scanning the files is reported as an error too.
    """
    print("\n[5] drug_interactants -> interactants FK")
    try:
        known_ids = col_set(read_csv("interactants"), "interactant_id")
        links = read_csv("drug_interactants")
        orphaned = 0
        for link in links:
            if link["interactant_id"] not in known_ids:
                orphaned += 1
        if orphaned == 0:
            ok(f"drug_interactants: all {len(links):,} interactant_id values found")
        else:
            err(f"drug_interactants: {orphaned:,} rows with unknown interactant_id")
    except Exception as exc:
        err(f"check_interactant_fk failed: {exc}")


# ── check 6: polypeptide FK ───────────────────────────────────────────────────

def check_polypeptide_fk():
    """Check 6: every polypeptide_id in interactant_polypeptides exists in polypeptides.

    Orphaned rows are reported as one error; any failure while loading or
    scanning the files is reported as an error too.
    """
    print("\n[6] interactant_polypeptides -> polypeptides FK")
    try:
        known_ids = col_set(read_csv("polypeptides"), "polypeptide_id")
        links = read_csv("interactant_polypeptides")
        orphaned = 0
        for link in links:
            if link["polypeptide_id"] not in known_ids:
                orphaned += 1
        if orphaned == 0:
            ok(f"interactant_polypeptides: all {len(links):,} polypeptide_id values found")
        else:
            err(f"interactant_polypeptides: {orphaned:,} rows with unknown polypeptide_id")
    except Exception as exc:
        err(f"check_polypeptide_fk failed: {exc}")


# ── check 7: reference_associations FK ───────────────────────────────────────

def check_ref_fk():
    """Check 7: every ref_pk in reference_associations exists in references.

    Orphaned rows are reported as one error; any failure while loading or
    scanning the files is reported as an error too.
    """
    print("\n[7] reference_associations -> references FK")
    try:
        known_pks = col_set(read_csv("references"), "ref_pk")
        links = read_csv("reference_associations")
        orphaned = 0
        for link in links:
            if link["ref_pk"] not in known_pks:
                orphaned += 1
        if orphaned == 0:
            ok(f"reference_associations: all {len(links):,} ref_pk values found")
        else:
            err(f"reference_associations: {orphaned:,} rows with unknown ref_pk")
    except Exception as exc:
        err(f"check_ref_fk failed: {exc}")


# ── check 8: drug_ids primary coverage ───────────────────────────────────────

def check_drug_ids_coverage(drug_ids_set):
    """Check 8: every drug in the drugs table has a primary row in drug_ids.

    Args:
        drug_ids_set: set of drugbank_id values taken from the drugs table.

    A drug_ids row counts as primary when its ``is_primary`` field equals
    "true" (case-insensitive). Missing IDs are reported as a warning.
    """
    print("\n[8] drug_ids — primary ID coverage")
    try:
        di_rows = read_csv("drug_ids")
        # NOTE(review): the primary ID is read from the ``legacy_id`` column;
        # confirm against the drug_ids schema that this is the intended column.
        primary_in_di = {r["legacy_id"] for r in di_rows
                         if r.get("is_primary", "").lower() == "true"}
        missing = drug_ids_set - primary_in_di
        if missing:
            warn(f"drug_ids: {len(missing):,} primary drug IDs not found in drug_ids table")
        else:
            ok(f"drug_ids: all {len(drug_ids_set):,} primary IDs represented")
    except Exception as e:
        err(f"check_drug_ids_coverage failed: {e}")


# ── check 9: NULL drugbank_id ─────────────────────────────────────────────────

def check_no_null_ids():
    """Check 9: no row has an empty drugbank_id in any table that declares one.

    Only tables whose SCHEMA entry contains "drugbank_id" are scanned. Each
    table is checked independently, so a failure on one table is reported as
    an error and the remaining tables are still checked.
    """
    print("\n[9] NULL drugbank_id check")
    tables_with_did = []
    for name in SCHEMA:
        if "drugbank_id" in SCHEMA[name]:
            tables_with_did.append(name)
    for table in tables_with_did:
        try:
            records = read_csv(table)
            empty_count = len([rec for rec in records if not rec.get("drugbank_id")])
            if empty_count == 0:
                ok(f"{table}: no NULL drugbank_id ({len(records):,} rows)")
            else:
                err(f"{table}: {empty_count:,} rows with NULL/empty drugbank_id")
        except Exception as exc:
            err(f"{table} NULL check failed: {exc}")


# ── check 10: drug_type values ────────────────────────────────────────────────

def check_drug_type():
    """Check 10: drugs.drug_type only holds 'small molecule' or 'biotech'.

    Any other value found in the column is reported as an error; a failure
    while loading the file is reported as an error too.
    """
    print("\n[10] drugs — drug_type values")
    try:
        rows = read_csv("drugs")
        types = {r["drug_type"] for r in rows}
        valid = {"small molecule", "biotech"}
        bad = types - valid
        if bad:
            err(f"drugs: unexpected drug_type values: {bad}")
        else:
            ok(f"drugs: drug_type values = {types}")
    except Exception as e:
        err(f"check_drug_type failed: {e}")


# ── check 11: ATC code structure ──────────────────────────────────────────────

def check_atc_codes():
    """Check 11: level codes in atc_codes have plausible lengths.

    A row is flagged when its l1_code is not shorter than its atc_code (only
    tested when both are present) or when its l4_code is longer than one
    character. Flagged rows are reported as a warning.
    """
    print("\n[11] atc_codes — level hierarchy consistency")
    try:
        rows = read_csv("atc_codes")
        bad = 0
        for r in rows:
            code = r.get("atc_code") or ""
            l1c = r.get("l1_code") or ""
            l4c = r.get("l4_code") or ""
            # l1 code should be shorter than full code; l4 should be 1 char
            l1_too_long = bool(code and l1c) and len(l1c) >= len(code)
            l4_too_long = len(l4c) > 1
            # Count each offending row once, even if it fails both tests, so
            # the number matches the "rows" wording of the message below.
            if l1_too_long or l4_too_long:
                bad += 1
        if bad:
            warn(f"atc_codes: {bad} rows with unexpected level code lengths")
        else:
            ok(f"atc_codes: level hierarchy looks correct ({len(rows):,} rows)")
    except Exception as e:
        err(f"check_atc_codes failed: {e}")


# ── check 12: snp_type values ─────────────────────────────────────────────────

def check_snp_types():
    """Check 12: drug_snp_data.snp_type only holds 'effect' or 'adverse_reaction'.

    Any other value found in the column is reported as an error; a failure
    while loading the file is reported as an error too.
    """
    print("\n[12] drug_snp_data — snp_type values")
    try:
        rows = read_csv("drug_snp_data")
        types = {r["snp_type"] for r in rows}
        valid = {"effect", "adverse_reaction"}
        bad = types - valid
        if bad:
            err(f"drug_snp_data: unexpected snp_type values: {bad}")
        else:
            ok(f"drug_snp_data: snp_type values = {types} ({len(rows):,} rows)")
    except Exception as e:
        err(f"check_snp_types failed: {e}")


# ── check 13: external_identifiers entity_type ───────────────────────────────

def check_entity_types():
    """Check 13: external_identifiers.entity_type is within the allowed set.

    Allowed values are 'drug', 'drug_link', 'polypeptide' and 'salt'. Any
    other value is reported as an error; a failure while loading the file is
    reported as an error too.
    """
    print("\n[13] external_identifiers — entity_type values")
    try:
        rows = read_csv("external_identifiers")
        types = {r["entity_type"] for r in rows}
        valid = {"drug", "drug_link", "polypeptide", "salt"}
        bad = types - valid
        if bad:
            err(f"external_identifiers: unexpected entity_type values: {bad}")
        else:
            ok(f"external_identifiers: entity_type values = {types} ({len(rows):,} rows)")
    except Exception as e:
        err(f"check_entity_types failed: {e}")


# ── check 14: drug_interactants role values ───────────────────────────────────

def check_interactant_roles():
    """Check 14: drug_interactants.role is within the allowed set.

    Allowed values are 'target', 'enzyme', 'carrier' and 'transporter'. Any
    other value is reported as an error; a failure while loading the file is
    reported as an error too.
    """
    print("\n[14] drug_interactants — role values")
    try:
        rows = read_csv("drug_interactants")
        roles = {r["role"] for r in rows}
        valid = {"target", "enzyme", "carrier", "transporter"}
        bad = roles - valid
        if bad:
            err(f"drug_interactants: unexpected role values: {bad}")
        else:
            ok(f"drug_interactants: role values = {roles} ({len(rows):,} rows)")
    except Exception as e:
        err(f"check_interactant_roles failed: {e}")


# ── check 15: pathway member_type values ──────────────────────────────────────

def check_pathway_member_types():
    """Check 15: pathway_members.member_type only holds 'drug' or 'enzyme'.

    Any other value found in the column is reported as an error; a failure
    while loading the file is reported as an error too.
    """
    print("\n[15] pathway_members — member_type values")
    try:
        rows = read_csv("pathway_members")
        types = {r["member_type"] for r in rows}
        valid = {"drug", "enzyme"}
        bad = types - valid
        if bad:
            err(f"pathway_members: unexpected member_type values: {bad}")
        else:
            ok(f"pathway_members: member_type values = {types} ({len(rows):,} rows)")
    except Exception as e:
        err(f"check_pathway_member_types failed: {e}")


# ── check 16: XSD coverage summary ───────────────────────────────────────────

def check_xsd_coverage(counts):
    """Check 16: print a per-table row-count summary.

    Args:
        counts: dict mapping table name -> row count. Tables missing from the
            dict (e.g. because their file could not be counted) are shown
            as "?".

    This function only prints; it records no errors or warnings.
    """
    print("\n[16] XSD coverage summary")
    items = [
        ("drugs",            "drug entries (XSD: drug-type)"),
        ("drug_ids",         "drugbank-id elements"),
        ("drug_attributes",  "multi-valued string attrs (groups/synonyms/etc.)"),
        ("drug_properties",  "calculated + experimental properties"),
        ("external_identifiers", "external IDs + links (drug/polypeptide/salt)"),
        ("references",       "globally deduplicated references"),
        ("reference_associations", "reference context associations"),
        ("salts",            "salt forms"),
        ("products",         "marketed products"),
        ("drug_commercial_entities", "packagers + manufacturers + brands"),
        ("mixtures",         "drug mixtures"),
        ("prices",           "price entries"),
        ("categories",       "unique MeSH categories"),
        ("drug_categories",  "drug-category assignments"),
        ("dosages",          "dosage records"),
        ("atc_codes",        "ATC code entries"),
        ("patents",          "patent records"),
        ("drug_interactions","directed DDI edges"),
        ("drug_snp_data",    "SNP pharmacogenomics records"),
        ("pathways",         "unique pathways"),
        ("pathway_members",  "pathway drug/enzyme members"),
        ("reactions",        "metabolic reactions"),
        ("interactants",     "unique binding entities (BE-IDs)"),
        ("drug_interactants","drug–protein interaction records"),
        ("polypeptides",     "unique UniProt polypeptides"),
        ("interactant_polypeptides", "interactant–polypeptide links"),
        ("polypeptide_attributes",   "polypeptide synonyms/Pfam/GO"),
    ]
    for table, desc in items:
        n = counts.get(table)
        # The ',' thousands separator is only valid for numbers; applying it
        # to the "?" placeholder raises ValueError. Format the number first,
        # then right-align whichever text we ended up with.
        shown = f"{n:,}" if isinstance(n, int) else "?"
        print(f"    {shown:>10}  {desc}")


# ── main ──────────────────────────────────────────────────────────────────────

def main():
    """Run every validation check in order and print the report.

    Returns:
        The number of errors recorded in ERRORS (0 when validation passed;
        warnings do not affect the return value).
    """
    rule = "=" * 65
    print(rule)
    print("DrugBank CSV Validation Report")
    print(rule)

    check_files_exist()

    counts = check_row_counts()

    # Load drug IDs set (used in multiple checks). If drugs.csv cannot be read
    # the dependent checks still run, against an empty set.
    try:
        drug_ids_set = col_set(read_csv("drugs"), "drugbank_id")
    except Exception as e:
        err(f"Could not load drugs.csv: {e}")
        drug_ids_set = set()

    check_ddi_ids(drug_ids_set)
    check_category_fk()
    check_interactant_fk()
    check_polypeptide_fk()
    check_ref_fk()
    check_drug_ids_coverage(drug_ids_set)
    check_no_null_ids()
    check_drug_type()
    check_atc_codes()
    check_snp_types()
    check_entity_types()
    check_interactant_roles()
    check_pathway_member_types()
    check_xsd_coverage(counts)

    print("\n" + rule)
    print(f"Validation complete: {len(ERRORS)} error(s), {len(WARNINGS)} warning(s)")
    if ERRORS:
        print("\nERRORS:")
        for e in ERRORS:
            print(f"  [X] {e}")
    if WARNINGS:
        print("\nWARNINGS:")
        for w in WARNINGS:
            print(f"  [!] {w}")
    if not ERRORS and not WARNINGS:
        print("  [OK] All checks passed — data looks clean!")
    print(rule)

    return len(ERRORS)


if __name__ == "__main__":
    # main() returns the number of errors. Clamp it before exiting: POSIX
    # keeps only the low 8 bits of an exit status, so e.g. 256 errors would
    # otherwise be reported to the shell as success (0).
    sys.exit(min(main(), 255))