Spaces:

marwadeeb
/

ddi-checker

Sleeping

File size: 7,134 Bytes

a062f28

"""
utils.py — shared helper functions for all parser modules.
"""
import csv
import os
from config import NP, OUTPUT_DIR, SCHEMA


# ── Text helpers ─────────────────────────────────────────────────────────────

def clean(text):
    """Strip whitespace; return None for empty/None strings."""
    if text is None:
        return None
    s = str(text).strip()
    return s if s else None


def t(el, tag):
    """Get cleaned text of a direct child element (namespace-aware)."""
    if el is None:
        return None
    child = el.find(f"{NP}{tag}")
    if child is None:
        return None
    return clean(child.text)


def a(el, attr, default=None):
    """Get cleaned attribute value from element."""
    v = el.get(attr, default)
    return clean(v) if v is not None else default


def get_primary_id(drug_el):
    """Return the primary DrugBank ID for a <drug> element."""
    for id_el in drug_el.findall(f"{NP}drugbank-id"):
        if id_el.get("primary") == "true":
            return clean(id_el.text)
    # Fallback: first id if primary flag absent
    ids = drug_el.findall(f"{NP}drugbank-id")
    return clean(ids[0].text) if ids else None


# ── CSV helpers ───────────────────────────────────────────────────────────────

def open_writer(table_name):
    """Open a CSV writer for the given table; writes header row."""
    path = os.path.join(OUTPUT_DIR, f"{table_name}.csv")
    f = open(path, "w", newline="", encoding="utf-8")
    cols = SCHEMA[table_name]
    writer = csv.DictWriter(f, fieldnames=cols, extrasaction="ignore",
                            quoting=csv.QUOTE_ALL)
    writer.writeheader()
    return f, writer


def write_rows(writer, rows):
    """Write a list of row-dicts to a DictWriter (ignores unknown keys)."""
    for row in rows:
        writer.writerow(row)


# ── Reference extraction (shared by parse_references + parse_proteins) ───────

def extract_ref_list(refs_el, state):
    """
    Parse a <references> element (reference-list-type) and return:
      new_ref_rows : list[dict]  — rows for the 'references' table (newly seen)
      ref_pks      : list[int]   — ref_pk for EVERY reference found (for associations)

    Deduplication keys:
      article    → pubmed_id  (or citation truncated to 300 chars if no pubmed_id)
      textbook   → isbn + "|" + citation[:200]
      link       → url
      attachment → title + "|" + url
    """
    new_refs = []
    ref_pks = []

    if refs_el is None:
        return new_refs, ref_pks

    # ── Articles ──────────────────────────────────────────────────────────────
    articles_el = refs_el.find(f"{NP}articles")
    if articles_el is not None:
        for art in articles_el.findall(f"{NP}article"):
            pubmed_id = t(art, "pubmed-id")
            citation  = t(art, "citation")
            ref_id    = t(art, "ref-id")
            if pubmed_id:
                key = ("article", pubmed_id)
            elif citation:
                key = ("article", citation[:300])
            else:
                continue
            if key not in state.refs_seen:
                state.ref_counter += 1
                rpk = state.ref_counter
                state.refs_seen[key] = rpk
                new_refs.append({
                    "ref_pk": rpk, "ref_type": "article", "ref_id": ref_id,
                    "pubmed_id": pubmed_id, "isbn": None,
                    "title": None, "url": None, "citation": citation,
                })
            ref_pks.append(state.refs_seen[key])

    # ── Textbooks ─────────────────────────────────────────────────────────────
    textbooks_el = refs_el.find(f"{NP}textbooks")
    if textbooks_el is not None:
        for tb in textbooks_el.findall(f"{NP}textbook"):
            isbn     = t(tb, "isbn")
            citation = t(tb, "citation")
            ref_id   = t(tb, "ref-id")
            key_str  = (isbn or "") + "|" + (citation[:200] if citation else "")
            key = ("textbook", key_str)
            if not key_str.strip("|"):
                continue
            if key not in state.refs_seen:
                state.ref_counter += 1
                rpk = state.ref_counter
                state.refs_seen[key] = rpk
                new_refs.append({
                    "ref_pk": rpk, "ref_type": "textbook", "ref_id": ref_id,
                    "pubmed_id": None, "isbn": isbn,
                    "title": None, "url": None, "citation": citation,
                })
            ref_pks.append(state.refs_seen[key])

    # ── Links ─────────────────────────────────────────────────────────────────
    links_el = refs_el.find(f"{NP}links")
    if links_el is not None:
        for lnk in links_el.findall(f"{NP}link"):
            url    = t(lnk, "url")
            title  = t(lnk, "title")
            ref_id = t(lnk, "ref-id")
            if not url:
                continue
            key = ("link", url)
            if key not in state.refs_seen:
                state.ref_counter += 1
                rpk = state.ref_counter
                state.refs_seen[key] = rpk
                new_refs.append({
                    "ref_pk": rpk, "ref_type": "link", "ref_id": ref_id,
                    "pubmed_id": None, "isbn": None,
                    "title": title, "url": url, "citation": None,
                })
            ref_pks.append(state.refs_seen[key])

    # ── Attachments ───────────────────────────────────────────────────────────
    attachments_el = refs_el.find(f"{NP}attachments")
    if attachments_el is not None:
        for att in attachments_el.findall(f"{NP}attachment"):
            url    = t(att, "url")
            title  = t(att, "title")
            ref_id = t(att, "ref-id")
            key_str = (title or "") + "|" + (url or "")
            key = ("attachment", key_str)
            if not key_str.strip("|"):
                continue
            if key not in state.refs_seen:
                state.ref_counter += 1
                rpk = state.ref_counter
                state.refs_seen[key] = rpk
                new_refs.append({
                    "ref_pk": rpk, "ref_type": "attachment", "ref_id": ref_id,
                    "pubmed_id": None, "isbn": None,
                    "title": title, "url": url, "citation": None,
                })
            ref_pks.append(state.refs_seen[key])

    return new_refs, ref_pks