Spaces:

essprasad
/

CT-Chat-V2

Running

App Files Files Community

essprasad commited on Dec 7, 2025

Commit

eb85c59

verified ·

1 Parent(s): ef81201

Create variable_loader.py

Browse files

Files changed (1) hide show

core/variable_loader.py +236 -0

core/variable_loader.py ADDED Viewed

	@@ -0,0 +1,236 @@

+# core/variable_loader.py
+import os
+import glob
+import json
+import time
+import logging
+logger = logging.getLogger(__name__)
+try:
+    import pandas as pd
+except Exception:
+    pd = None
+# cache path (temporary)
+CACHE_PATH = "/tmp/ct_var_cache.json"
+CACHE_TTL_SECONDS = 60 * 60  # 1 hour; adjust as needed
+# candidate filenames / patterns to detect relevant excel files
+DEFAULT_PATTERNS = [
+    "*SDTM*.xls*", "*SDTMIG*.xls*", "*SDTM_*.xls*", "SDTM*.xls*",
+    "*ADaM*.xls*", "*ADaMIG*.xls*", "ADaM*.xls*",
+    "*CDASH*.xls*", "*CDASHIG*.xls*", "CDASH*.xls*"
+]
+# Typical column name candidates
+VAR_COL_CANDIDATES = [
+    "variable", "variable name", "varname", "var", "column", "fieldname"
+]
+LABEL_COL_CANDIDATES = [
+    "label", "variable label", "var label", "column label"
+]
+DESC_COL_CANDIDATES = [
+    "description", "definition", "long name", "comments", "notes"
+]
+ROLE_COL_CANDIDATES = [
+    "role", "type", "datatype", "origin"
+]
+def _first_existing(columns, candidates):
+    if not columns:
+        return None
+    low = {c.strip().lower(): c for c in columns}
+    for cand in candidates:
+        for k, orig in low.items():
+            if cand == k or cand in k:
+                return orig
+    return None
+def _discover_files(search_paths=None, patterns=None):
+    patterns = patterns or DEFAULT_PATTERNS
+    search_paths = search_paths or [
+        ".", "/workspace/data", "/mnt/data", os.getcwd(),
+        "/root/.cache/huggingface/hub", "/home/user/.cache/huggingface/hub",
+        "/root/.cache/huggingface/hub/datasets--essprasad--CT-Chat-Docs",
+        "/home/user/.cache/huggingface/hub/datasets--essprasad--CT-Chat-Docs",
+    ]
+    found = []
+    for base in search_paths:
+        if not base or not os.path.exists(base):
+            continue
+        for pat in patterns:
+            try:
+                matches = glob.glob(os.path.join(base, pat), recursive=True)
+                for m in matches:
+                    if os.path.isfile(m) and m.lower().endswith((".xls", ".xlsx")):
+                        found.append(os.path.abspath(m))
+            except Exception:
+                continue
+    # dedupe but keep order
+    seen = set()
+    unique = []
+    for p in found:
+        if p not in seen:
+            seen.add(p)
+            unique.append(p)
+    return unique
+def _extract_from_df(df, filename):
+    """
+    Given a dataframe, find likely variable/label/description columns and extract rows.
+    Returns list of dicts.
+    """
+    out = []
+    if df is None or df.shape[0] == 0:
+        return out
+    cols = list(df.columns)
+    term_col = _first_existing(cols, VAR_COL_CANDIDATES)
+    label_col = _first_existing(cols, LABEL_COL_CANDIDATES)
+    desc_col = _first_existing(cols, DESC_COL_CANDIDATES)
+    role_col = _first_existing(cols, ROLE_COL_CANDIDATES)
+    # If we absolutely cannot find a term column, try first column
+    if not term_col:
+        term_col = cols[0] if cols else None
+    # If there's absolutely no useful columns, give up
+    if not term_col:
+        return out
+    for _, row in df.iterrows():
+        try:
+            term = str(row.get(term_col, "") or "").strip()
+        except Exception:
+            term = ""
+        if not term:
+            continue
+        label = ""
+        desc = ""
+        role = ""
+        try:
+            label = str(row.get(label_col, "") or "").strip() if label_col in df.columns else ""
+        except Exception:
+            label = ""
+        try:
+            desc = str(row.get(desc_col, "") or "").strip() if desc_col in df.columns else ""
+        except Exception:
+            desc = ""
+        try:
+            role = str(row.get(role_col, "") or "").strip() if role_col in df.columns else ""
+        except Exception:
+            role = ""
+        # Compose a clean definition
+        parts = []
+        if label:
+            parts.append(f"Label: {label}")
+        if desc:
+            parts.append(f"Description: {desc}")
+        if role:
+            parts.append(f"Role/Origin: {role}")
+        definition = "  \n".join(parts).strip() or (label or desc or "")
+        out.append({
+            "term": term,
+            "definition": definition,
+            "file": os.path.basename(filename),
+            "type": "variable",
+            "sources": [os.path.basename(filename)]
+        })
+    return out
+def load_variable_metadata(search_paths=None, use_cache=True, verbose=True):
+    """
+    Discover SDTM/ADaM/CDASH excel files and extract variable metadata.
+    Returns list of dicts: {'term','definition','file','type','sources'}
+    """
+    # quick fail if pandas not installed
+    if pd is None:
+        logger.warning("pandas not available — variable metadata loading skipped.")
+        return []
+    # cache handling
+    try:
+        if use_cache and os.path.exists(CACHE_PATH):
+            mtime = os.path.getmtime(CACHE_PATH)
+            if time.time() - mtime < CACHE_TTL_SECONDS:
+                if verbose:
+                    logger.info("Loading variable metadata from cache: %s", CACHE_PATH)
+                with open(CACHE_PATH, "r", encoding="utf-8") as f:
+                    return json.load(f)
+    except Exception:
+        # continue if cache read fails
+        pass
+    files = _discover_files(search_paths=search_paths)
+    if verbose:
+        logger.info("Variable loader discovered %d candidate Excel files.", len(files))
+    all_entries = []
+    for fx in files:
+        try:
+            # read all sheets (ExcelFile faster for many sheets)
+            xls = pd.ExcelFile(fx)
+            # iterate sheets:
+            for sheet in xls.sheet_names:
+                try:
+                    df = pd.read_excel(fx, sheet_name=sheet)
+                    # drop rows where all cells are NaN
+                    df = df.dropna(how="all")
+                    entries = _extract_from_df(df, fx)
+                    if entries:
+                        # annotate with sheet name to improve provenance
+                        for e in entries:
+                            e["sources"].append(f"{os.path.basename(fx)}::{sheet}")
+                        all_entries.extend(entries)
+                except Exception:
+                    # try next sheet
+                    continue
+        except Exception:
+            # fallback: try single-sheet read
+            try:
+                df = pd.read_excel(fx)
+                df = df.dropna(how="all")
+                entries = _extract_from_df(df, fx)
+                all_entries.extend(entries)
+            except Exception as e:
+                logger.debug("Failed reading excel %s: %s", fx, e)
+                continue
+    # dedupe by term (keep first occurrence)
+    seen = {}
+    deduped = []
+    for e in all_entries:
+        key = (e["term"].strip().lower())
+        if key and key not in seen:
+            seen[key] = True
+            deduped.append(e)
+    # write cache
+    try:
+        with open(CACHE_PATH, "w", encoding="utf-8") as f:
+            json.dump(deduped, f, ensure_ascii=False, indent=2)
+    except Exception:
+        pass
+    if verbose:
+        logger.info("Variable loader extracted %d unique variables.", len(deduped))
+    return deduped
+if __name__ == "__main__":
+    # quick CLI for debugging
+    items = load_variable_metadata(verbose=True)
+    print(f"[variable_loader] extracted {len(items)} items")
+    if items:
+        print("Sample:", items[:5])