Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

Rajan Sharma committed on Sep 7

Commit

023cf3a

verified ·

1 Parent(s): f20b5e0

Update upload_ingest.py

Browse files

Files changed (1) hide show

upload_ingest.py +112 -80

upload_ingest.py CHANGED Viewed

@@ -1,92 +1,124 @@
-import os, mimetypes
-from typing import List, Tuple
-import pdfplumber
-from docx import Document as DocxDocument
-from PIL import Image
-import pytesseract
-from settings import ALLOWED_EXT, ALLOWED_MIME, MAX_UPLOAD_MB, ENABLE_AV_SCAN, CLAMD_UNIX_SOCKET, CLAMD_NETWORK
-from privacy import redact_text
-# --- Optional AV scan (clamd) ---
-def _clamd_scan(path: str) -> bool:
-    if not ENABLE_AV_SCAN:
-        return True
     try:
-        import clamd
-        cd = None
-        if CLAMD_UNIX_SOCKET:
-            cd = clamd.ClamdUnixSocket(CLAMD_UNIX_SOCKET)
-        elif CLAMD_NETWORK:
-            host, port = CLAMD_NETWORK
-            cd = clamd.ClamdNetworkSocket(host, port)
-        if not cd:
-            return True
-        res = cd.scan(path)
-        # Expected: {'/path/file': ('OK', 'OK')} or ('FOUND','Eicar-Test-Signature')
-        verdict = next(iter(res.values()))[0] if isinstance(res, dict) else "OK"
-        return verdict == "OK"
     except Exception:
-        # If AV unavailable, fail open by default (configurable)
-        return True
-def _check_allowed(path: str) -> tuple[bool, str]:
-    ext = os.path.splitext(path.lower())[1]
-    if ext not in ALLOWED_EXT:
-        return False, f"Extension {ext} not allowed."
-    mime, _ = mimetypes.guess_type(path)
-    if mime not in ALLOWED_MIME:
-        return False, f"MIME {mime} not allowed."
-    size_mb = os.path.getsize(path) / (1024 * 1024)
-    if size_mb > MAX_UPLOAD_MB:
-        return False, f"File too large ({size_mb:.1f}MB > {MAX_UPLOAD_MB}MB)."
-    if not _clamd_scan(path):
-        return False, "Antivirus scan failed."
-    return True, "ok"
-def _read_text_file(path: str) -> str:
-    with open(path, "r", encoding="utf-8", errors="ignore") as f:
-        return f.read()
-def _read_docx(path: str) -> str:
-    doc = DocxDocument(path)
-    return "\n".join([p.text for p in doc.paragraphs])
-def _read_pdf(path: str) -> str:
     out = []
-    with pdfplumber.open(path) as pdf:
-        for p in pdf.pages:
-            out.append(p.extract_text() or "")
-    return "\n".join(out)
-def _read_image_ocr(path: str) -> str:
-    img = Image.open(path)
-    return pytesseract.image_to_string(img)
-def extract_text_from_files(filepaths: List[str]) -> List[Tuple[str, str]]:
     """
-    Returns a list of (safe_name, redacted_text) for approved files.
     """
-    results: List[Tuple[str, str]] = []
-    for fp in filepaths or []:
-        ok, reason = _check_allowed(fp)
-        if not ok:
-            # skip silently or raise/log upstream
-            continue
-        ext = os.path.splitext(fp.lower())[1]
-        try:
-            if ext in {".txt", ".md", ".csv"}:
-                txt = _read_text_file(fp)
-            elif ext == ".docx":
-                txt = _read_docx(fp)
-            elif ext == ".pdf":
-                txt = _read_pdf(fp)
-            elif ext in {".png", ".jpg", ".jpeg", ".webp"}:
-                txt = _read_image_ocr(fp)
-            else:
-                txt = ""
-            if txt and txt.strip():
-                results.append((os.path.basename(fp), redact_text(txt)))
-        except Exception:
             continue
-    return results

+# upload_ingest.py
+from __future__ import annotations
+import os
+import json
+from typing import Dict, List, Any
+import pandas as pd
+# Optional parsers
+try:
+    import pdfplumber  # noqa: F401
+    _HAS_PDFPLUMBER = True
+except Exception:
+    _HAS_PDFPLUMBER = False
+def _read_text_file(path: str) -> str:
     try:
+        with open(path, "r", encoding="utf-8", errors="ignore") as f:
+            return f.read()
     except Exception:
+        return ""
+def _read_csv_artifact(path: str) -> Dict[str, Any]:
+    # Read a manageable slice, treat everything as string to avoid dtype issues
+    df = pd.read_csv(path, nrows=1000, dtype=str, low_memory=False)
+    cols = list(df.columns.astype(str))
+    # Build a short textual summary to help retrieval too
+    preview = df.head(3).to_dict(orient="records")
+    text_summary = f"CSV FILE: {os.path.basename(path)}\nCOLUMNS: {', '.join(cols)}\nSAMPLE ROWS: {json.dumps(preview)}"
+    return {
+        "kind": "csv",
+        "name": os.path.basename(path),
+        "path": path,
+        "columns": cols,
+        "n_rows_sampled": len(df),
+        "preview_rows": preview,
+        "text": text_summary,
+    }
+def _read_pdf_text(path: str) -> str:
+    # Keep it simple; if pdfplumber missing, skip gracefully
+    if not _HAS_PDFPLUMBER:
+        return ""
+    import pdfplumber
     out = []
+    try:
+        with pdfplumber.open(path) as pdf:
+            for page in pdf.pages[:15]:  # cap pages for speed
+                t = page.extract_text() or ""
+                if t.strip():
+                    out.append(t)
+    except Exception:
+        return ""
+    return "\n\n".join(out)
+def _read_docx_text(path: str) -> str:
+    try:
+        import docx
+    except Exception:
+        return ""
+    try:
+        doc = docx.Document(path)
+        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+    except Exception:
+        return ""
+def _read_image_text(path: str) -> str:
+    # Best-effort OCR
+    try:
+        import pytesseract
+        from PIL import Image
+        img = Image.open(path)
+        return pytesseract.image_to_string(img) or ""
+    except Exception:
+        return ""
def extract_text_from_files(paths: List[str]) -> Dict[str, Any]:
    """Extract retrieval text and structured metadata from uploaded files.

    Files are dispatched on their lower-cased name suffix:

    * ``.csv``: parsed with ``_read_csv_artifact``; the artifact is recorded
      and its text summary becomes a chunk. If parsing fails, the raw file
      text is used instead.
    * ``.pdf`` / ``.docx``: text from the matching reader.
    * ``.txt`` / ``.md`` / ``.json``: read as plain text.
    * ``.png`` / ``.jpg`` / ``.jpeg``: OCR text, prefixed with an
      ``IMAGE OCR (<name>):`` header line.
    * anything else: read as plain text.

    Entries that are falsy or do not point at an existing regular file are
    skipped, and blank text is never added as a chunk.

    NOTE(review): this function performs no extension/size validation and no
    redaction of the extracted text; confirm that callers handle both.

    Args:
        paths: File paths to ingest; ``None`` or an empty list is allowed.

    Returns:
        A dict of the form::

            {
              "chunks": [str, ...],                # text chunks for retrieval
              "artifacts": [{structured meta}, ...]  # e.g. CSV columns
            }
    """
    chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []

    def _keep(text: str) -> None:
        # Only non-blank text is worth indexing.
        if text and text.strip():
            chunks.append(text)

    for p in paths or []:
        # isfile rather than exists: a directory cannot be parsed by any of
        # the readers and used to end up as an empty chunk on the CSV path.
        if not p or not os.path.isfile(p):
            continue
        name = os.path.basename(p).lower()
        if name.endswith(".csv"):
            try:
                art = _read_csv_artifact(p)
            except Exception:
                # Fall back to raw text, if there is any. An empty or
                # unreadable CSV must not add an empty chunk.
                _keep(_read_text_file(p))
            else:
                artifacts.append(art)
                # also add the textual summary to chunks
                chunks.append(art["text"])
        elif name.endswith(".pdf"):
            _keep(_read_pdf_text(p))
        elif name.endswith(".docx"):
            _keep(_read_docx_text(p))
        elif name.endswith((".png", ".jpg", ".jpeg")):
            txt = _read_image_text(p)
            if txt.strip():
                chunks.append(f"IMAGE OCR ({os.path.basename(p)}):\n{txt}")
        else:
            # .txt/.md/.json and unknown types alike: try to read as text
            _keep(_read_text_file(p))
    return {"chunks": chunks, "artifacts": artifacts}