# upload_ingest.py
from __future__ import annotations

import os
import json
from typing import Dict, List, Any

import pandas as pd

# Optional parsers
try:
    import pdfplumber  # noqa: F401
    _HAS_PDFPLUMBER = True
except Exception:
    _HAS_PDFPLUMBER = False

# Sampling caps that keep ingestion fast on large uploads.
_CSV_SAMPLE_ROWS = 1000   # max rows read from a CSV
_CSV_PREVIEW_ROWS = 3     # rows embedded in the CSV text summary
_PDF_MAX_PAGES = 15       # max pages extracted from a PDF


def _read_text_file(path: str) -> str:
    """Return the contents of *path* decoded as UTF-8, or "" on any failure.

    Undecodable bytes are dropped (``errors="ignore"``) instead of raising.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception:
        # Best-effort reader: an unreadable file simply contributes no text.
        return ""


def _read_csv_artifact(path: str) -> Dict[str, Any]:
    """Build a structured artifact describing the CSV at *path*.

    Only the first ``_CSV_SAMPLE_ROWS`` rows are read, and every column is
    loaded as ``str`` to avoid dtype issues.

    Returns:
        A dict with the keys ``kind``, ``name``, ``path``, ``columns``,
        ``n_rows_sampled``, ``preview_rows`` and ``text`` (a short textual
        summary suitable for retrieval).

    Raises:
        Whatever ``pandas.read_csv`` raises for a missing, empty or
        malformed file; the caller is expected to handle it.
    """
    # Read a manageable slice, treat everything as string to avoid dtype issues
    df = pd.read_csv(path, nrows=_CSV_SAMPLE_ROWS, dtype=str, low_memory=False)
    cols = list(df.columns.astype(str))
    name = os.path.basename(path)

    # Build a short textual summary to help retrieval too
    preview = df.head(_CSV_PREVIEW_ROWS).to_dict(orient="records")
    text_summary = (
        f"CSV FILE: {name}\n"
        f"COLUMNS: {', '.join(cols)}\n"
        f"SAMPLE ROWS: {json.dumps(preview)}"
    )

    return {
        "kind": "csv",
        "name": name,
        "path": path,
        "columns": cols,
        "n_rows_sampled": len(df),
        "preview_rows": preview,
        "text": text_summary,
    }


def _read_pdf_text(path: str) -> str:
    """Return text from the first ``_PDF_MAX_PAGES`` pages of the PDF at *path*.

    Pages with no extractable text are skipped and the remaining pages are
    joined with blank lines. Returns "" when pdfplumber is not installed or
    when the file cannot be opened or parsed.
    """
    # Keep it simple; if pdfplumber missing, skip gracefully
    if not _HAS_PDFPLUMBER:
        return ""
    import pdfplumber

    out = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages[:_PDF_MAX_PAGES]:  # cap pages for speed
                t = page.extract_text() or ""
                if t.strip():
                    out.append(t)
    except Exception:
        # A failure part-way through discards the pages gathered so far.
        return ""
    return "\n\n".join(out)


def _read_docx_text(path: str) -> str:
    """Return the non-blank paragraphs of the .docx at *path*, newline-joined.

    Returns "" when the ``docx`` module is unavailable or the document
    cannot be read.
    """
    try:
        import docx
    except Exception:
        return ""
    try:
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception:
        return ""


def _read_image_text(path: str) -> str:
    """Return best-effort OCR text for the image at *path*.

    Returns "" when pytesseract or PIL is unavailable, or when opening the
    image or running OCR fails.
    """
    # Best-effort OCR
    try:
        import pytesseract
        from PIL import Image

        # Context manager closes the underlying file handle once OCR is done.
        with Image.open(path) as img:
            return pytesseract.image_to_string(img) or ""
    except Exception:
        return ""


def extract_text_from_files(paths: List[str]) -> Dict[str, Any]:
    """Extract retrievable text and structured metadata from uploaded files.

    Each path is dispatched on its lower-cased file extension: ``.csv``,
    ``.pdf``, ``.docx``, ``.txt``/``.md``/``.json`` and
    ``.png``/``.jpg``/``.jpeg``. Any other extension is read as plain text.
    Falsy or non-existent paths are skipped, and files that yield only
    whitespace contribute no chunk.

    Args:
        paths: File paths to ingest; ``None`` or an empty list is accepted.

    Returns:
        A dict:
        {
          "chunks": [str, ...],  # text chunks for retrieval
          "artifacts": [ { structured meta }, ... ]  # e.g., CSV columns
        }
        Backward compatible: callers expecting a list of strings can use
        ["chunks"].
    """
    chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []

    for p in paths or []:
        if not p or not os.path.exists(p):
            continue
        base = os.path.basename(p)
        name = base.lower()

        if name.endswith(".csv"):
            try:
                art = _read_csv_artifact(p)
            except Exception:
                # fall back to raw text if any; skip blank results so an
                # empty or unreadable CSV does not add an empty chunk
                txt = _read_text_file(p)
                if txt.strip():
                    chunks.append(txt)
            else:
                artifacts.append(art)
                # also add the textual summary to chunks
                chunks.append(art["text"])
            continue

        if name.endswith(".pdf"):
            txt = _read_pdf_text(p)
        elif name.endswith(".docx"):
            txt = _read_docx_text(p)
        elif name.endswith((".png", ".jpg", ".jpeg")):
            txt = _read_image_text(p)
            if txt.strip():
                chunks.append(f"IMAGE OCR ({base}):\n{txt}")
            continue
        else:
            # .txt / .md / .json and unknown types: try to read as text
            txt = _read_text_file(p)

        if txt.strip():
            chunks.append(txt)

    return {"chunks": chunks, "artifacts": artifacts}