Spaces:
Sleeping
Sleeping
| # upload_ingest.py | |
| from __future__ import annotations | |
| import os | |
| import json | |
| from typing import Dict, List, Any | |
| import pandas as pd | |
| # Optional parsers | |
| try: | |
| import pdfplumber # noqa: F401 | |
| _HAS_PDFPLUMBER = True | |
| except Exception: | |
| _HAS_PDFPLUMBER = False | |
| def _read_text_file(path: str) -> str: | |
| try: | |
| with open(path, "r", encoding="utf-8", errors="ignore") as f: | |
| return f.read() | |
| except Exception: | |
| return "" | |
| def _read_csv_artifact(path: str) -> Dict[str, Any]: | |
| # Read a manageable slice, treat everything as string to avoid dtype issues | |
| df = pd.read_csv(path, nrows=1000, dtype=str, low_memory=False) | |
| cols = list(df.columns.astype(str)) | |
| # Build a short textual summary to help retrieval too | |
| preview = df.head(3).to_dict(orient="records") | |
| text_summary = f"CSV FILE: {os.path.basename(path)}\nCOLUMNS: {', '.join(cols)}\nSAMPLE ROWS: {json.dumps(preview)}" | |
| return { | |
| "kind": "csv", | |
| "name": os.path.basename(path), | |
| "path": path, | |
| "columns": cols, | |
| "n_rows_sampled": len(df), | |
| "preview_rows": preview, | |
| "text": text_summary, | |
| } | |
def _read_pdf_text(path: str) -> str:
    """Extract text from the first 15 pages of a PDF.

    Returns "" when pdfplumber is unavailable or the file cannot be parsed.
    """
    if not _HAS_PDFPLUMBER:
        # Parser missing — skip gracefully rather than raise.
        return ""
    import pdfplumber

    page_texts = []
    try:
        with pdfplumber.open(path) as pdf:
            # Cap at 15 pages to keep ingestion fast on large documents.
            for page in pdf.pages[:15]:
                extracted = page.extract_text() or ""
                if extracted.strip():
                    page_texts.append(extracted)
    except Exception:
        return ""
    return "\n\n".join(page_texts)
| def _read_docx_text(path: str) -> str: | |
| try: | |
| import docx | |
| except Exception: | |
| return "" | |
| try: | |
| doc = docx.Document(path) | |
| return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) | |
| except Exception: | |
| return "" | |
| def _read_image_text(path: str) -> str: | |
| # Best-effort OCR | |
| try: | |
| import pytesseract | |
| from PIL import Image | |
| img = Image.open(path) | |
| return pytesseract.image_to_string(img) or "" | |
| except Exception: | |
| return "" | |
def extract_text_from_files(paths: List[str]) -> Dict[str, Any]:
    """
    Extract retrieval text and structured artifacts from uploaded files.

    Returns a dict:
        {
            "chunks": [str, ...],      # text chunks for retrieval
            "artifacts": [ { structured meta }, ... ]  # e.g., CSV columns
        }
    Backward compatible: callers expecting a list of strings can use ["chunks"].

    Nonexistent or empty paths are skipped; a chunk is only added when it
    contains non-whitespace text.
    """
    chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    for p in paths or []:
        if not p or not os.path.exists(p):
            continue
        # Dispatch on lowercased extension so ".CSV" etc. are handled too.
        name = os.path.basename(p).lower()
        if name.endswith(".csv"):
            try:
                art = _read_csv_artifact(p)
                artifacts.append(art)
                # The textual summary doubles as a retrieval chunk.
                chunks.append(art["text"])
            except Exception:
                # Fall back to raw text, but only keep non-empty results
                # (bug fix: previously an empty fallback string could be
                # appended, producing a useless blank chunk).
                txt = _read_text_file(p)
                if txt.strip():
                    chunks.append(txt)
        elif name.endswith(".pdf"):
            txt = _read_pdf_text(p)
            if txt.strip():
                chunks.append(txt)
        elif name.endswith(".docx"):
            txt = _read_docx_text(p)
            if txt.strip():
                chunks.append(txt)
        elif name.endswith((".txt", ".md", ".json")):
            txt = _read_text_file(p)
            if txt.strip():
                chunks.append(txt)
        elif name.endswith((".png", ".jpg", ".jpeg")):
            txt = _read_image_text(p)
            if txt.strip():
                chunks.append(f"IMAGE OCR ({os.path.basename(p)}):\n{txt}")
        else:
            # Unknown type: try to read as plain text.
            txt = _read_text_file(p)
            if txt.strip():
                chunks.append(txt)
    return {"chunks": chunks, "artifacts": artifacts}