# upload_ingest.py
from __future__ import annotations
import os
import json
from typing import Dict, List, Any
import pandas as pd
# Optional parsers: pdfplumber is a soft dependency — detect it once at import
# time so the PDF path can bail out gracefully when it is missing.
try:
    import pdfplumber  # noqa: F401
except Exception:
    _HAS_PDFPLUMBER = False
else:
    _HAS_PDFPLUMBER = True
def _read_text_file(path: str) -> str:
try:
with open(path, "r", encoding="utf-8", errors="ignore") as f:
return f.read()
except Exception:
return ""
def _read_csv_artifact(path: str) -> Dict[str, Any]:
# Read a manageable slice, treat everything as string to avoid dtype issues
df = pd.read_csv(path, nrows=1000, dtype=str, low_memory=False)
cols = list(df.columns.astype(str))
# Build a short textual summary to help retrieval too
preview = df.head(3).to_dict(orient="records")
text_summary = f"CSV FILE: {os.path.basename(path)}\nCOLUMNS: {', '.join(cols)}\nSAMPLE ROWS: {json.dumps(preview)}"
return {
"kind": "csv",
"name": os.path.basename(path),
"path": path,
"columns": cols,
"n_rows_sampled": len(df),
"preview_rows": preview,
"text": text_summary,
}
def _read_pdf_text(path: str) -> str:
    """Extract text from the first 15 pages of a PDF; "" on any failure.

    Skips gracefully (returns "") when pdfplumber was not importable at
    module load, and swallows any parse error — one bad PDF must not
    abort ingestion.
    """
    if not _HAS_PDFPLUMBER:
        return ""
    import pdfplumber

    pages_text: List[str] = []
    try:
        with pdfplumber.open(path) as doc:
            # Page cap keeps ingestion fast on huge documents.
            for page in doc.pages[:15]:
                extracted = page.extract_text() or ""
                if extracted.strip():
                    pages_text.append(extracted)
    except Exception:
        return ""
    return "\n\n".join(pages_text)
def _read_docx_text(path: str) -> str:
try:
import docx
except Exception:
return ""
try:
doc = docx.Document(path)
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
except Exception:
return ""
def _read_image_text(path: str) -> str:
# Best-effort OCR
try:
import pytesseract
from PIL import Image
img = Image.open(path)
return pytesseract.image_to_string(img) or ""
except Exception:
return ""
def extract_text_from_files(paths: List[str]) -> Dict[str, Any]:
    """
    Returns a dict:
      {
        "chunks": [str, ...],                       # text chunks for retrieval
        "artifacts": [ { structured meta }, ... ]   # e.g., CSV columns
      }
    Backward compatible: callers expecting a list of strings can use ["chunks"].
    """
    chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []
    for path in paths or []:
        # Skip empty entries and paths that do not exist on disk.
        if not path or not os.path.exists(path):
            continue
        lowered = os.path.basename(path).lower()
        if lowered.endswith(".csv"):
            try:
                artifact = _read_csv_artifact(path)
            except Exception:
                # Malformed CSV: fall back to a raw text read.
                chunks.append(_read_text_file(path))
            else:
                artifacts.append(artifact)
                # The textual summary doubles as a retrieval chunk.
                chunks.append(artifact["text"])
            continue
        if lowered.endswith(".pdf"):
            extracted = _read_pdf_text(path)
        elif lowered.endswith(".docx"):
            extracted = _read_docx_text(path)
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            ocr = _read_image_text(path)
            # Label OCR output so the retriever knows its provenance.
            extracted = (
                f"IMAGE OCR ({os.path.basename(path)}):\n{ocr}"
                if ocr.strip()
                else ""
            )
        else:
            # .txt/.md/.json and any unrecognized type: plain-text attempt.
            extracted = _read_text_file(path)
        if extracted.strip():
            chunks.append(extracted)
    return {"chunks": chunks, "artifacts": artifacts}