File size: 3,881 Bytes
023cf3a
 
 
 
 
 
f051f2e
023cf3a
 
 
 
 
 
1a93711
023cf3a
1a93711
023cf3a
 
1a93711
023cf3a
f051f2e
023cf3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f051f2e
023cf3a
 
 
 
 
f051f2e
023cf3a
 
 
 
 
 
 
 
 
f051f2e
023cf3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f051f2e
023cf3a
1a93711
023cf3a
 
 
 
 
 
1a93711
023cf3a
 
 
 
 
f051f2e
023cf3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# upload_ingest.py
from __future__ import annotations
import os
import json
from typing import Dict, List, Any
import pandas as pd

# Optional parsers
try:
    import pdfplumber  # noqa: F401
    _HAS_PDFPLUMBER = True
except Exception:
    _HAS_PDFPLUMBER = False

def _read_text_file(path: str) -> str:
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception:
        return ""

def _read_csv_artifact(path: str) -> Dict[str, Any]:
    # Read a manageable slice, treat everything as string to avoid dtype issues
    df = pd.read_csv(path, nrows=1000, dtype=str, low_memory=False)
    cols = list(df.columns.astype(str))
    # Build a short textual summary to help retrieval too
    preview = df.head(3).to_dict(orient="records")
    text_summary = f"CSV FILE: {os.path.basename(path)}\nCOLUMNS: {', '.join(cols)}\nSAMPLE ROWS: {json.dumps(preview)}"
    return {
        "kind": "csv",
        "name": os.path.basename(path),
        "path": path,
        "columns": cols,
        "n_rows_sampled": len(df),
        "preview_rows": preview,
        "text": text_summary,
    }

def _read_pdf_text(path: str) -> str:
    # Keep it simple; if pdfplumber missing, skip gracefully
    if not _HAS_PDFPLUMBER:
        return ""
    import pdfplumber
    out = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages[:15]:  # cap pages for speed
                t = page.extract_text() or ""
                if t.strip():
                    out.append(t)
    except Exception:
        return ""
    return "\n\n".join(out)

def _read_docx_text(path: str) -> str:
    try:
        import docx
    except Exception:
        return ""
    try:
        doc = docx.Document(path)
        return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception:
        return ""

def _read_image_text(path: str) -> str:
    # Best-effort OCR
    try:
        import pytesseract
        from PIL import Image
        img = Image.open(path)
        return pytesseract.image_to_string(img) or ""
    except Exception:
        return ""

def extract_text_from_files(paths: List[str]) -> Dict[str, Any]:
    """
    Extract retrieval text (and structured artifacts) from uploaded files.

    Returns a dict:
      {
        "chunks": [str, ...],     # text chunks for retrieval
        "artifacts": [ { structured meta }, ... ]  # e.g., CSV columns
      }
    Backward compatible: callers expecting a list of strings can use ["chunks"].
    Missing/empty path entries are skipped silently; per-file parse failures
    degrade to best-effort text rather than raising.
    """
    chunks: List[str] = []
    artifacts: List[Dict[str, Any]] = []

    for path in paths or []:
        if not path or not os.path.exists(path):
            continue
        lowered = os.path.basename(path).lower()

        if lowered.endswith(".csv"):
            try:
                artifact = _read_csv_artifact(path)
            except Exception:
                # Unparseable CSV: degrade to whatever raw text we can get.
                chunks.append(_read_text_file(path))
            else:
                artifacts.append(artifact)
                # The textual summary doubles as a retrieval chunk.
                chunks.append(artifact["text"])
            continue

        if lowered.endswith(".pdf"):
            extracted = _read_pdf_text(path)
        elif lowered.endswith(".docx"):
            extracted = _read_docx_text(path)
        elif lowered.endswith((".png", ".jpg", ".jpeg")):
            ocr = _read_image_text(path)
            extracted = f"IMAGE OCR ({os.path.basename(path)}):\n{ocr}" if ocr.strip() else ""
        else:
            # .txt/.md/.json and unknown extensions alike: read as plain text.
            extracted = _read_text_file(path)

        if extracted.strip():
            chunks.append(extracted)

    return {"chunks": chunks, "artifacts": artifacts}