"""
document_loader.py
Parses uploaded files (PDF, DOCX, TXT/MD) into plain text.
"""
import os
from pathlib import Path
def load_documents(file_paths: list[str]) -> list[dict]:
    """
    Parse each file path into a dict of the form
    { "source": filename, "text": full text content }.

    Supported extensions: .pdf, .docx, .txt, .md, .csv.
    Unsupported, empty, or unreadable files are skipped with a console note.
    """
    parsed: list[dict] = []
    for file_path in file_paths:
        if file_path is None:
            continue
        suffix = Path(file_path).suffix.lower()
        filename = Path(file_path).name
        try:
            if suffix == ".pdf":
                content = _load_pdf(file_path)
            elif suffix == ".docx":
                content = _load_docx(file_path)
            elif suffix in (".txt", ".md", ".csv"):
                content = _load_text(file_path)
            else:
                print(f"[Loader] Unsupported file type: {suffix} — skipping {filename}")
                continue
        except Exception as exc:
            # Best-effort loading: one bad file must not abort the batch.
            print(f"[Loader] Failed to load {filename}: {exc}")
            continue
        if content.strip():
            parsed.append({"source": filename, "text": content})
        else:
            print(f"[Loader] Empty content from {filename} — skipping")
    return parsed
def _load_pdf(path: str) -> str:
    """Extract plain text from a PDF via PyMuPDF.

    Returns per-page text joined by newlines.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(path)
    try:
        return "\n".join(page.get_text("text") for page in doc)
    finally:
        # Original leaked the document handle if extraction raised;
        # always release it.
        doc.close()
def _load_docx(path: str) -> str:
    """Extract text from a .docx file: body paragraphs plus table content.

    Each table row is flattened to tab-separated cell text so tabular
    data is not dropped.
    """
    from docx import Document

    document = Document(path)
    chunks: list[str] = []
    # Body paragraphs (existing)
    for paragraph in document.paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            chunks.append(stripped)
    # Tables — previously skipped entirely
    for tbl in document.tables:
        for tbl_row in tbl.rows:
            cell_texts = [c.text.strip() for c in tbl_row.cells if c.text.strip()]
            if cell_texts:
                chunks.append("\t".join(cell_texts))
    return "\n".join(chunks)
def _load_text(path: str) -> str:
"""Load plain text files. CSVs are parsed into natural-language row sentences."""
ext = Path(path).suffix.lower()
if ext == ".csv":
return _load_csv(path)
with open(path, "r", encoding="utf-8", errors="ignore") as f:
return f.read()
def _load_csv(path: str) -> str:
"""
Parse a CSV file into natural-language sentences.
Each row becomes: "ColumnA: value1. ColumnB: value2. ..."
This makes tabular data semantically meaningful to the LLM rather
than presenting it as raw comma-separated text.
"""
import csv
rows: list[str] = []
with open(path, "r", encoding="utf-8", errors="ignore", newline="") as f:
reader = csv.DictReader(f)
if reader.fieldnames is None:
# Fallback to raw text for headerless CSVs
f.seek(0)
return f.read()
for row in reader:
parts = [f"{col}: {val.strip()}" for col, val in row.items() if val and val.strip()]
if parts:
rows.append(". ".join(parts) + ".")
return "\n".join(rows)