File size: 2,658 Bytes
c38e9f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
"""Document loader with compatibility across llama-index versions.
It attempts to use `SimpleDirectoryReader` when available. If the
import fails (API changed), it falls back to a simple reader that
creates `Document` objects from TXT and PDF files.
"""
import os
import tempfile
from typing import List
try:
from llama_index.core import SimpleDirectoryReader
except Exception:
SimpleDirectoryReader = None # type: ignore
try:
from llama_index.core import Document
except Exception:
Document = None # type: ignore
import pypdf
class DocumentLoader:
@staticmethod
def _read_pdf(path: str) -> str:
try:
reader = pypdf.PdfReader(path)
texts = []
for page in reader.pages:
texts.append(page.extract_text() or "")
return "\n".join(texts)
except Exception:
return ""
@staticmethod
def load_files(uploaded_files) -> List:
"""Save uploaded files to a temp directory and load as Documents.
Uses `SimpleDirectoryReader` when available; otherwise reads
`.txt` and `.pdf` files manually and wraps them in
`llama_index.Document` objects when possible.
"""
documents = []
with tempfile.TemporaryDirectory() as temp_dir:
# Save uploaded files to temp directory
for uploaded_file in uploaded_files:
file_path = os.path.join(temp_dir, uploaded_file.name)
with open(file_path, "wb") as f:
f.write(uploaded_file.getbuffer())
# If SimpleDirectoryReader is available, prefer it
if SimpleDirectoryReader:
return SimpleDirectoryReader(temp_dir).load_data()
# Fallback: walk files and create Document objects (or raw dicts)
for root, _, files in os.walk(temp_dir):
for fname in files:
path = os.path.join(root, fname)
content = ""
if fname.lower().endswith(".pdf"):
content = DocumentLoader._read_pdf(path)
else:
try:
with open(path, "r", encoding="utf-8", errors="ignore") as fh:
content = fh.read()
except Exception:
content = ""
if Document:
documents.append(Document(text=content, metadata={"file_name": fname}))
else:
documents.append({"text": content, "file_name": fname})
return documents |