|
|
"""Document loader with compatibility across llama-index versions.

It attempts to use `SimpleDirectoryReader` when available. If the
import fails (API changed), it falls back to a simple reader that
creates `Document` objects from TXT and PDF files.
"""
|
|
|
|
|
import os |
|
|
import tempfile |
|
|
from typing import List |
|
|
|
|
|
try: |
|
|
from llama_index.core import SimpleDirectoryReader |
|
|
except Exception: |
|
|
SimpleDirectoryReader = None |
|
|
|
|
|
try: |
|
|
from llama_index.core import Document |
|
|
except Exception: |
|
|
Document = None |
|
|
|
|
|
import pypdf |
|
|
|
|
|
|
|
|
class DocumentLoader:
    """Convert uploaded file objects into llama-index documents.

    All methods are static; the class only serves as a namespace.
    """

    @staticmethod
    def _read_pdf(path: str) -> str:
        """Return the text of the PDF at ``path``, pages joined by newlines.

        Extraction is best-effort: a page without extractable text
        contributes an empty string, and any failure while opening or
        parsing the file yields ``""`` instead of raising.
        """
        try:
            reader = pypdf.PdfReader(path)
            return "\n".join(page.extract_text() or "" for page in reader.pages)
        except Exception:
            # Deliberately broad: one unreadable PDF must not abort the
            # whole batch, so the caller gets an empty document instead.
            return ""

    @staticmethod
    def _safe_name(raw_name) -> str:
        """Reduce an upload's name to a bare file name.

        Upload names are untrusted. An absolute path or a name containing
        ``../`` would otherwise make ``os.path.join`` point outside the
        temporary directory, so only the final path component is kept.

        Raises:
            ValueError: if nothing usable remains (empty, ``.`` or ``..``).
        """
        name = os.path.basename(os.fspath(raw_name))
        if name in ("", ".", ".."):
            raise ValueError(f"Unusable upload file name: {raw_name!r}")
        return name

    @staticmethod
    def load_files(uploaded_files) -> List:
        """Save uploaded files to a temp directory and load them as documents.

        Args:
            uploaded_files: iterable of objects exposing a ``name`` attribute
                and a ``getbuffer()`` method returning the file's bytes.
                ``None`` or an empty iterable is accepted.

        Returns:
            An empty list when there is nothing to load. Otherwise whatever
            ``SimpleDirectoryReader(...).load_data()`` returns when that
            reader was importable. In the fallback path, one entry per saved
            file: a ``Document`` when that class was importable, else a dict
            with ``"text"`` and ``"file_name"`` keys. The fallback extracts
            ``.pdf`` files with pypdf and reads every other file as UTF-8
            text, ignoring undecodable bytes.

        Raises:
            ValueError: if an upload's name has no usable file-name component.
        """
        uploads = list(uploaded_files or [])
        if not uploads:
            # SimpleDirectoryReader raises on a directory without files, so
            # empty input is answered here without touching the disk.
            return []

        documents = []
        with tempfile.TemporaryDirectory() as temp_dir:
            for uploaded_file in uploads:
                file_path = os.path.join(
                    temp_dir, DocumentLoader._safe_name(uploaded_file.name)
                )
                with open(file_path, "wb") as f:
                    f.write(uploaded_file.getbuffer())

            # Must run inside the ``with``: the files vanish on exit.
            if SimpleDirectoryReader:
                return SimpleDirectoryReader(temp_dir).load_data()

            for root, _, files in os.walk(temp_dir):
                # Sorted so the fallback's output order is deterministic.
                for fname in sorted(files):
                    path = os.path.join(root, fname)
                    if fname.lower().endswith(".pdf"):
                        content = DocumentLoader._read_pdf(path)
                    else:
                        try:
                            with open(
                                path, "r", encoding="utf-8", errors="ignore"
                            ) as fh:
                                content = fh.read()
                        except Exception:
                            # Best-effort, mirroring _read_pdf: an unreadable
                            # file becomes an empty document.
                            content = ""

                    if Document:
                        documents.append(
                            Document(text=content, metadata={"file_name": fname})
                        )
                    else:
                        documents.append({"text": content, "file_name": fname})

        return documents