Spaces:
Sleeping
Sleeping
File size: 2,232 Bytes
f974658 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
import os
from typing import List, Dict, Any
import tempfile
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
class DocumentLoader:
"""Handles loading and processing PDF documents"""
def __init__(self, document_dir: str = "documents"):
self.document_dir = document_dir
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
length_function=len,
)
# Create documents directory if it doesn't exist
os.makedirs(document_dir, exist_ok=True)
def load_pdf(self, file_path: str) -> List[Document]:
"""Load and split a PDF document into chunks"""
try:
loader = PyPDFLoader(file_path)
documents = loader.load()
return self.text_splitter.split_documents(documents)
except Exception as e:
print(f"Error loading PDF: {str(e)}")
return []
def save_uploaded_pdf(self, uploaded_file) -> str:
"""Save an uploaded PDF file with its original name and return its path"""
try:
# Make sure document_dir exists
os.makedirs(self.document_dir, exist_ok=True)
# Sanitize the original filename to prevent path traversal or special characters
safe_filename = os.path.basename(uploaded_file.name)
save_path = os.path.join(self.document_dir, safe_filename)
# Save file content
with open(save_path, 'wb') as f:
f.write(uploaded_file.getvalue())
return save_path
except Exception as e:
print(f"Error saving uploaded PDF: {str(e)}")
return ""
def get_available_documents(self) -> List[str]:
"""Get a list of available PDF documents"""
try:
return [f for f in os.listdir(self.document_dir) if f.endswith('.pdf')]
except Exception as e:
print(f"Error listing documents: {str(e)}")
return []
|