Spaces:
Sleeping
Sleeping
File size: 2,152 Bytes
4722db8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
from typing import List, Dict
from pathlib import Path
import pypdf
from docx import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
class DocumentProcessor:
    """Load PDF/DOCX/TXT files and split their text into overlapping chunks.

    Each chunk is returned together with metadata (source filename, chunk
    index, total chunk count) suitable for indexing in a vector store.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Args:
            chunk_size: Target maximum characters per chunk.
            chunk_overlap: Characters shared between consecutive chunks.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Prefer splitting on the largest semantic boundary first.
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        )

    def load_pdf(self, file_path: str) -> str:
        """Extract and concatenate the text of every page in a PDF."""
        with open(file_path, 'rb') as file:
            reader = pypdf.PdfReader(file)
            # extract_text() may yield None/empty for image-only pages;
            # guard with `or ""` and join once instead of quadratic `+=`.
            return "".join(
                (page.extract_text() or "") for page in reader.pages
            )

    def load_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file, one line per paragraph."""
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def load_txt(self, file_path: str) -> str:
        """Read a UTF-8 encoded text file."""
        return Path(file_path).read_text(encoding='utf-8')

    def process_document(self, file_path: str) -> List[Dict]:
        """Load a document and return its text chunks with metadata.

        Args:
            file_path: Path to a .pdf, .docx, or .txt file. The extension
                is matched case-insensitively (so ".PDF" also works).

        Returns:
            A list of dicts of the form
            {"text": chunk, "metadata": {source, chunk_index, total_chunks}}.

        Raises:
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)
        # Dispatch table keyed by lowercased extension; the original
        # case-sensitive if/elif chain rejected ".PDF", ".TXT", etc.
        loaders = {
            '.pdf': self.load_pdf,
            '.docx': self.load_docx,
            '.txt': self.load_txt,
        }
        try:
            loader = loaders[path.suffix.lower()]
        except KeyError:
            raise ValueError(f"Unsupported file type: {path.suffix}") from None
        text = loader(file_path)

        chunks = self.text_splitter.split_text(text)
        total = len(chunks)  # hoisted: invariant across the loop
        return [
            {
                "text": chunk,
                "metadata": {
                    "source": path.name,
                    "chunk_index": idx,
                    "total_chunks": total,
                },
            }
            for idx, chunk in enumerate(chunks)
        ]
|