# Generative_AI_Project/app/services/document_processor.py
# Arif
# Demo data added for query test
# 704b133
from typing import List, Dict
from pathlib import Path
import pypdf
from docx import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
class DocumentProcessor:
    """Load PDF, DOCX, or TXT files and split their text into chunks.

    The text of a document is extracted with a format-specific loader and
    then split by a ``RecursiveCharacterTextSplitter``. Each resulting chunk
    is returned together with metadata identifying its source file and its
    position among the chunks of that file.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """Create the text splitter used by :meth:`process_document`.

        Args:
            chunk_size: Maximum chunk size handed to the splitter (measured
                in characters by the splitter's default length function).
            chunk_overlap: Overlap between consecutive chunks, handed to the
                splitter unchanged.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Ordered from the coarsest boundary (blank line) down to the
            # finest (empty string, i.e. between any two characters).
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        )

    def load_pdf(self, file_path: str) -> str:
        """Return the text of every page of the PDF at ``file_path``.

        Page texts are concatenated in page order without any separator.
        """
        with open(file_path, 'rb') as file:
            reader = pypdf.PdfReader(file)
            # Extraction must happen while the file is still open.
            # ``or ""`` guards against an extractor returning ``None`` for a
            # page, which would otherwise make the concatenation raise.
            return "".join(page.extract_text() or "" for page in reader.pages)

    def load_docx(self, file_path: str) -> str:
        """Return the DOCX paragraphs at ``file_path`` joined by newlines."""
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def load_txt(self, file_path: str) -> str:
        """Return the full contents of the UTF-8 text file at ``file_path``."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def process_document(self, file_path: str) -> List[Dict]:
        """Load a document, split it into chunks, and attach metadata.

        Args:
            file_path: Path to a ``.pdf``, ``.docx``, or ``.txt`` file. The
                extension is matched case-insensitively.

        Returns:
            One dict per chunk, in order, of the form
            ``{"text": <chunk>, "metadata": {"source": <file name>,
            "chunk_index": <0-based index>, "total_chunks": <count>}}``.

        Raises:
            ValueError: If the file extension is not one of the supported
                types.
        """
        path = Path(file_path)

        loaders = {
            '.pdf': self.load_pdf,
            '.docx': self.load_docx,
            '.txt': self.load_txt,
        }
        # Lower-case the suffix so "REPORT.PDF" is handled like "report.pdf".
        loader = loaders.get(path.suffix.lower())
        if loader is None:
            raise ValueError(f"Unsupported file type: {path.suffix}")
        text = loader(file_path)

        chunks = self.text_splitter.split_text(text)
        total_chunks = len(chunks)

        return [
            {
                "text": chunk,
                "metadata": {
                    "source": path.name,
                    "chunk_index": idx,
                    "total_chunks": total_chunks,
                },
            }
            for idx, chunk in enumerate(chunks)
        ]