Spaces:
Sleeping
Sleeping
| from typing import List, Dict | |
| from pathlib import Path | |
| import pypdf | |
| from docx import Document | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
class DocumentProcessor:
    """Load PDF, DOCX, or TXT files and split their text into chunks.

    Each chunk is returned together with metadata describing the source
    file name and the chunk's position in the document.
    """

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """Build the text splitter used by :meth:`process_document`.

        Args:
            chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
                ``chunk_size``.
            chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``
                as ``chunk_overlap``.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Ordered from the coarsest boundary (blank line) down to the
            # empty string.
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        )

    def load_pdf(self, file_path: str) -> str:
        """Return the text of every page of a PDF, one page per line group.

        Pages are joined with a newline so the last word of one page does
        not fuse with the first word of the next.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The extracted text of all pages, newline-separated.
        """
        with open(file_path, "rb") as file:
            reader = pypdf.PdfReader(file)
            # Extraction must happen while the file is still open.
            # ``or ""`` guards against an extractor yielding None for a
            # page with no text layer (NOTE(review): confirm whether the
            # installed pypdf version can return None).
            return "\n".join(
                page.extract_text() or "" for page in reader.pages
            )

    def load_docx(self, file_path: str) -> str:
        """Return the paragraph text of a DOCX file, newline-separated.

        Only ``doc.paragraphs`` is read.

        Args:
            file_path: Path of the DOCX file to read.

        Returns:
            The text of each paragraph joined with ``"\\n"``.
        """
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def load_txt(self, file_path: str) -> str:
        """Return the full contents of a UTF-8 text file.

        Args:
            file_path: Path of the text file to read.

        Returns:
            The decoded file contents, without a leading byte-order mark.
        """
        # "utf-8-sig" decodes plain UTF-8 identically but also strips a
        # leading BOM, which would otherwise leak into the first chunk.
        with open(file_path, "r", encoding="utf-8-sig") as file:
            return file.read()

    def process_document(self, file_path: str) -> List[Dict]:
        """Load a document, split it into chunks, and attach metadata.

        The loader is chosen from the file extension, compared
        case-insensitively (``.PDF`` and ``.pdf`` are treated alike).

        Args:
            file_path: Path of a ``.pdf``, ``.docx``, or ``.txt`` file.

        Returns:
            One dict per chunk, in document order, of the form
            ``{"text": <chunk>, "metadata": {"source": <file name>,
            "chunk_index": <0-based index>, "total_chunks": <count>}}``.

        Raises:
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)

        # Dispatch on the lower-cased suffix so upper/mixed-case extensions
        # are accepted.
        loaders = {
            ".pdf": self.load_pdf,
            ".docx": self.load_docx,
            ".txt": self.load_txt,
        }
        loader = loaders.get(path.suffix.lower())
        if loader is None:
            raise ValueError(f"Unsupported file type: {path.suffix}")

        text = loader(file_path)
        chunks = self.text_splitter.split_text(text)
        total_chunks = len(chunks)

        return [
            {
                "text": chunk,
                "metadata": {
                    "source": path.name,
                    "chunk_index": idx,
                    "total_chunks": total_chunks,
                },
            }
            for idx, chunk in enumerate(chunks)
        ]