Spaces:
Sleeping
Sleeping
| # core/document_processor.py | |
| from typing import Dict, List | |
| import PyPDF2 | |
| import docx | |
| from pathlib import Path | |
class DocumentProcessor:
    """Handles document ingestion and preprocessing.

    All methods are static: they can be called either on the class
    (``DocumentProcessor.process_document(p)``) or on an instance.
    """

    # File extensions (lower-case, with leading dot) that can be ingested.
    supported_formats = ['.txt', '.pdf', '.docx']

    @staticmethod
    def process_document(file_path: str) -> Dict:
        """Read a document and return its text content plus file metadata.

        Args:
            file_path: Path to a ``.txt``, ``.pdf`` or ``.docx`` file. The
                extension is matched case-insensitively.

        Returns:
            A dict with two keys: ``"content"`` (the extracted text) and
            ``"metadata"`` (see ``_extract_metadata``).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file extension is not in ``supported_formats``.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        # Compare lower-cased so that e.g. "REPORT.PDF" is accepted.
        if path.suffix.lower() not in DocumentProcessor.supported_formats:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        return {
            "content": DocumentProcessor._extract_content(path),
            "metadata": DocumentProcessor._extract_metadata(path),
        }

    @staticmethod
    def _extract_content(path: Path) -> str:
        """Extract the text of ``path`` according to its extension.

        ``.txt`` files are decoded as UTF-8; PDF page texts and DOCX
        paragraph texts are each joined with a single space.

        Raises:
            ValueError: If the extension is not one of the handled formats
                (instead of silently returning ``None``).
        """
        suffix = path.suffix.lower()

        if suffix == '.txt':
            return path.read_text(encoding='utf-8')

        if suffix == '.pdf':
            with open(path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # `or ""` keeps the join safe should a page yield no text
                # object at all (e.g. None for an image-only page).
                return ' '.join(
                    page.extract_text() or '' for page in reader.pages
                )

        if suffix == '.docx':
            # Pass a plain string path, the documented input form.
            doc = docx.Document(str(path))
            return ' '.join(paragraph.text for paragraph in doc.paragraphs)

        raise ValueError(f"Unsupported file format: {path.suffix}")

    @staticmethod
    def _extract_metadata(path: Path) -> Dict:
        """Return basic filesystem metadata for ``path``.

        Returns:
            A dict with ``"filename"`` (final path component),
            ``"file_size"`` (``st_size``), ``"file_type"`` (the suffix as
            written, not lower-cased) and ``"last_modified"`` (``st_mtime``).
        """
        # Single stat() call so size and mtime come from the same snapshot.
        stats = path.stat()
        return {
            "filename": path.name,
            "file_size": stats.st_size,
            "file_type": path.suffix,
            "last_modified": stats.st_mtime,
        }