Spaces:
Sleeping
Sleeping
| import pypdf | |
| import pdfplumber | |
| from pathlib import Path | |
| from typing import List, Tuple | |
| from backend.app.utils.chunking import intelligent_chunk, create_chunk_metadata | |
class PDFProcessor:
    """Extract text from PDF files and split it into metadata-tagged chunks.

    Extraction is attempted with pdfplumber first; if that raises, pypdf is
    used as a fallback.
    """

    def __init__(self):
        # Largest file size accepted by validate_file, in bytes (10 MiB).
        self.max_file_size = 10 * 1024 * 1024

    @staticmethod
    def _collect_pages(pages) -> List[Tuple[str, int]]:
        """Return ``(text, page_number)`` for every page with non-blank text.

        Page numbers are 1-based. ``pages`` is any iterable of objects that
        expose ``extract_text()``.
        """
        collected = []
        for number, page in enumerate(pages, 1):
            # extract_text() may return None; normalise to an empty string.
            text = page.extract_text() or ""
            if text.strip():
                collected.append((text, number))
        return collected

    def _extract_with_pdfplumber(self, file_path: Path) -> List[Tuple[str, int]]:
        """Extract the non-blank pages of ``file_path`` using pdfplumber."""
        with pdfplumber.open(file_path) as pdf:
            # Pages are read while the document is still open.
            return self._collect_pages(pdf.pages)

    def _extract_with_pypdf(self, file_path: Path) -> List[Tuple[str, int]]:
        """Extract the non-blank pages of ``file_path`` using pypdf."""
        with open(file_path, "rb") as file:
            reader = pypdf.PdfReader(file)
            return self._collect_pages(reader.pages)

    async def extract_text(self, file_path: Path) -> List[Tuple[str, int]]:
        """Return the non-blank pages of a PDF as ``(text, page_number)`` pairs.

        pdfplumber is tried first; on any error the whole document is
        re-read with pypdf. Pages whose text is empty or whitespace-only are
        omitted, and page numbers are 1-based.

        NOTE: both libraries are called synchronously, so this coroutine
        does not yield to the event loop while a file is being parsed.

        Args:
            file_path: Location of the PDF file.

        Returns:
            A list of ``(text, page_number)`` tuples in page order.

        Raises:
            Exception: If both pdfplumber and pypdf fail; the pypdf error is
                attached as the cause.
        """
        try:
            return self._extract_with_pdfplumber(file_path)
        except Exception:
            # Each backend builds its own list, so pages that pdfplumber
            # managed to read before failing are discarded here instead of
            # being duplicated by the fallback pass.
            try:
                return self._extract_with_pypdf(file_path)
            except Exception as fallback_error:
                raise Exception(
                    f"Failed to extract text: {fallback_error}"
                ) from fallback_error

    async def process_document(self, file_path: Path, document_id: str) -> List[dict]:
        """Extract a PDF's text, chunk it, and attach metadata to each chunk.

        The page texts are joined with blank lines and passed to
        ``intelligent_chunk`` with ``chunk_size=512`` and ``overlap=50``
        (units are defined by that helper).

        Args:
            file_path: Location of the PDF file.
            document_id: Identifier forwarded to ``create_chunk_metadata``.

        Returns:
            One dict per chunk with the keys ``"text"`` and ``"metadata"``.

        Raises:
            Exception: Propagated from ``extract_text`` when extraction fails.
        """
        pages_text = await self.extract_text(file_path)
        full_text = "\n\n".join(text for text, _ in pages_text)
        chunks = intelligent_chunk(text=full_text, chunk_size=512, overlap=50)
        total_chunks = len(chunks)
        return [
            {
                "text": chunk,
                "metadata": create_chunk_metadata(
                    document_id=document_id,
                    chunk_index=idx,
                    page_number=self._find_page_number(chunk, pages_text),
                    total_chunks=total_chunks,
                ),
            }
            for idx, chunk in enumerate(chunks)
        ]

    def _find_page_number(self, chunk: str, pages_text: List[Tuple[str, int]]) -> int:
        """Return the number of the first page containing the chunk's start.

        Only the first 50 characters of ``chunk`` are searched for.

        Args:
            chunk: Chunk text to locate.
            pages_text: ``(text, page_number)`` pairs to search, in order.

        Returns:
            The matching page number, or 0 when the chunk is empty or its
            prefix is not found on any page.
        """
        chunk_start = chunk[:50]
        if not chunk_start:
            # An empty prefix is a substring of every page and would be
            # attributed to the first page by accident.
            return 0
        for text, page_num in pages_text:
            if chunk_start in text:
                return page_num
        return 0

    def validate_file(self, file_path: Path) -> bool:
        """Check that ``file_path`` exists, is small enough, and parses as PDF.

        Args:
            file_path: Location of the file to check.

        Returns:
            True when every check passes.

        Raises:
            Exception: If the file is missing, is larger than
                ``max_file_size`` bytes, or cannot be opened by pypdf (the
                underlying error is attached as the cause).
        """
        if not file_path.exists():
            raise Exception("File does not exist")
        if file_path.stat().st_size > self.max_file_size:
            raise Exception("File size exceeds limit")
        try:
            with open(file_path, "rb") as f:
                # Constructing the reader is the validity check.
                pypdf.PdfReader(f)
        except Exception as err:
            raise Exception("Invalid PDF file") from err
        return True
# Module-level PDFProcessor instance created at import time.
pdf_processor = PDFProcessor()