Spaces:
Sleeping
Sleeping
| """ | |
| Text Extractor Module | |
| Handles extracting text content from PDF files. | |
| """ | |
| import pdfplumber | |
class TextExtractor:
    """Extract text from PDF files and sanity-check the extracted result."""

    # A text must contain strictly more than this many alphabetic characters
    # for validate_extracted_text() to accept it.
    ALPHA_CHAR_THRESHOLD = 50

    def __init__(self):
        """Initialize the text extractor (no instance state is set up)."""

    async def extract_text_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from PDF file.

        Pages are visited in order. Each page that yields text contributes a
        ``--- Page N ---`` header (1-based page number) followed by its text;
        pages for which ``extract_text()`` returns a falsy value are skipped
        without a header.

        NOTE: nothing in this coroutine is awaited, so the pdfplumber calls
        run synchronously inside it.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            str: Extracted text content (empty string if no page yields text)

        Raises:
            Exception: If text extraction fails; the underlying error is
                attached as ``__cause__``.
        """
        print("📖 Extracting text from PDF...")
        try:
            # Collect the pieces and join once instead of growing a string
            # with repeated ``+=``.
            parts = []
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text()
                    if text:
                        parts.append(f"\n--- Page {page_num} ---\n")
                        parts.append(text)
            full_text = "".join(parts)
            print(f"✅ Extracted {len(full_text)} characters from PDF")
            return full_text
        except Exception as e:
            # Chain explicitly so the original traceback is kept as the cause.
            raise Exception(f"Failed to extract text from PDF: {e}") from e

    def validate_extracted_text(self, text: str) -> bool:
        """
        Validate that extracted text is not empty and contains meaningful content.

        Args:
            text: The extracted text to validate

        Returns:
            bool: True if text has more than ``ALPHA_CHAR_THRESHOLD`` (50)
                alphabetic characters, False otherwise (including for empty,
                ``None`` or whitespace-only input)
        """
        if not text or not text.strip():
            return False
        # Count alphabetic characters; digits, punctuation and whitespace
        # do not count towards the threshold.
        alphabetic_chars = sum(1 for char in text if char.isalpha())
        return alphabetic_chars > self.ALPHA_CHAR_THRESHOLD