"""Extract plain text from PDF, DOCX, TXT, or raw input and compute text stats."""

import os
import re
from typing import Tuple, Optional
from pathlib import Path

from pypdf import PdfReader
from docx import Document

# Matches runs of one or more code points from the emoji-related Unicode
# ranges listed below.
_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # misc symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map
    "\U0001F1E0-\U0001F1FF"  # regional indicator letters (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols
    "\U0001FA00-\U0001FA6F"  # chess / extended pictographic
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "]+",
    flags=re.UNICODE,
)


class FileParser:
    """
    Parse multiple file formats and extract text.

    Supports: PDF, DOCX, TXT, and raw text input.

    Every parser returns a ``(text, format, error)`` tuple instead of raising:
    on success ``error`` is ``None``; on failure ``text`` is ``""`` and
    ``error`` holds the exception that was caught.
    """

    SUPPORTED_FORMATS = {".pdf", ".docx", ".doc", ".txt"}

    @staticmethod
    def parse_file(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Parse a file and extract text, choosing the parser by file extension.

        Args:
            file_path: Path to the file

        Returns:
            Tuple of (text, format, error)
            - text: Extracted text content
            - format: File format (pdf, docx, txt); empty string when the
              extension is not supported
            - error: Exception if parsing failed, None if successful
        """
        # The extension check is case-insensitive.
        file_extension = Path(file_path).suffix.lower()

        if file_extension not in FileParser.SUPPORTED_FORMATS:
            error = ValueError(f"Unsupported file format: {file_extension}")
            return "", "", error

        if file_extension == ".pdf":
            return FileParser.parse_pdf(file_path)
        if file_extension in {".docx", ".doc"}:
            # NOTE(review): legacy ".doc" files are handed to the DOCX reader;
            # confirm that reader can actually open them. Any failure comes
            # back as the error element with format "docx".
            return FileParser.parse_docx(file_path)
        if file_extension == ".txt":
            return FileParser.parse_txt(file_path)

        # Only reachable if SUPPORTED_FORMATS lists an extension that has no
        # branch above.
        return "", "", ValueError("Unknown error")

    @staticmethod
    def parse_pdf(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from PDF file.

        The text of all pages is joined with newlines and stripped.

        Args:
            file_path: Path to the PDF file

        Returns:
            Tuple of (text, "pdf", error)
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                pdf_reader = PdfReader(pdf_file)
                # Defensive: treat a missing (None) extraction result as empty
                # text rather than failing the whole document.
                page_texts = [
                    page.extract_text() or "" for page in pdf_reader.pages
                ]
            return "\n".join(page_texts).strip(), "pdf", None
        except Exception as e:
            return "", "pdf", e

    @staticmethod
    def parse_docx(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from DOCX file.

        Paragraph text comes first, followed by the text of every table cell
        (table by table, row by row); pieces are joined with newlines and the
        result is stripped.

        Args:
            file_path: Path to the DOCX file

        Returns:
            Tuple of (text, "docx", error)
        """
        try:
            doc = Document(file_path)

            # Text from all paragraphs
            pieces = [paragraph.text for paragraph in doc.paragraphs]

            # Also text from tables, if present
            pieces.extend(
                cell.text
                for table in doc.tables
                for row in table.rows
                for cell in row.cells
            )

            return "\n".join(pieces).strip(), "docx", None
        except Exception as e:
            return "", "docx", e

    @staticmethod
    def parse_txt(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from plain text file.

        The file is read as UTF-8 first; if that raises UnicodeDecodeError it
        is re-read as latin-1.

        Args:
            file_path: Path to the text file

        Returns:
            Tuple of (text, "txt", error)
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as txt_file:
                text = txt_file.read()
            return text.strip(), "txt", None
        except UnicodeDecodeError:
            # Try with different encoding
            try:
                with open(file_path, 'r', encoding='latin-1') as txt_file:
                    text = txt_file.read()
                return text.strip(), "txt", None
            except Exception as e:
                return "", "txt", e
        except Exception as e:
            return "", "txt", e

    @staticmethod
    def parse_raw_text(text: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Process raw text input.

        Args:
            text: Text supplied directly rather than read from a file

        Returns:
            Tuple of (text, "raw", error). Input that is empty after stripping
            yields a ValueError in the error slot; any exception raised while
            stripping (e.g. a non-string argument) is returned the same way.
        """
        try:
            cleaned_text = text.strip()
            if not cleaned_text:
                return "", "raw", ValueError("Empty text provided")
            return cleaned_text, "raw", None
        except Exception as e:
            return "", "raw", e


class TextCleaner:
    """Clean and normalize extracted text"""

    @staticmethod
    def clean(text: str) -> str:
        """
        Clean and normalize text.

        Every run of whitespace, line breaks included, is collapsed into a
        single space, and leading/trailing whitespace is removed.

        Args:
            text: Text to clean

        Returns:
            The text as a single line of space-separated tokens
        """
        # str.split() with no argument splits on all whitespace, including
        # "\r" and "\n", so no separate line-break normalization is needed.
        return ' '.join(text.split())

    @staticmethod
    def get_text_stats(text: str) -> dict:
        """
        Get statistics about text.

        Words are whitespace-separated tokens; sentences are the non-blank
        pieces obtained by splitting on '.'.

        Args:
            text: Text to analyse

        Returns:
            dict with the keys character_count, word_count, sentence_count,
            average_word_length (mean characters per word),
            average_sentence_length (mean words per sentence), emoji_count
            (code points matched by _EMOJI_RE), em_dash_count and arrow_count.
            Averages are 0 when there are no words / no sentences.
        """
        words = text.split()
        word_count = len(words)

        # Blank fragments (e.g. the one after a trailing period) are not
        # sentences, so they count neither toward the total nor the average.
        sentence_count = sum(1 for s in text.split('.') if s.strip())

        # Average over the words themselves so that the whitespace between
        # them does not inflate the result.
        average_word_length = (
            sum(len(word) for word in words) / word_count if word_count else 0
        )
        average_sentence_length = (
            word_count / sentence_count if sentence_count else 0
        )

        return {
            "character_count": len(text),
            "word_count": word_count,
            "sentence_count": sentence_count,
            "average_word_length": average_word_length,
            "average_sentence_length": average_sentence_length,
            "emoji_count": sum(len(m) for m in _EMOJI_RE.findall(text)),
            "em_dash_count": text.count('\u2014'),
            "arrow_count": text.count('\u2192'),
        }