# Source repository: misakovhearst — "Initial deploy" (commit 48c7fed)
import os
import re
from typing import Tuple, Optional
from pathlib import Path
from pypdf import PdfReader
from docx import Document
_EMOJI_RE = re.compile(
"[\U0001F600-\U0001F64F" # emoticons (πŸ˜€ πŸ˜‚ πŸ₯Ή etc.)
"\U0001F300-\U0001F5FF" # misc symbols & pictographs (🌍 πŸŽ‰ πŸ”₯ etc.)
"\U0001F680-\U0001F6FF" # transport & map (πŸš€ ✈️ πŸš— etc.)
"\U0001F1E0-\U0001F1FF" # regional indicator letters (flags πŸ‡ΊπŸ‡Έ)
"\U0001F900-\U0001F9FF" # supplemental symbols (πŸ€” 🀣 🧠 etc.)
"\U0001FA00-\U0001FA6F" # chess / extended pictographic
"\U0001FA70-\U0001FAFF" # symbols & pictographs extended-A
"]+",
flags=re.UNICODE,
)
class FileParser:
    """
    Parse multiple file formats and extract text.

    Supports: PDF, DOCX, TXT, and raw text input.

    Every parser returns the same ``(text, format, error)`` tuple instead of
    raising, so callers can handle failures uniformly.
    """

    # Extensions accepted by parse_file(). ".doc" is routed to the DOCX
    # parser; python-docx cannot read legacy binary .doc files, so those
    # typically come back as an error tuple rather than extracted text.
    SUPPORTED_FORMATS = {".pdf", ".docx", ".doc", ".txt"}

    @staticmethod
    def parse_file(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Parse a file and extract text, dispatching on the file extension.

        Args:
            file_path: Path to the file

        Returns:
            Tuple of (text, format, error)
                - text: Extracted text content ("" on failure)
                - format: File format ("pdf", "docx", "txt"; "" if unsupported)
                - error: Exception if parsing failed, None if successful
        """
        file_extension = Path(file_path).suffix.lower()
        if file_extension not in FileParser.SUPPORTED_FORMATS:
            return "", "", ValueError(f"Unsupported file format: {file_extension}")
        if file_extension == ".pdf":
            return FileParser.parse_pdf(file_path)
        if file_extension in {".docx", ".doc"}:
            return FileParser.parse_docx(file_path)
        # Only ".txt" remains after the checks above.
        return FileParser.parse_txt(file_path)

    @staticmethod
    def parse_pdf(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """Extract text from a PDF file (all pages, newline-joined)."""
        try:
            with open(file_path, 'rb') as pdf_file:
                reader = PdfReader(pdf_file)
                # extract_text() can yield an empty/falsy result for
                # image-only pages; "or ''" keeps the join robust.
                text = "\n".join(page.extract_text() or "" for page in reader.pages)
            return text.strip(), "pdf", None
        except Exception as e:
            return "", "pdf", e

    @staticmethod
    def parse_docx(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """Extract text from a DOCX file: all paragraphs, then table cells."""
        try:
            doc = Document(file_path)
            parts = [paragraph.text for paragraph in doc.paragraphs]
            # Table text is not included in doc.paragraphs; append each
            # cell on its own line, matching the paragraph output.
            for table in doc.tables:
                for row in table.rows:
                    parts.extend(cell.text for cell in row.cells)
            return "\n".join(parts).strip(), "docx", None
        except Exception as e:
            return "", "docx", e

    @staticmethod
    def parse_txt(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """Extract text from a plain text file (UTF-8, latin-1 fallback)."""
        try:
            text = Path(file_path).read_text(encoding='utf-8')
        except UnicodeDecodeError:
            # latin-1 maps every byte to a codepoint, so this fallback can
            # never raise UnicodeDecodeError itself; other I/O errors are
            # still reported through the error slot.
            try:
                text = Path(file_path).read_text(encoding='latin-1')
            except Exception as e:
                return "", "txt", e
        except Exception as e:
            return "", "txt", e
        return text.strip(), "txt", None

    @staticmethod
    def parse_raw_text(text: str) -> Tuple[str, str, Optional[Exception]]:
        """Process raw text input; empty/whitespace-only input is an error."""
        cleaned_text = text.strip()
        if not cleaned_text:
            return "", "raw", ValueError("Empty text provided")
        return cleaned_text, "raw", None
class TextCleaner:
    """Clean and normalize extracted text."""

    @staticmethod
    def clean(text: str) -> str:
        """
        Collapse every run of whitespace (spaces, tabs, line breaks) into a
        single space and strip the ends.

        Note: str.split() with no argument already splits on \\r and \\n, so
        no separate line-break normalization is needed afterwards (the
        original follow-up replace() calls were dead code).
        """
        return ' '.join(text.split())

    @staticmethod
    def get_text_stats(text: str) -> dict:
        """
        Return simple descriptive statistics about *text*.

        Keys: character_count, word_count, sentence_count,
        average_word_length, average_sentence_length, emoji_count,
        em_dash_count, arrow_count.
        """
        words = text.split()
        # Naive sentence split on '.'; abbreviations ("e.g.") inflate the
        # count. Empty fragments (e.g. after a trailing '.') are dropped so
        # the averages below agree with sentence_count.
        sentences = [s for s in text.split('.') if s.strip()]
        # Length of the words themselves — len(text)/word_count would also
        # count spaces and punctuation between words.
        word_chars = sum(len(w) for w in words)
        return {
            "character_count": len(text),
            "word_count": len(words),
            "sentence_count": len(sentences),
            "average_word_length": word_chars / len(words) if words else 0,
            "average_sentence_length": len(words) / len(sentences) if sentences else 0,
            # Counts individual emoji codepoints, so a two-codepoint flag
            # sequence counts as 2.
            "emoji_count": sum(len(m) for m in _EMOJI_RE.findall(text)),
            "em_dash_count": text.count('\u2014'),
            "arrow_count": text.count('\u2192'),
        }