File size: 5,568 Bytes
48c7fed | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | import os
import re
from typing import Tuple, Optional
from pathlib import Path
from pypdf import PdfReader
from docx import Document
# Compiled pattern matching runs of one or more consecutive code points taken
# from the Unicode ranges listed below (one character class, `+` quantifier).
# Each match is a whole run, so callers that want a per-character count must
# sum the match lengths rather than count the matches.
# NOTE: only the ranges listed here are matched; symbols encoded elsewhere
# (for example U+2600-U+27BF) are not covered by this pattern.
_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F"  # U+1F600-U+1F64F: emoticons
    "\U0001F300-\U0001F5FF"  # U+1F300-U+1F5FF: misc symbols & pictographs
    "\U0001F680-\U0001F6FF"  # U+1F680-U+1F6FF: transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # U+1F1E0-U+1F1FF: regional indicator letters (flag pairs)
    "\U0001F900-\U0001F9FF"  # U+1F900-U+1F9FF: supplemental symbols & pictographs
    "\U0001FA00-\U0001FA6F"  # U+1FA00-U+1FA6F: chess / extended pictographic
    "\U0001FA70-\U0001FAFF"  # U+1FA70-U+1FAFF: symbols & pictographs extended-A
    "]+",
    flags=re.UNICODE,
)
class FileParser:
    """
    Parse multiple file formats and extract text.

    Supports: PDF, DOCX, TXT, and raw text input.

    Every parser is a static method that returns a ``(text, format, error)``
    tuple instead of raising: on success ``error`` is ``None``; on failure
    ``text`` is ``""`` and ``error`` holds the exception that occurred.
    """

    # Lower-cased file suffixes accepted by parse_file.
    SUPPORTED_FORMATS = {".pdf", ".docx", ".doc", ".txt"}

    @staticmethod
    def parse_file(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Parse a file and extract text.

        The parser is chosen from the file's suffix (case-insensitive).

        Args:
            file_path: Path to the file

        Returns:
            Tuple of (text, format, error)
            - text: Extracted text content
            - format: File format (pdf, docx, txt); "" when the suffix is
              not supported
            - error: Exception if parsing failed, None if successful
        """
        file_extension = Path(file_path).suffix.lower()
        if file_extension not in FileParser.SUPPORTED_FORMATS:
            error = ValueError(f"Unsupported file format: {file_extension}")
            return "", "", error

        # NOTE(review): ".doc" is routed to the DOCX parser; legacy binary
        # .doc files are presumably rejected by the docx library and come
        # back as an error tuple - confirm this is the intended behaviour.
        parsers = {
            ".pdf": FileParser.parse_pdf,
            ".docx": FileParser.parse_docx,
            ".doc": FileParser.parse_docx,
            ".txt": FileParser.parse_txt,
        }
        parser = parsers.get(file_extension)
        if parser is None:
            # Only reachable when SUPPORTED_FORMATS has been extended with a
            # suffix that has no parser registered above.
            return "", "", ValueError("Unknown error")
        return parser(file_path)

    @staticmethod
    def parse_pdf(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from PDF file.

        The text of every page is joined with newlines and the result is
        stripped of leading/trailing whitespace.

        Args:
            file_path: Path to the PDF file

        Returns:
            Tuple of (text, "pdf", error); any exception raised while
            opening or reading the file is returned as ``error``.
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                pdf_reader = PdfReader(pdf_file)
                # Extract inside the `with` so the file is still open while
                # the pages are being read.
                page_texts = [page.extract_text() for page in pdf_reader.pages]
            return "\n".join(page_texts).strip(), "pdf", None
        except Exception as e:
            return "", "pdf", e

    @staticmethod
    def parse_docx(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from DOCX file.

        Paragraph texts come first, followed by the text of every table
        cell (table by table, row by row), one entry per line.

        Args:
            file_path: Path to the DOCX file

        Returns:
            Tuple of (text, "docx", error); any exception raised while
            opening or reading the document is returned as ``error``.
        """
        try:
            doc = Document(file_path)
            # Extract text from all paragraphs
            parts = [paragraph.text for paragraph in doc.paragraphs]
            # Also extract text from tables if present
            for table in doc.tables:
                for row in table.rows:
                    parts.extend(cell.text for cell in row.cells)
            return "\n".join(parts).strip(), "docx", None
        except Exception as e:
            return "", "docx", e

    @staticmethod
    def parse_txt(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from plain text file.

        The file is decoded as UTF-8 first; if that fails with a decode
        error it is re-read as Latin-1. The result is stripped of
        leading/trailing whitespace.

        Args:
            file_path: Path to the text file

        Returns:
            Tuple of (text, "txt", error); any exception other than the
            initial decode failure is returned as ``error``.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 as well, but also drops a
            # leading byte-order mark, which str.strip() would not remove.
            with open(file_path, 'r', encoding='utf-8-sig') as txt_file:
                text = txt_file.read()
        except UnicodeDecodeError:
            # Try with different encoding
            try:
                with open(file_path, 'r', encoding='latin-1') as txt_file:
                    text = txt_file.read()
            except Exception as e:
                return "", "txt", e
        except Exception as e:
            return "", "txt", e
        return text.strip(), "txt", None

    @staticmethod
    def parse_raw_text(text: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Process raw text input.

        Args:
            text: Text supplied directly rather than read from a file

        Returns:
            Tuple of (text, "raw", error). The text is stripped of
            leading/trailing whitespace; if nothing remains, ``error`` is a
            ValueError. Any exception raised while stripping (for example
            when ``text`` is not a string) is returned as ``error``.
        """
        try:
            cleaned_text = text.strip()
        except Exception as e:
            return "", "raw", e
        if not cleaned_text:
            return "", "raw", ValueError("Empty text provided")
        return cleaned_text, "raw", None
class TextCleaner:
    """Clean and normalize extracted text"""

    @staticmethod
    def clean(text: str) -> str:
        """
        Clean and normalize text.

        Collapses every run of whitespace - spaces, tabs and line breaks
        (``\\r\\n``, ``\\r``, ``\\n``) alike - into a single space and removes
        leading/trailing whitespace, so the result is always a single line.

        Args:
            text: Text to clean

        Returns:
            The whitespace-normalized text
        """
        # str.split() with no argument splits on all whitespace, including
        # line breaks, so no separate line-break normalization is needed.
        return ' '.join(text.split())

    @staticmethod
    def get_text_stats(text: str) -> dict:
        """
        Get statistics about text.

        Words are whitespace-separated tokens (attached punctuation is part
        of the token); sentences are the non-blank fragments obtained by
        splitting on ``.``.

        Args:
            text: Text to analyse

        Returns:
            Dict with the keys:
            - character_count: length of ``text``
            - word_count: number of words
            - sentence_count: number of non-blank sentences
            - average_word_length: mean characters per word (0 if no words)
            - average_sentence_length: mean words per sentence
              (0 if no sentences)
            - emoji_count: number of characters matched by ``_EMOJI_RE``
            - em_dash_count: occurrences of U+2014
            - arrow_count: occurrences of U+2192
        """
        words = text.split()
        # Drop blank fragments (e.g. the one after a trailing '.') so the
        # count and the average below use the same set of sentences.
        sentences = [s for s in text.split('.') if s.strip()]
        word_count = len(words)
        sentence_count = len(sentences)
        return {
            "character_count": len(text),
            "word_count": word_count,
            "sentence_count": sentence_count,
            # Average over the words themselves, not over the whole text,
            # so whitespace between words does not inflate the result.
            "average_word_length": (
                sum(len(word) for word in words) / word_count if words else 0
            ),
            "average_sentence_length": (
                word_count / sentence_count if sentence_count else 0
            ),
            # Each match is a run of emoji, so sum the run lengths.
            "emoji_count": sum(len(m) for m in _EMOJI_RE.findall(text)),
            "em_dash_count": text.count('\u2014'),
            "arrow_count": text.count('\u2192'),
        }
|