# TranscriptWriting/extractors.py
from docx import Document
import pdfplumber
import re
from typing import Tuple
import os


def extract_docx(file_obj) -> str:
"""
Extract text from DOCX with enhanced error handling and formatting preservation
"""
try:
doc = Document(file_obj)
# Extract paragraphs with better handling
paragraphs = []
for para in doc.paragraphs:
text = para.text.strip()
if text: # Only include non-empty paragraphs
paragraphs.append(text)
# Also extract text from tables
for table in doc.tables:
for row in table.rows:
row_text = []
for cell in row.cells:
cell_text = cell.text.strip()
if cell_text:
row_text.append(cell_text)
if row_text:
paragraphs.append(" | ".join(row_text))
extracted_text = "\n\n".join(paragraphs)
# Clean up common issues
extracted_text = clean_extracted_text(extracted_text)
return extracted_text
except Exception as e:
error_msg = f"[DOCX Extraction Error] {str(e)}"
print(error_msg)
return f"Error extracting DOCX: {str(e)}"
def extract_pdf(file_obj) -> str:
"""
Extract text from PDF with multiple strategies and enhanced error handling
"""
try:
extracted_pages = []
with pdfplumber.open(file_obj) as pdf:
# Track extraction success
successful_pages = 0
total_pages = len(pdf.pages)
for page_num, page in enumerate(pdf.pages, 1):
try:
# Strategy 1: Standard text extraction
page_text = page.extract_text()
# Strategy 2: If standard fails, try with layout
if not page_text or len(page_text.strip()) < 50:
page_text = page.extract_text(layout=True)
# Strategy 3: If still poor, try with custom settings
if not page_text or len(page_text.strip()) < 50:
page_text = page.extract_text(
x_tolerance=2,
y_tolerance=2
)
if page_text and page_text.strip():
# Clean and add page marker
clean_text = page_text.strip()
extracted_pages.append(f"--- Page {page_num} ---\n{clean_text}")
successful_pages += 1
else:
print(f"[PDF Warning] Page {page_num} yielded no text")
except Exception as page_error:
print(f"[PDF Warning] Error on page {page_num}: {page_error}")
continue
if successful_pages == 0:
return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."
if successful_pages < total_pages * 0.5:
print(f"[PDF Warning] Only {successful_pages}/{total_pages} pages extracted successfully")
full_text = "\n\n".join(extracted_pages)
# Clean up the extracted text
full_text = clean_extracted_text(full_text)
return full_text
except Exception as e:
error_msg = f"[PDF Extraction Error] {str(e)}"
print(error_msg)
return f"Error extracting PDF: {str(e)}"
def clean_extracted_text(text: str) -> str:
"""
Clean up common issues in extracted text
"""
# Remove excessive whitespace
text = re.sub(r'\n{3,}', '\n\n', text)
text = re.sub(r' {2,}', ' ', text)
# Remove page numbers that appear alone on lines
text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
# Remove common headers/footers patterns
text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)
    # Fix common OCR issues: normalize curly quotes and dashes
    text = text.replace('\u2019', "'")  # Right single (curly) apostrophe
    text = text.replace('\u2018', "'")  # Left single (curly) apostrophe
    text = text.replace('\u201c', '"')  # Left curly double quote
    text = text.replace('\u201d', '"')  # Right curly double quote
    text = text.replace('\u2013', '-')  # En dash
    text = text.replace('\u2014', '-')  # Em dash

    # Remove zero-width characters
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    return text.strip()

def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
"""
Validate extracted text quality
"""
# Check if text is empty
if not text or not text.strip():
return False, "No text extracted"
# Check for minimum length
if len(text) < 100:
return False, f"Extracted text too short ({len(text)} characters)"
# Check for error messages
if text.startswith("Error") or text.startswith("["):
return False, "Extraction error detected"
# Check for gibberish (too many non-alphanumeric characters)
#alphanumeric = sum(c.isalnum() or c.isspace() for c in text)
#ratio = alphanumeric / len(text) if text else 0
#if ratio < 0.2:
# return False, f"Text appears garbled (only {ratio*100:.1f}% readable)"
# Check word count
words = text.split()
if len(words) < 50:
return False, f"Too few words ({len(words)})"
# Check for reasonable word lengths (catch binary junk)
#avg_word_length = sum(len(w) for w in words) / len(words) if words else 0
#if avg_word_length < 2 or avg_word_length > 20:
# return False, f"Unusual average word length ({avg_word_length:.1f})"
# All checks passed
return True, f"Valid extraction: {len(words)} words, {len(text)} characters"
def detect_file_encoding(file_path: str) -> str:
"""
Detect file encoding for text files
"""
try:
import chardet
with open(file_path, 'rb') as f:
raw_data = f.read()
result = chardet.detect(raw_data)
return result['encoding']
except:
return 'utf-8' # Default fallback
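
# A small convenience helper sketched here for illustration (not part of the
# original upload): it shows how detect_file_encoding might be paired with a
# plain read when the caller has a file path rather than an open file object.
# The helper name and the errors='replace' policy are assumptions.
def read_text_with_detected_encoding(file_path: str) -> str:
    """
    Read a text file using the encoding reported by detect_file_encoding,
    falling back to UTF-8 when detection returns nothing.
    """
    encoding = detect_file_encoding(file_path) or 'utf-8'
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        return f.read()
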
def extract_text_file(file_obj) -> str:
"""
Extract from plain text file with encoding detection
"""
try:
# Try UTF-8 first
try:
return file_obj.read().decode('utf-8')
except UnicodeDecodeError:
# Try other common encodings
file_obj.seek(0)
try:
return file_obj.read().decode('latin-1')
except:
file_obj.seek(0)
return file_obj.read().decode('cp1252')
except Exception as e:
return f"Error reading text file: {str(e)}"