Spaces:
Sleeping
Sleeping
File size: 7,157 Bytes
54c99ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
from docx import Document
import pdfplumber
import re
from typing import Tuple
import os
def extract_docx(file_obj) -> str:
    """
    Extract text from a DOCX file, including paragraph and table content.

    Returns the cleaned text, or a string starting with
    "Error extracting DOCX:" on failure.
    """
    try:
        document = Document(file_obj)
        # Non-empty paragraphs first, in document order.
        parts = [p.text.strip() for p in document.paragraphs if p.text.strip()]
        # Tables are flattened row-by-row; cells joined with " | ".
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                cells = [c.text.strip() for c in tbl_row.cells if c.text.strip()]
                if cells:
                    parts.append(" | ".join(cells))
        # Normalize whitespace / punctuation artifacts before returning.
        return clean_extracted_text("\n\n".join(parts))
    except Exception as e:
        print(f"[DOCX Extraction Error] {str(e)}")
        return f"Error extracting DOCX: {str(e)}"
def extract_pdf(file_obj) -> str:
    """
    Extract text from a PDF via pdfplumber, trying several extraction
    strategies per page.

    Returns the cleaned text, a "[PDF Error] ..." message when no page
    yielded text, or "Error extracting PDF: ..." if opening/reading fails.
    """
    try:
        pages_out = []
        with pdfplumber.open(file_obj) as pdf:
            ok_count = 0
            page_total = len(pdf.pages)
            for idx, pg in enumerate(pdf.pages, 1):
                try:
                    # Fallback chain: plain -> layout-aware -> tight tolerances.
                    txt = pg.extract_text()
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(layout=True)
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(x_tolerance=2, y_tolerance=2)
                    if txt and txt.strip():
                        # Page markers keep provenance visible downstream.
                        pages_out.append(f"--- Page {idx} ---\n{txt.strip()}")
                        ok_count += 1
                    else:
                        print(f"[PDF Warning] Page {idx} yielded no text")
                except Exception as page_error:
                    # One bad page must not abort the whole document.
                    print(f"[PDF Warning] Error on page {idx}: {page_error}")
                    continue
            if ok_count == 0:
                return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."
            if ok_count < page_total * 0.5:
                print(f"[PDF Warning] Only {ok_count}/{page_total} pages extracted successfully")
        return clean_extracted_text("\n\n".join(pages_out))
    except Exception as e:
        print(f"[PDF Extraction Error] {str(e)}")
        return f"Error extracting PDF: {str(e)}"
def clean_extracted_text(text: str) -> str:
    """
    Normalize text extracted from a DOCX/PDF.

    Collapses whitespace runs, strips lone page-number and header/footer
    lines, converts typographic punctuation to ASCII, and removes
    zero-width characters.  Returns the cleaned, stripped text.
    """
    # Collapse runs of blank lines and repeated spaces.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)
    # Drop lines containing only a page number.
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Drop common header/footer patterns ("Page 3 of 10", "3/10").
    text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)
    # Normalize typographic punctuation to ASCII equivalents.
    # BUG FIX: the previous code called text.replace('', "'") with an EMPTY
    # search string (the curly-quote literals had been lost), which inserts
    # an apostrophe between every character; the curly double-quote lines
    # were likewise degraded to no-op straight-quote replaces.  Explicit
    # Unicode escapes make the intent unambiguous.
    text = text.replace('\u2018', "'")   # left single quote
    text = text.replace('\u2019', "'")   # right single quote / apostrophe
    text = text.replace('\u201c', '"')   # left double quote
    text = text.replace('\u201d', '"')   # right double quote
    text = text.replace('\u2013', '-')   # en dash
    text = text.replace('\u2014', '-')   # em dash
    # Remove zero-width spaces/joiners and BOM characters.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
    return text.strip()
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Sanity-check extracted text quality.

    Args:
        text: The extracted text to validate.
        filename: Source file name (currently unused; kept for interface
            stability and future per-format checks).

    Returns:
        (ok, message) — ok is True when the text passes all checks;
        message explains the failure or summarizes the valid extraction.
    """
    # Nothing extracted at all.
    if not text or not text.strip():
        return False, "No text extracted"
    # Too short to be a real document.
    if len(text) < 100:
        return False, f"Extracted text too short ({len(text)} characters)"
    # Extractors in this module signal failure with "Error ..." or
    # "[PDF Error] ..." strings rather than raising.
    if text.startswith("Error") or text.startswith("["):
        return False, "Extraction error detected"
    # Require a minimum word count.
    words = text.split()
    if len(words) < 50:
        return False, f"Too few words ({len(words)})"
    # All checks passed.
    return True, f"Valid extraction: {len(words)} words, {len(text)} characters"
def detect_file_encoding(file_path: str) -> str:
    """
    Best-effort encoding detection for a file on disk.

    Uses chardet when it is installed; falls back to 'utf-8' when chardet
    is missing, the file cannot be read, or detection is inconclusive.

    Args:
        file_path: Path to the file to probe.

    Returns:
        An encoding name, never None (the previous version could return
        None because chardet reports {'encoding': None} for undecidable
        input, violating the declared -> str contract).
    """
    try:
        import chardet  # optional third-party dependency; absence is handled
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        detected = chardet.detect(raw_data).get('encoding')
        return detected if detected else 'utf-8'
    except Exception:
        # Narrowed from a bare except; still a deliberate best-effort
        # fallback rather than a crash.
        return 'utf-8'
def extract_text_file(file_obj) -> str:
    """
    Read a binary-mode text file object and decode it.

    Tries UTF-8, then cp1252, then latin-1.  BUG FIX: the original tried
    latin-1 before cp1252, but latin-1 maps every byte, so its decode can
    never fail and the cp1252 branch was unreachable dead code.  cp1252 is
    the more likely encoding for Windows text (it assigns printable
    characters to 0x80-0x9F), so it is now tried first; latin-1 remains as
    the never-failing last resort, preserving the "always returns a str"
    contract.

    Returns:
        The decoded text, or "Error reading text file: ..." on failure.
    """
    try:
        # Read once from the current position; decode attempts then work
        # on the same bytes without re-reading or seeking.
        data = file_obj.read()
        for encoding in ('utf-8', 'cp1252', 'latin-1'):
            try:
                return data.decode(encoding)
            except UnicodeDecodeError:
                continue
    except Exception as e:
        return f"Error reading text file: {str(e)}"