Spaces:
Sleeping
Sleeping
| from docx import Document | |
| import pdfplumber | |
| import re | |
| from typing import Tuple | |
| import os | |
def extract_docx(file_obj) -> str:
    """
    Extract text from a DOCX file object.

    Collects every non-empty paragraph in document order, then appends
    table content one line per row with cells joined by " | ", and
    finally normalizes the result via clean_extracted_text().

    Args:
        file_obj: A file-like object containing the DOCX data.

    Returns:
        The cleaned extracted text, or an "Error extracting DOCX: ..."
        message if parsing fails.
    """
    try:
        document = Document(file_obj)
        # Non-empty paragraph text, in document order.
        chunks = [p.text.strip() for p in document.paragraphs if p.text.strip()]
        # Table content: one line per row, cells separated by " | ".
        for tbl in document.tables:
            for row in tbl.rows:
                cells = [c.text.strip() for c in row.cells if c.text.strip()]
                if cells:
                    chunks.append(" | ".join(cells))
        # Normalize whitespace/punctuation before returning.
        return clean_extracted_text("\n\n".join(chunks))
    except Exception as e:
        print(f"[DOCX Extraction Error] {str(e)}")
        return f"Error extracting DOCX: {str(e)}"
def extract_pdf(file_obj) -> str:
    """
    Extract text from a PDF file object using pdfplumber.

    For each page, three strategies are attempted in order — default
    extraction, layout-aware extraction, then tighter x/y tolerances —
    retrying whenever the result has fewer than 50 stripped characters.
    Whatever the final attempt yields (even if short) is kept, prefixed
    with a "--- Page N ---" marker.

    Args:
        file_obj: A file-like object containing the PDF data.

    Returns:
        The cleaned text; a "[PDF Error] ..." message when no page
        produced any text; or an "Error extracting PDF: ..." message on
        a document-level failure.
    """
    try:
        pages_out = []
        with pdfplumber.open(file_obj) as pdf:
            ok_count = 0
            page_count = len(pdf.pages)
            for idx, pg in enumerate(pdf.pages, 1):
                try:
                    # Strategy 1: default extraction settings.
                    txt = pg.extract_text()
                    # Strategy 2: layout mode if the result looks thin.
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(layout=True)
                    # Strategy 3: tighter tolerances as a last resort.
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(x_tolerance=2, y_tolerance=2)
                    if txt and txt.strip():
                        pages_out.append(f"--- Page {idx} ---\n{txt.strip()}")
                        ok_count += 1
                    else:
                        print(f"[PDF Warning] Page {idx} yielded no text")
                except Exception as page_error:
                    # A bad page should not abort the whole document.
                    print(f"[PDF Warning] Error on page {idx}: {page_error}")
                    continue
        if ok_count == 0:
            return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."
        if ok_count < page_count * 0.5:
            print(f"[PDF Warning] Only {ok_count}/{page_count} pages extracted successfully")
        return clean_extracted_text("\n\n".join(pages_out))
    except Exception as e:
        print(f"[PDF Extraction Error] {str(e)}")
        return f"Error extracting PDF: {str(e)}"
def clean_extracted_text(text: str) -> str:
    """
    Normalize text produced by the DOCX/PDF extractors.

    Collapses runs of blank lines and spaces, strips lines that contain
    only page numbers or header/footer patterns, converts "smart"
    punctuation to ASCII, and removes zero-width characters.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Collapse excessive whitespace.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)
    # Remove page numbers that appear alone on lines.
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Remove common header/footer patterns.
    text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)
    # Map "smart" punctuation to ASCII in a single pass.
    # BUGFIX: the original literals were mojibake-damaged — two of the
    # replace() calls had an EMPTY source string, which inserts an
    # apostrophe between every character, and both dash replacements
    # shared the same garbled source so the em-dash one never fired.
    text = text.translate(str.maketrans({
        '\u2019': "'",   # right single (curly) apostrophe
        '\u2018': "'",   # left single quote
        '\u201c': '"',   # left double (curly) quote
        '\u201d': '"',   # right double quote
        '\u2013': '-',   # en dash
        '\u2014': '-',   # em dash
    }))
    # Remove zero-width characters and BOM.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
    return text.strip()
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Sanity-check the quality of extracted text.

    Args:
        text: Text returned by one of the extractors.
        filename: Name of the source file (unused here; kept for
            interface compatibility).

    Returns:
        (ok, message) — ok is True only when every check passes, and
        message describes the first failure or the final statistics.
    """
    # Empty or whitespace-only output is an immediate failure.
    if not text or not text.strip():
        return False, "No text extracted"
    # Enforce a minimum character count.
    if len(text) < 100:
        return False, f"Extracted text too short ({len(text)} characters)"
    # The extractors signal failure with "Error ..." / "[...]"-prefixed strings.
    if text.startswith(("Error", "[")):
        return False, "Extraction error detected"
    # Enforce a minimum word count.
    words = text.split()
    if len(words) < 50:
        return False, f"Too few words ({len(words)})"
    return True, f"Valid extraction: {len(words)} words, {len(text)} characters"
def detect_file_encoding(file_path: str) -> str:
    """
    Best-effort detection of a text file's encoding.

    Uses the third-party `chardet` package when available. Falls back
    to 'utf-8' when chardet is missing, the file cannot be read, or
    detection is inconclusive.

    Args:
        file_path: Path to the file to inspect.

    Returns:
        An encoding name such as 'utf-8' or 'ascii'; never None.
    """
    try:
        import chardet  # optional dependency; ImportError handled below
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        # chardet reports {'encoding': None} for empty/undetectable input;
        # fall back to utf-8 instead of returning None.
        return result['encoding'] or 'utf-8'
    except (ImportError, OSError, KeyError, TypeError):
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit and hid real bugs.
        return 'utf-8'  # Default fallback
def extract_text_file(file_obj) -> str:
    """
    Read a plain-text file-like object, trying several encodings.

    Decoding order: UTF-8, then Windows-1252, then Latin-1.  Latin-1
    goes last because it accepts every byte sequence and therefore
    always "succeeds"; the original code tried it before cp1252, which
    made the cp1252 branch unreachable dead code.

    The bytes are read once instead of seek(0)+re-read per attempt,
    which also works for non-seekable streams.

    Args:
        file_obj: A binary file-like object supporting .read().

    Returns:
        The decoded text, or an "Error reading text file: ..." message.
    """
    try:
        raw = file_obj.read()
        for encoding in ('utf-8', 'cp1252', 'latin-1'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        # Unreachable in practice: latin-1 maps every possible byte.
        return raw.decode('latin-1', errors='replace')
    except Exception as e:
        return f"Error reading text file: {str(e)}"