Spaces:
Sleeping
Sleeping
File size: 7,157 Bytes
54c99ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
from docx import Document
import pdfplumber
import re
from typing import Tuple
import os
def extract_docx(file_obj) -> str:
    """
    Extract text from a DOCX file, including paragraph and table content.

    Returns the cleaned text, or a string starting with
    "Error extracting DOCX:" on failure.
    """
    try:
        document = Document(file_obj)
        # Non-empty paragraphs first, in document order.
        parts = [p.text.strip() for p in document.paragraphs if p.text.strip()]
        # Tables are flattened row-by-row; cells joined with " | ".
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                cells = [c.text.strip() for c in tbl_row.cells if c.text.strip()]
                if cells:
                    parts.append(" | ".join(cells))
        # Normalize whitespace / punctuation artifacts before returning.
        return clean_extracted_text("\n\n".join(parts))
    except Exception as e:
        print(f"[DOCX Extraction Error] {str(e)}")
        return f"Error extracting DOCX: {str(e)}"
def extract_pdf(file_obj) -> str:
    """
    Extract text from a PDF via pdfplumber, trying several extraction
    strategies per page.

    Returns the cleaned text, a "[PDF Error] ..." message when no page
    yielded text, or "Error extracting PDF: ..." if opening/reading fails.
    """
    try:
        pages_out = []
        with pdfplumber.open(file_obj) as pdf:
            ok_count = 0
            page_total = len(pdf.pages)
            for idx, pg in enumerate(pdf.pages, 1):
                try:
                    # Fallback chain: plain -> layout-aware -> tight tolerances.
                    txt = pg.extract_text()
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(layout=True)
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(x_tolerance=2, y_tolerance=2)
                    if txt and txt.strip():
                        # Page markers keep provenance visible downstream.
                        pages_out.append(f"--- Page {idx} ---\n{txt.strip()}")
                        ok_count += 1
                    else:
                        print(f"[PDF Warning] Page {idx} yielded no text")
                except Exception as page_error:
                    # One bad page must not abort the whole document.
                    print(f"[PDF Warning] Error on page {idx}: {page_error}")
                    continue
            if ok_count == 0:
                return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."
            if ok_count < page_total * 0.5:
                print(f"[PDF Warning] Only {ok_count}/{page_total} pages extracted successfully")
        return clean_extracted_text("\n\n".join(pages_out))
    except Exception as e:
        print(f"[PDF Extraction Error] {str(e)}")
        return f"Error extracting PDF: {str(e)}"
def clean_extracted_text(text: str) -> str:
    """
    Normalize text extracted from a DOCX/PDF.

    Collapses whitespace runs, strips lone page-number and header/footer
    lines, converts typographic punctuation to ASCII, and removes
    zero-width characters.  Returns the cleaned, stripped text.
    """
    # Collapse runs of blank lines and repeated spaces.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)
    # Drop lines containing only a page number.
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)
    # Drop common header/footer patterns ("Page 3 of 10", "3/10").
    text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)
    # Normalize typographic punctuation to ASCII equivalents.
    # BUG FIX: the previous code called text.replace('', "'") with an EMPTY
    # search string (the curly-quote literals had been lost), which inserts
    # an apostrophe between every character; the curly double-quote lines
    # were likewise degraded to no-op straight-quote replaces.  Explicit
    # Unicode escapes make the intent unambiguous.
    text = text.replace('\u2018', "'")   # left single quote
    text = text.replace('\u2019', "'")   # right single quote / apostrophe
    text = text.replace('\u201c', '"')   # left double quote
    text = text.replace('\u201d', '"')   # right double quote
    text = text.replace('\u2013', '-')   # en dash
    text = text.replace('\u2014', '-')   # em dash
    # Remove zero-width spaces/joiners and BOM characters.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)
    return text.strip()
def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Sanity-check extracted text quality.

    Args:
        text: The extracted text to validate.
        filename: Source file name (currently unused; kept for interface
            stability and future per-format checks).

    Returns:
        (ok, message) — ok is True when the text passes all checks;
        message explains the failure or summarizes the valid extraction.
    """
    # Nothing extracted at all.
    if not text or not text.strip():
        return False, "No text extracted"
    # Too short to be a real document.
    if len(text) < 100:
        return False, f"Extracted text too short ({len(text)} characters)"
    # Extractors in this module signal failure with "Error ..." or
    # "[PDF Error] ..." strings rather than raising.
    if text.startswith("Error") or text.startswith("["):
        return False, "Extraction error detected"
    # Require a minimum word count.
    words = text.split()
    if len(words) < 50:
        return False, f"Too few words ({len(words)})"
    # All checks passed.
    return True, f"Valid extraction: {len(words)} words, {len(text)} characters"
def detect_file_encoding(file_path: str) -> str:
    """
    Best-effort encoding detection for a file on disk.

    Uses chardet when it is installed; falls back to 'utf-8' when chardet
    is missing, the file cannot be read, or detection is inconclusive.

    Args:
        file_path: Path to the file to probe.

    Returns:
        An encoding name, never None (the previous version could return
        None because chardet reports {'encoding': None} for undecidable
        input, violating the declared -> str contract).
    """
    try:
        import chardet  # optional third-party dependency; absence is handled
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        detected = chardet.detect(raw_data).get('encoding')
        return detected if detected else 'utf-8'
    except Exception:
        # Narrowed from a bare except; still a deliberate best-effort
        # fallback rather than a crash.
        return 'utf-8'
def extract_text_file(file_obj) -> str:
    """
    Read a binary-mode text file object and decode it.

    Tries UTF-8, then cp1252, then latin-1.  BUG FIX: the original tried
    latin-1 before cp1252, but latin-1 maps every byte, so its decode can
    never fail and the cp1252 branch was unreachable dead code.  cp1252 is
    the more likely encoding for Windows text (it assigns printable
    characters to 0x80-0x9F), so it is now tried first; latin-1 remains as
    the never-failing last resort, preserving the "always returns a str"
    contract.

    Returns:
        The decoded text, or "Error reading text file: ..." on failure.
    """
    try:
        # Read once from the current position; decode attempts then work
        # on the same bytes without re-reading or seeking.
        data = file_obj.read()
        for encoding in ('utf-8', 'cp1252', 'latin-1'):
            try:
                return data.decode(encoding)
            except UnicodeDecodeError:
                continue
    except Exception as e:
        return f"Error reading text file: {str(e)}"