File size: 7,157 Bytes
54c99ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
from docx import Document
import pdfplumber
import re
from typing import Tuple
import os

def extract_docx(file_obj) -> str:
    """
    Pull readable text out of a DOCX file.

    Collects every non-empty paragraph, then flattens each table row into a
    single " | "-separated line, joins everything with blank lines, and runs
    the result through clean_extracted_text(). On any failure the exception
    is printed and an error string is returned instead of raising.
    """
    try:
        document = Document(file_obj)

        # Non-empty paragraph bodies first.
        chunks = [p.text.strip() for p in document.paragraphs if p.text.strip()]

        # Then table contents: one line per row, cells joined with " | ".
        for tbl in document.tables:
            for tbl_row in tbl.rows:
                cells = [c.text.strip() for c in tbl_row.cells if c.text.strip()]
                if cells:
                    chunks.append(" | ".join(cells))

        # Normalize whitespace / punctuation artifacts before returning.
        return clean_extracted_text("\n\n".join(chunks))

    except Exception as e:
        print(f"[DOCX Extraction Error] {str(e)}")
        return f"Error extracting DOCX: {str(e)}"


def extract_pdf(file_obj) -> str:
    """
    Pull readable text out of a PDF via pdfplumber.

    Each page is tried with three extraction strategies in order: the default
    extractor, layout-aware extraction, and extraction with tightened
    x/y tolerances. A page counts as successful once any strategy yields
    non-blank text. Pages are joined with "--- Page N ---" markers and the
    whole result is passed through clean_extracted_text(). Per-page failures
    are logged and skipped; a total failure (or a top-level exception)
    produces an error string rather than an exception.
    """
    try:
        pages_out = []

        with pdfplumber.open(file_obj) as pdf:
            ok_count = 0
            page_count = len(pdf.pages)

            for idx, pg in enumerate(pdf.pages, 1):
                try:
                    # First attempt: plain extraction.
                    txt = pg.extract_text()

                    # Second attempt: layout-aware, when the first produced
                    # nothing or suspiciously little (< 50 chars).
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(layout=True)

                    # Third attempt: tighter character-grouping tolerances.
                    if not txt or len(txt.strip()) < 50:
                        txt = pg.extract_text(x_tolerance=2, y_tolerance=2)

                    if txt and txt.strip():
                        pages_out.append(f"--- Page {idx} ---\n{txt.strip()}")
                        ok_count += 1
                    else:
                        print(f"[PDF Warning] Page {idx} yielded no text")

                except Exception as page_error:
                    # One bad page should not abort the whole document.
                    print(f"[PDF Warning] Error on page {idx}: {page_error}")
                    continue

            if ok_count == 0:
                return "[PDF Error] No text could be extracted from any page. The PDF may be image-based or corrupted."

            # Warn when fewer than half the pages produced text.
            if ok_count < page_count * 0.5:
                print(f"[PDF Warning] Only {ok_count}/{page_count} pages extracted successfully")

            return clean_extracted_text("\n\n".join(pages_out))

    except Exception as e:
        print(f"[PDF Extraction Error] {str(e)}")
        return f"Error extracting PDF: {str(e)}"


def clean_extracted_text(text: str) -> str:
    """
    Normalize text extracted from a document.

    Collapses excessive whitespace, strips lone page numbers and common
    header/footer lines, converts typographic punctuation (curly quotes,
    en/em dashes) to ASCII, and removes zero-width characters.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # Collapse runs of 3+ newlines to a single paragraph break, and runs
    # of spaces to a single space.
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r' {2,}', ' ', text)

    # Remove lines that are nothing but a page number.
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # Remove common header/footer patterns ("Page 3 of 10", "3/10").
    text = re.sub(r'^\s*Page \d+ of \d+\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+/\d+\s*$', '', text, flags=re.MULTILINE)

    # Normalize typographic punctuation to plain ASCII.
    # BUG FIX: the original source had lost the curly-quote characters, so it
    # called text.replace('', "'"), which inserts an apostrophe between EVERY
    # character of the text. Explicit Unicode escapes make the intent
    # unambiguous and encoding-proof; one translate() pass replaces six
    # chained .replace() calls.
    text = text.translate(str.maketrans({
        '\u2018': "'",   # left single (curly) quote
        '\u2019': "'",   # right single (curly) quote / apostrophe
        '\u201c': '"',   # left double (curly) quote
        '\u201d': '"',   # right double (curly) quote
        '\u2013': '-',   # en dash
        '\u2014': '-',   # em dash
    }))

    # Remove zero-width characters and BOMs.
    text = re.sub(r'[\u200b\u200c\u200d\ufeff]', '', text)

    return text.strip()


def validate_extraction(text: str, filename: str) -> Tuple[bool, str]:
    """
    Sanity-check extracted text and report why it passed or failed.

    Runs guard checks in order — non-empty, at least 100 characters, no
    leading error marker ("Error" or "["), at least 50 words — and returns
    a (passed, reason) pair. `filename` is accepted for caller convenience
    but is not currently used in any check.
    """
    # Empty or whitespace-only extraction.
    if not text or not text.strip():
        return False, "No text extracted"

    # Implausibly short output.
    char_count = len(text)
    if char_count < 100:
        return False, f"Extracted text too short ({char_count} characters)"

    # The extractors signal failure with strings starting "Error" or "[".
    if text.startswith(("Error", "[")):
        return False, "Extraction error detected"

    # Enough words to be a real document.
    words = text.split()
    word_count = len(words)
    if word_count < 50:
        return False, f"Too few words ({word_count})"

    # All checks passed.
    return True, f"Valid extraction: {word_count} words, {char_count} characters"


def detect_file_encoding(file_path: str) -> str:
    """
    Best-effort encoding detection for a text file.

    Uses chardet when it is installed; falls back to 'utf-8' when chardet is
    missing, the file cannot be read, or detection is inconclusive.

    Args:
        file_path: Path of the file to probe.

    Returns:
        An encoding name; never None ('utf-8' is the fallback).
    """
    try:
        import chardet  # optional third-party dependency; imported lazily
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        # BUG FIX: chardet reports encoding=None when it cannot decide; the
        # original returned that None despite the `-> str` annotation.
        detected = result.get('encoding')
        return detected if detected else 'utf-8'
    except (ImportError, OSError):
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Catch only the expected failures:
        # chardet missing, or the file unreadable.
        return 'utf-8'  # Default fallback


def extract_text_file(file_obj) -> str:
    """
    Decode a binary file object into text.

    Tries UTF-8 first, then cp1252, and finally latin-1. Since latin-1 maps
    all 256 byte values, the final fallback always succeeds, so a result is
    guaranteed for any byte content. Any other failure (e.g. a read error)
    is reported as an error string rather than raised.

    Args:
        file_obj: A binary file-like object supporting read().

    Returns:
        The decoded text, or an "Error reading text file: ..." message.
    """
    try:
        # Read once instead of seek(0)+re-read per attempt; this also works
        # for non-seekable streams.
        raw = file_obj.read()

        # BUG FIX: the original tried latin-1 *before* cp1252, but latin-1
        # decodes every possible byte sequence, so the cp1252 branch was
        # unreachable dead code (and the bare `except` around it hid that).
        # Try the stricter codecs first.
        for encoding in ('utf-8', 'cp1252'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue

        # latin-1 accepts any byte sequence — guaranteed to succeed.
        return raw.decode('latin-1')
    except Exception as e:
        return f"Error reading text file: {str(e)}"