File size: 6,277 Bytes
807d482
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
"""
File Upload Handler
Supports: .txt, .csv, .md, .log, .text
Max size: 1GB
Auto encoding detection
"""

import codecs
import os
from typing import Tuple

import chardet

# Upper bound on an uploaded file's size, in bytes (1 GiB).
MAX_FILE_SIZE = 1024 ** 3

# Upper bound on the number of characters kept from one file (50 million).
MAX_CHARACTERS = 50 * 1_000_000

# File extensions (lower-case, leading dot included) that are accepted.
SUPPORTED_EXTENSIONS = {
    '.csv',
    '.log',
    '.md',
    '.srt',
    '.sub',
    '.text',
    '.txt',
}


class FileHandler:
    """Validate, decode and clean uploaded text files.

    Every method is a ``@staticmethod``; the class is used purely as a
    namespace. Size limits and the extension whitelist are read from the
    module-level ``MAX_FILE_SIZE``, ``MAX_CHARACTERS`` and
    ``SUPPORTED_EXTENSIONS`` constants.
    """

    @staticmethod
    def validate_file(filepath: str) -> Tuple[bool, str]:
        """Check that *filepath* exists, has a supported type and a usable size.

        Returns:
            ``(is_valid, message)``. ``message`` is a user-facing string that
            either explains the rejection or confirms the file is valid.
        """
        if not filepath or not os.path.exists(filepath):
            return False, "❌ File not found!"

        # The extension is compared case-insensitively.
        ext = os.path.splitext(filepath)[1].lower()
        if ext not in SUPPORTED_EXTENSIONS:
            # Sorted because set iteration order is not stable between runs,
            # which would make this message differ from process to process.
            supported = ', '.join(sorted(SUPPORTED_EXTENSIONS))
            return False, f"❌ Unsupported file type: {ext}\nSupported: {supported}"

        file_size = os.path.getsize(filepath)
        if file_size == 0:
            return False, "❌ File is empty!"

        if file_size > MAX_FILE_SIZE:
            gib = 1024 ** 3
            size_gb = file_size / gib
            # The displayed limit is derived from the constant so the two
            # cannot drift apart.
            max_gb = MAX_FILE_SIZE / gib
            return False, f"❌ File too large: {size_gb:.2f}GB (Max: {max_gb:g}GB)"

        return True, "✅ File valid"

    @staticmethod
    def _normalize_encoding(encoding, confidence) -> str:
        """Turn a raw detection result into an encoding name ``open()`` accepts.

        Args:
            encoding: Detected encoding name, or ``None`` when nothing was found.
            confidence: Detection confidence (0..1), or ``None``.

        Returns:
            ``'utf-8'`` when the guess is missing, has confidence below 0.5,
            names a codec Python does not provide, or is plain ASCII;
            otherwise *encoding* unchanged.
        """
        if not encoding or (confidence or 0) < 0.5:
            return 'utf-8'

        try:
            codec_name = codecs.lookup(encoding).name
        except LookupError:
            # The detector knows some encodings Python has no codec for.
            return 'utf-8'

        # Only a prefix of the file is sampled, so "ascii" merely means the
        # sample had no high bytes. UTF-8 decodes ASCII identically and keeps
        # any non-ASCII text that appears later instead of dropping it.
        if codec_name == 'ascii':
            return 'utf-8'

        return encoding

    @staticmethod
    def detect_encoding(filepath: str) -> str:
        """Guess the text encoding of *filepath* from its first 100KB.

        Best-effort: any failure (unreadable file, detector error) yields
        ``'utf-8'`` rather than an exception.
        """
        try:
            with open(filepath, 'rb') as f:
                # Read first 100KB for detection
                sample = f.read(102400)

            result = chardet.detect(sample)
            return FileHandler._normalize_encoding(
                result.get('encoding'), result.get('confidence')
            )
        except Exception:
            return 'utf-8'

    @staticmethod
    def _read_limited(filepath: str, encoding: str, limit: int) -> str:
        """Return at most *limit* decoded characters, skipping undecodable bytes."""
        with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
            return f.read(limit)

    @staticmethod
    def _format_count(count: int) -> str:
        """Render a character count compactly, e.g. ``1500`` -> ``'1.5K'``."""
        if count >= 1_000_000:
            return f"{count / 1_000_000:.1f}M"
        if count >= 1_000:
            return f"{count / 1_000:.1f}K"
        return str(count)

    @staticmethod
    def read_file(filepath: str) -> Tuple[str, str]:
        """Read, trim and clean the text of *filepath*.

        The file is validated first, decoded with the detected encoding
        (undecodable bytes are skipped), cut to ``MAX_CHARACTERS`` characters
        and passed through :meth:`clean_text`.

        Returns:
            ``(text_content, status_message)``. On failure ``text_content`` is
            ``""`` and ``status_message`` describes the error. The character
            count in the status is taken before cleaning.
        """
        is_valid, msg = FileHandler.validate_file(filepath)
        if not is_valid:
            return "", msg

        try:
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            encoding = FileHandler.detect_encoding(filepath)

            # Ask for one character beyond the limit: enough to tell whether
            # the file was truncated without loading all of it into memory.
            read_limit = MAX_CHARACTERS + 1
            try:
                text = FileHandler._read_limited(filepath, encoding, read_limit)
            except (LookupError, UnicodeError):
                # Retry as UTF-8 through the same trim/clean path.
                encoding = 'utf-8'
                text = FileHandler._read_limited(filepath, encoding, read_limit)

            trimmed = len(text) > MAX_CHARACTERS
            if trimmed:
                text = text[:MAX_CHARACTERS]
            char_count = len(text)

            text = FileHandler.clean_text(text)

            char_display = FileHandler._format_count(char_count)
            status = f"✅ Loaded: {size_mb:.1f}MB | {char_display} characters | Encoding: {encoding}"
            if trimmed:
                status += " | ⚠️ Trimmed to 50M characters"

            return text, status

        except MemoryError:
            # Must precede the generic handler: MemoryError is an Exception.
            return "", "❌ File too large for memory! Try a smaller file."

        except Exception as e:
            return "", f"❌ Error reading file: {str(e)}"

    @staticmethod
    def clean_text(text: str) -> str:
        """Normalize *text* for downstream use.

        Removes NUL characters, converts ``\\r\\n`` and ``\\r`` to ``\\n``,
        strips leading and trailing whitespace from every line, keeps at most
        two consecutive blank lines, and strips the result as a whole.
        Falsy input yields ``""``.
        """
        if not text:
            return ""

        # Remove null bytes, then normalize line endings.
        text = text.replace('\x00', '')
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        cleaned_lines = []
        blank_run = 0
        for line in text.split('\n'):
            stripped = line.strip()
            if stripped:
                blank_run = 0
                cleaned_lines.append(stripped)
                continue
            blank_run += 1
            if blank_run <= 2:  # Keep max 2 empty lines
                cleaned_lines.append('')

        return '\n'.join(cleaned_lines).strip()

    @staticmethod
    def get_file_info(filepath: str) -> dict:
        """Describe *filepath*.

        Returns:
            ``{"error": "File not found"}`` when the path is empty or missing;
            otherwise a dict with ``name``, ``size_bytes``, ``size_mb``,
            ``extension`` (as written in the path) and the detected
            ``encoding``.
        """
        if not filepath or not os.path.exists(filepath):
            return {"error": "File not found"}

        file_size = os.path.getsize(filepath)
        _, ext = os.path.splitext(filepath)

        return {
            "name": os.path.basename(filepath),
            "size_bytes": file_size,
            "size_mb": file_size / (1024 * 1024),
            "extension": ext,
            "encoding": FileHandler.detect_encoding(filepath),
        }


def process_uploaded_file(file) -> str:
    """Turn an uploaded file into text for the UI.

    Args:
        file: ``None``, an object exposing the path as ``.name``, or any
            other value whose ``str()`` is the path.

    Returns:
        ``""`` when *file* is ``None``; the cleaned file text on success;
        otherwise the status message from ``FileHandler.read_file``. The
        status is returned whenever no text was produced, so the caller
        receives the error message in place of content.
    """
    if file is None:
        return ""

    filepath = file.name if hasattr(file, 'name') else str(file)

    text, status = FileHandler.read_file(filepath)

    if not text:
        return status  # Return error message

    print(f"📂 File loaded: {status}")
    return text