"""
File Upload Handler
Supports: .txt, .csv, .md, .log, .text, .srt, .sub
Max size: 1GB (loaded text is trimmed to 50 million characters)
Auto encoding detection
"""
import os
import chardet
from typing import Tuple
# Largest accepted upload, in bytes (1 GB = 1024^3).
MAX_FILE_SIZE = 1024 ** 3

# Loaded text longer than this many characters is trimmed (50 million).
MAX_CHARACTERS = 50 * 10 ** 6

# Accepted file extensions: lowercase, including the leading dot.
SUPPORTED_EXTENSIONS = {
    '.csv',
    '.log',
    '.md',
    '.srt',
    '.sub',
    '.text',
    '.txt',
}
class FileHandler:
    """Validate, decode, and clean uploaded text files.

    Every method is static; the class only serves as a namespace.
    """

    @staticmethod
    def validate_file(filepath: str) -> Tuple[bool, str]:
        """Check that *filepath* is an existing, supported, non-empty file.

        Returns:
            ``(True, message)`` when the file is acceptable, otherwise
            ``(False, reason)`` with a user-facing explanation.
        """
        # isfile() rather than exists(): a directory must be rejected here
        # instead of failing later when it is opened for reading.
        if not filepath or not os.path.isfile(filepath):
            return False, "❌ File not found!"

        # Check extension (case-insensitive)
        ext = os.path.splitext(filepath)[1].lower()
        if ext not in SUPPORTED_EXTENSIONS:
            # Sorted so the message is stable; set iteration order is not.
            supported = ', '.join(sorted(SUPPORTED_EXTENSIONS))
            return False, f"❌ Unsupported file type: {ext}\nSupported: {supported}"

        # Check file size
        file_size = os.path.getsize(filepath)
        if file_size == 0:
            return False, "❌ File is empty!"
        if file_size > MAX_FILE_SIZE:
            size_gb = file_size / (1024 ** 3)
            return False, f"❌ File too large: {size_gb:.2f}GB (Max: 1GB)"

        return True, "✅ File valid"

    @staticmethod
    def detect_encoding(filepath: str) -> str:
        """Guess the text encoding of *filepath* from its first 100 KB.

        Returns:
            The detected encoding name, or ``'utf-8'`` when detection fails,
            reports a confidence below 0.5, or reports plain ASCII.
        """
        try:
            with open(filepath, 'rb') as f:
                # Read first 100KB for detection
                raw = f.read(102400)
            result = chardet.detect(raw)
            encoding = result.get('encoding')
            confidence = result.get('confidence') or 0
            # Default to utf-8 if nothing was detected or confidence is low
            if not encoding or confidence < 0.5:
                return 'utf-8'
            # Only a prefix was sampled: an ASCII verdict says nothing about
            # the rest of the file. UTF-8 decodes ASCII identically, while
            # opening as ASCII with errors='ignore' would silently drop any
            # later non-ASCII characters.
            if encoding.lower() == 'ascii':
                return 'utf-8'
            return encoding
        except Exception:
            # Deliberate best effort: any detection failure means UTF-8.
            return 'utf-8'

    @staticmethod
    def _read_capped(filepath: str, encoding: str) -> str:
        """Read at most MAX_CHARACTERS + 1 characters, skipping undecodable bytes."""
        with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
            # One extra character is enough to tell whether trimming occurred,
            # without loading the remainder of a very large file.
            return f.read(MAX_CHARACTERS + 1)

    @staticmethod
    def _format_count(count: int) -> str:
        """Format a character count compactly, e.g. 1500 -> '1.5K'."""
        if count >= 1_000_000:
            return f"{count / 1_000_000:.1f}M"
        if count >= 1_000:
            return f"{count / 1_000:.1f}K"
        return str(count)

    @staticmethod
    def read_file(filepath: str) -> Tuple[str, str]:
        """Read, trim, and clean the text of *filepath*.

        Returns:
            ``(text_content, status_message)``. On any failure the text is
            ``""`` and the status message describes the problem.
        """
        # Validate
        is_valid, msg = FileHandler.validate_file(filepath)
        if not is_valid:
            return "", msg

        try:
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            encoding = FileHandler.detect_encoding(filepath)

            used_fallback = False
            try:
                text = FileHandler._read_capped(filepath, encoding)
            except (LookupError, UnicodeDecodeError):
                # The detected name may be a codec Python does not provide
                # (LookupError); retry with UTF-8.
                encoding = 'utf-8'
                text = FileHandler._read_capped(filepath, encoding)
                used_fallback = True

            # Trim if exceeds limit
            trimmed = len(text) > MAX_CHARACTERS
            if trimmed:
                text = text[:MAX_CHARACTERS]
            # Counted before cleaning, so it reflects what was read.
            char_count = len(text)

            text = FileHandler.clean_text(text)

            char_display = FileHandler._format_count(char_count)
            label = "✅ Loaded (fallback encoding)" if used_fallback else "✅ Loaded"
            status = f"{label}: {size_mb:.1f}MB | {char_display} characters | Encoding: {encoding}"
            if trimmed:
                status += " | ⚠️ Trimmed to 50M characters"
            return text, status
        except MemoryError:
            return "", "❌ File too large for memory! Try a smaller file."
        except Exception as e:
            return "", f"❌ Error reading file: {str(e)}"

    @staticmethod
    def clean_text(text: str) -> str:
        """Apply basic cleaning to *text*.

        Removes NUL characters, normalizes line endings to ``\\n``, strips
        leading/trailing whitespace from every line, keeps at most two
        consecutive empty lines, and strips the final result.
        """
        if not text:
            return ""

        # Remove null bytes
        text = text.replace('\x00', '')
        # Normalize line endings (CRLF first so it is not turned into two LFs)
        text = text.replace('\r\n', '\n').replace('\r', '\n')

        cleaned_lines = []
        blank_run = 0
        for line in text.split('\n'):
            stripped = line.strip()
            if stripped:
                blank_run = 0
                cleaned_lines.append(stripped)
                continue
            blank_run += 1
            if blank_run <= 2:  # Keep max 2 empty lines
                cleaned_lines.append('')

        return '\n'.join(cleaned_lines).strip()

    @staticmethod
    def get_file_info(filepath: str) -> dict:
        """Return name, size, extension, and detected encoding of *filepath*.

        Returns:
            A dict with keys ``name``, ``size_bytes``, ``size_mb``,
            ``extension`` and ``encoding``, or ``{"error": "File not found"}``
            when the path is empty or is not an existing regular file.
        """
        if not filepath or not os.path.isfile(filepath):
            return {"error": "File not found"}

        file_size = os.path.getsize(filepath)
        _, ext = os.path.splitext(filepath)
        encoding = FileHandler.detect_encoding(filepath)
        return {
            "name": os.path.basename(filepath),
            "size_bytes": file_size,
            "size_mb": file_size / (1024 * 1024),
            "extension": ext,
            "encoding": encoding,
        }
def process_uploaded_file(file) -> str:
    """
    Gradio-compatible file processor, called directly from the UI.

    Takes either an object exposing a ``name`` attribute (used as the path)
    or any other value, which is converted to a path with ``str()``.
    Returns the loaded text; when no text was produced, the status message
    from ``FileHandler.read_file`` is returned instead. ``None`` yields "".
    """
    if file is None:
        return ""

    if hasattr(file, 'name'):
        source_path = file.name
    else:
        source_path = str(file)

    content, report = FileHandler.read_file(source_path)
    if content:
        print(f"π File loaded: {report}")
        return content

    # No text came back: hand the status message to the caller instead
    return report