Spaces:

Ruhivig65
/

T2ss

Runtime error

File size: 6,277 Bytes

807d482

"""
File Upload Handler
Supports: .txt, .csv, .md, .log, .text, .srt, .sub
Max size: 1GB
Auto encoding detection
"""

import codecs
import os
from typing import Tuple

import chardet

# Maximum file size: 1 GB
MAX_FILE_SIZE = 1 * 1024 * 1024 * 1024  # 1GB in bytes

# Maximum characters: 50 Million
MAX_CHARACTERS = 50_000_000

# Supported file extensions
SUPPORTED_EXTENSIONS = {'.txt', '.csv', '.md', '.text', '.log', '.srt', '.sub'}


class FileHandler:
    """Validate uploaded text files and extract their cleaned contents.

    Every method is a ``@staticmethod``; the class only serves as a namespace.
    """

    @staticmethod
    def validate_file(filepath: str) -> Tuple[bool, str]:
        """Check that *filepath* is an acceptable upload.

        The path must be an existing regular file, carry one of
        ``SUPPORTED_EXTENSIONS`` (compared case-insensitively), be non-empty
        and be no larger than ``MAX_FILE_SIZE`` bytes.

        Returns:
            ``(is_valid, message)`` where *message* is a user-facing string
            describing either success or the reason for rejection.
        """
        # isfile (not exists) so that a directory is never accepted as a file.
        if not filepath or not os.path.isfile(filepath):
            return False, "❌ File not found!"

        ext = os.path.splitext(filepath)[1].lower()
        if ext not in SUPPORTED_EXTENSIONS:
            # Sort so the message does not depend on set iteration order.
            supported = ', '.join(sorted(SUPPORTED_EXTENSIONS))
            return False, f"❌ Unsupported file type: {ext}\nSupported: {supported}"

        file_size = os.path.getsize(filepath)

        if file_size == 0:
            return False, "❌ File is empty!"

        if file_size > MAX_FILE_SIZE:
            size_gb = file_size / (1024 ** 3)
            return False, f"❌ File too large: {size_gb:.2f}GB (Max: 1GB)"

        return True, "✅ File valid"

    @staticmethod
    def detect_encoding(filepath: str) -> str:
        """Guess the text encoding of *filepath* from its first 100KB.

        Returns ``'utf-8'`` whenever detection fails, the reported confidence
        is below 0.5, the guess is plain ASCII, or the guessed name is not a
        codec this Python installation knows.
        """
        try:
            with open(filepath, 'rb') as f:
                # Only a sample is inspected; the rest of the file is unseen.
                raw = f.read(102400)

            result = chardet.detect(raw)
            encoding = result.get('encoding')
            confidence = result.get('confidence') or 0

            if not encoding or confidence < 0.5:
                return 'utf-8'

            # UTF-8 decodes ASCII identically, and the unseen remainder of the
            # file may still contain non-ASCII bytes that an 'ascii' decoder
            # (with errors='ignore') would silently drop.
            if encoding.lower() == 'ascii':
                return 'utf-8'

            # Raises LookupError for names Python has no codec for; the broad
            # handler below turns that into the UTF-8 default.
            codecs.lookup(encoding)
            return encoding

        except Exception:
            # Detection is best-effort: any failure falls back to UTF-8.
            return 'utf-8'

    @staticmethod
    def _humanize(count: int, spec: str) -> str:
        """Format *count* with an ``M``/``K`` suffix using float format *spec*."""
        if count >= 1_000_000:
            return f"{count / 1_000_000:{spec}}M"
        if count >= 1_000:
            return f"{count / 1_000:{spec}}K"
        return str(count)

    @staticmethod
    def _read_limited(filepath: str, encoding: str, max_chars: int) -> Tuple[str, bool]:
        """Read at most *max_chars* characters; return ``(text, was_trimmed)``."""
        with open(filepath, 'r', encoding=encoding, errors='ignore') as f:
            # One extra character is enough to learn whether more data exists,
            # without loading the whole file into memory.
            text = f.read(max_chars + 1)
        if len(text) > max_chars:
            return text[:max_chars], True
        return text, False

    @staticmethod
    def read_file(filepath: str, *, max_chars: int = MAX_CHARACTERS) -> Tuple[str, str]:
        """Read, trim and clean the text stored in *filepath*.

        Args:
            filepath: Path of the file to load.
            max_chars: Maximum number of characters to keep (counted before
                cleaning). Defaults to ``MAX_CHARACTERS``.

        Returns:
            ``(text_content, status_message)``. On any failure, or when the
            file holds no text after cleaning, *text_content* is ``""`` and
            *status_message* explains why.

        Raises:
            ValueError: If *max_chars* is smaller than 1.
        """
        if max_chars < 1:
            raise ValueError("max_chars must be a positive integer")

        is_valid, msg = FileHandler.validate_file(filepath)
        if not is_valid:
            return "", msg

        try:
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            encoding = FileHandler.detect_encoding(filepath)

            try:
                text, trimmed = FileHandler._read_limited(filepath, encoding, max_chars)
                used_fallback = False
            except UnicodeError:
                # Retry as UTF-8 through the same trimming path.
                text, trimmed = FileHandler._read_limited(filepath, 'utf-8', max_chars)
                used_fallback = True

            # The reported count is taken after trimming but before cleaning.
            char_count = len(text)
            text = FileHandler.clean_text(text)

            if not text:
                # Without this, callers that treat an empty text as failure
                # would show a success status as if it were an error.
                return "", "❌ File contains no readable text!"

            if used_fallback:
                status = f"✅ Loaded (fallback encoding): {size_mb:.1f}MB"
            else:
                char_display = FileHandler._humanize(char_count, ".1f")
                status = f"✅ Loaded: {size_mb:.1f}MB | {char_display} characters | Encoding: {encoding}"

            if trimmed:
                limit_display = FileHandler._humanize(max_chars, "g")
                status += f" | ⚠️ Trimmed to {limit_display} characters"

            return text, status

        except MemoryError:
            return "", "❌ File too large for memory! Try a smaller file."

        except Exception as e:
            return "", f"❌ Error reading file: {str(e)}"

    @staticmethod
    def clean_text(text: str) -> str:
        """Normalise *text*: drop NULs, unify newlines, strip and squeeze lines.

        Every line is stripped of leading/trailing whitespace, runs of blank
        lines are capped at two, and the whole result is stripped.
        """
        if not text:
            return ""

        # Remove null bytes
        text = text.replace('\x00', '')

        # Normalize line endings (CRLF first so it does not become two LFs)
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')

        cleaned_lines = []
        empty_count = 0

        for line in text.split('\n'):
            stripped = line.strip()
            if stripped:
                empty_count = 0
                cleaned_lines.append(stripped)
                continue
            empty_count += 1
            if empty_count <= 2:  # Keep max 2 consecutive empty lines
                cleaned_lines.append('')

        return '\n'.join(cleaned_lines).strip()

    @staticmethod
    def get_file_info(filepath: str) -> dict:
        """Return name, size, extension and detected encoding of *filepath*.

        Returns ``{"error": "File not found"}`` when the path is empty or is
        not an existing regular file.
        """
        if not filepath or not os.path.isfile(filepath):
            return {"error": "File not found"}

        file_size = os.path.getsize(filepath)
        _, ext = os.path.splitext(filepath)
        encoding = FileHandler.detect_encoding(filepath)

        return {
            "name": os.path.basename(filepath),
            "size_bytes": file_size,
            "size_mb": file_size / (1024 * 1024),
            "extension": ext,
            "encoding": encoding,
        }


def process_uploaded_file(file) -> str:
    """
    Gradio-compatible file processor, called directly from the UI.

    Accepts either an object exposing a ``name`` attribute (used as the path)
    or any other value, which is converted to a path with ``str()``.

    Returns the cleaned file text on success, the status/error message when
    no text could be extracted, or "" when *file* is None.
    """
    if file is None:
        return ""

    # Prefer the object's ``name``; fall back to its string form.
    try:
        path = file.name
    except AttributeError:
        path = str(file)

    content, report = FileHandler.read_file(path)

    if content:
        print(f"📂 File loaded: {report}")
        return content

    # Nothing was extracted: hand the message back to the caller instead.
    return report