Spaces:

suprimedev
/

filereader

Sleeping

File size: 7,256 Bytes

750e443

import gradio as gr
import chardet
import mimetypes
import pandas as pd
import json
import yaml
import toml
import configparser
import xml.etree.ElementTree as ET
from pathlib import Path

# Extensions of well-known binary formats that are rejected without being opened.
_BINARY_EXTENSIONS = frozenset({
    'exe', 'dll', 'so', 'dylib', 'bin', 'dat', 'db', 'sqlite',
    'zip', 'rar', '7z', 'tar', 'gz', 'bz2', 'xz',
    'jpg', 'jpeg', 'png', 'gif', 'bmp', 'ico', 'svg', 'webp',
    'mp3', 'mp4', 'avi', 'mkv', 'mov', 'wmv', 'flv', 'webm',
    'wav', 'flac', 'aac', 'ogg', 'wma', 'm4a',
    'pdf', 'doc', 'ppt', 'pptx', 'odt', 'ods', 'odp',
})

# MIME type prefixes of media files that are never rendered as text.
_MEDIA_MIME_PREFIXES = ('audio/', 'image/', 'video/')

# Encodings tried, in order, when the detected encoding fails to decode.
_FALLBACK_ENCODINGS = ('utf-8', 'utf-16', 'latin-1', 'windows-1252', 'cp1256')


def _read_plain_text(file_path):
    """Return the file's text, guessing its encoding with chardet.

    Only the first 10,000 bytes are sampled for detection. When the
    detector's confidence is below 0.7 the file is tried as UTF-8 only and
    reported as probably binary if that fails. Otherwise the detected
    encoding is tried first, followed by a fixed list of fallbacks.
    """
    with open(file_path, 'rb') as f:
        raw_data = f.read(10000)
    result = chardet.detect(raw_data)
    encoding = result.get('encoding') or 'utf-8'
    # `or 0` also covers a confidence of None, which would otherwise make
    # the comparison below raise TypeError.
    confidence = result.get('confidence') or 0

    # Low confidence usually means binary data: give UTF-8 one chance only.
    if confidence < 0.7:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            return "این فایل احتمالاً باینری است یا encoding آن قابل تشخیص نیست"
        return f"=== محتوای فایل (UTF-8) ===\n{content}"

    for enc in (encoding, *_FALLBACK_ENCODINGS):
        try:
            with open(file_path, 'r', encoding=enc) as f:
                content = f.read()
        except (UnicodeError, LookupError):
            # UnicodeError: bytes are not valid in this encoding (the utf-16
            # decoder raises the base class when the BOM is missing).
            # LookupError: Python has no codec under the detected name.
            continue
        return f"=== محتوای فایل ({enc}) ===\n{content}"

    return "نمی‌توان این فایل را خواند. احتمالاً فایل باینری است."


def read_file(file):
    """Read an uploaded file and return its contents as display text.

    Args:
        file: ``None``, a filesystem path (``str`` or ``pathlib.Path``), or
            any object whose ``name`` attribute is the path of the file
            (the form in which some Gradio versions hand over uploads).

    Returns:
        A string that is either the formatted contents of the file (prefixed
        with a ``=== ... ===`` header naming the format or encoding) or a
        user-facing message explaining why the file was not read. Exceptions
        raised while reading are caught and reported in the returned string
        rather than propagated.
    """
    if file is None:
        return "لطفا یک فایل انتخاب کنید"

    try:
        # A Path has its own `.name` (the basename only), so path-like inputs
        # must be recognised before falling back to the `.name` attribute.
        if isinstance(file, (str, Path)):
            file_path = str(file)
        else:
            file_path = file.name
        file_extension = Path(file_path).suffix.lower().strip('.')

        # First filter by the MIME type guessed from the file name.
        mime_type, _ = mimetypes.guess_type(file_path)
        if mime_type and mime_type.startswith(_MEDIA_MIME_PREFIXES):
            return f"فایل‌های صوتی، تصویری و ویدیویی پشتیبانی نمی‌شوند. (MIME: {mime_type})"

        # Well-known binary formats are rejected by extension.
        if file_extension in _BINARY_EXTENSIONS:
            return f"فرمت فایل .{file_extension} یک فایل باینری است و پشتیبانی نمی‌شود"

        # Excel workbooks
        if file_extension in ['xlsx', 'xls', 'xlsm', 'xlsb']:
            df = pd.read_excel(file_path)
            return f"=== محتوای Excel ===\n{df.to_string()}"

        # CSV and TSV tables
        elif file_extension in ['csv', 'tsv']:
            separator = '\t' if file_extension == 'tsv' else ','
            df = pd.read_csv(file_path, sep=separator)
            return f"=== محتوای {file_extension.upper()} ===\n{df.to_string()}"

        # JSON documents (re-serialised with indentation)
        elif file_extension == 'json':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return f"=== محتوای JSON ===\n{json.dumps(data, indent=2, ensure_ascii=False)}"

        # YAML documents
        elif file_extension in ['yaml', 'yml']:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            return f"=== محتوای YAML ===\n{yaml.dump(data, allow_unicode=True, default_flow_style=False)}"

        # TOML documents
        elif file_extension == 'toml':
            with open(file_path, 'r', encoding='utf-8') as f:
                data = toml.load(f)
            return f"=== محتوای TOML ===\n{toml.dumps(data)}"

        # INI-style configuration files
        elif file_extension in ['ini', 'cfg', 'conf', 'config']:
            # Interpolation is disabled so values are shown as written and a
            # literal '%' in a value does not raise an interpolation error.
            config = configparser.ConfigParser(interpolation=None)
            try:
                config.read(file_path, encoding='utf-8')
            except (configparser.Error, UnicodeDecodeError):
                # Not INI-shaped (e.g. no section headers) or not UTF-8:
                # show the file as plain text instead of failing.
                return _read_plain_text(file_path)
            parts = [f"=== محتوای {file_extension.upper()} ===\n"]
            for section in config.sections():
                parts.append(f"\n[{section}]\n")
                parts.extend(
                    f"{key} = {value}\n" for key, value in config.items(section)
                )
            return "".join(parts)

        # XML documents
        elif file_extension == 'xml':
            tree = ET.parse(file_path)
            root = tree.getroot()
            return f"=== محتوای XML ===\n{ET.tostring(root, encoding='unicode')}"

        # Everything else: attempt to read it as plain text.
        else:
            return _read_plain_text(file_path)

    except Exception as e:
        # Top-level boundary for the UI: report the failure as text.
        return f"خطا در خواندن فایل: {str(e)}"

# Markdown list of supported and unsupported formats (shown in the UI description).
supported_formats = "\n".join((
    "",
    "**فرمت‌های پشتیبانی شده:**",
    "- فایل‌های متنی: .txt, .log, .md, .rst, .tex",
    "- کدهای برنامه‌نویسی: .py, .js, .ts, .java, .cpp, .c, .h, .cs, .rb, .go, .rs, .swift, .kt, .scala, .r, .m, .php, .pl, .lua, .sh, .bat, .ps1",
    "- فایل‌های وب: .html, .htm, .css, .scss, .sass, .less, .jsx, .tsx, .vue",
    "- فایل‌های پیکربندی: .json, .yaml, .yml, .toml, .ini, .cfg, .conf, .config, .env, .properties",
    "- فایل‌های داده: .csv, .tsv, .xml",
    "- فایل‌های Office: .xlsx, .xls (نیاز به محتوای متنی)",
    "- و بسیاری فرمت‌های متنی دیگر...",
    "",
    "**فرمت‌های پشتیبانی نشده:**",
    "- تصاویر: .jpg, .png, .gif, .bmp, و...",
    "- صوت: .mp3, .wav, .flac, و...",
    "- ویدیو: .mp4, .avi, .mkv, و...",
    "- فایل‌های فشرده: .zip, .rar, .tar, و...",
    "- فایل‌های باینری: .exe, .dll, .bin, و...",
    "",
))

# Gradio user interface: a single file upload feeding read_file, with the
# returned text shown in a multi-line text box.
iface = gr.Interface(
    title="خواننده جامع فایل‌های متنی",
    description=f"تقریباً هر فایل متنی را آپلود کنید تا محتوای آن را مشاهده کنید\n{supported_formats}",
    fn=read_file,
    inputs=gr.File(file_types=None, label="فایل را انتخاب کنید"),
    outputs=gr.Textbox(max_lines=100, lines=25, label="محتوای فایل"),
    theme="soft",
    examples=[],
)

# Launch the interface only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()