"""
نسخة محسّنة من app.py مع دعم Quantization و Memory Optimization للنماذج الكبيرة على ZeroGPU
Optimized version of app.py with Quantization and Memory Optimization for large models on ZeroGPU
"""

import gc
import os
import tempfile
from typing import Optional, Union

import gradio as gr
import spaces
import torch
from PIL import Image

# Project / third-party imports are optional at import time: if they are
# missing we warn and fall back to a "model unavailable" UI instead of crashing.
try:
    from uni_moe.model.processing_qwen2_vl import Qwen2VLProcessor
    from uni_moe.model.modeling_out import GrinQwen2VLOutForConditionalGeneration
    from uni_moe.qwen_vl_utils import process_mm_info
    from transformers import BitsAndBytesConfig
except ImportError as e:
    print(f"⚠️ Warning: Import error - {e}")
    print("Some features may not work properly.")

# ==================== Configuration ====================

# Choose the model checkpoint to load.
MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Omni"  # Full model
# MODEL_NAME = "HIT-TMG/Uni-MoE-2.0-Base"  # Smaller alternative

# Optimization settings.
USE_4BIT = True            # 4-bit quantization to save memory
USE_8BIT = False           # alternative: 8-bit quantization
USE_FLASH_ATTENTION = True # use Flash Attention for speed
MAX_MEMORY = "20GB"        # per-GPU memory cap (only used without quantization)

device = "cuda" if torch.cuda.is_available() else "cpu"

# ==================== Model Loading ====================

print("="*60)
print(f"🚀 Loading Uni-MoE 2.0 Model")
print(f"📍 Model: {MODEL_NAME}")
print(f"🖥️ Device: {device}")
print(f"⚙️ 4-bit Quantization: {USE_4BIT}")
print(f"⚙️ 8-bit Quantization: {USE_8BIT}")
print("="*60)


def load_model_optimized():
    """Load the processor and model with memory-optimized settings.

    Populates the module-level ``processor`` and ``model`` globals.

    Returns:
        bool: True when both processor and model loaded successfully,
        False otherwise (errors are printed, not raised).
    """
    global processor, model

    try:
        # Load the processor.
        print("📥 Loading processor...")
        processor = Qwen2VLProcessor.from_pretrained(MODEL_NAME)

        # Build the quantization config, if any.
        quantization_config = None
        if USE_4BIT:
            print("⚙️ Setting up 4-bit quantization...")
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
            )
        elif USE_8BIT:
            print("⚙️ Setting up 8-bit quantization...")
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
            )

        # Load the model.
        print("📥 Loading model (this may take a few minutes)...")
        load_kwargs = {
            "device_map": "auto",
            # With 4-bit quantization the dtype is dictated by the quant
            # config, so torch_dtype is left unset (None).
            "torch_dtype": torch.float16 if not USE_4BIT else None,
            "trust_remote_code": True,
        }
        if quantization_config:
            load_kwargs["quantization_config"] = quantization_config
        # A memory cap only makes sense for the non-quantized CUDA path.
        if device == "cuda" and not USE_4BIT and not USE_8BIT:
            load_kwargs["max_memory"] = {0: MAX_MEMORY}

        model = GrinQwen2VLOutForConditionalGeneration.from_pretrained(
            MODEL_NAME, **load_kwargs
        )

        # The processor needs the model config as data_args for preprocessing
        # (project convention — see uni_moe processing code).
        processor.data_args = model.config

        print("✅ Model loaded successfully!")
        print(f"💾 Model size: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B parameters")
        return True

    except Exception as e:
        print(f"❌ Error loading model: {str(e)}")
        return False


# Load the model at import time; on failure expose None so handlers can
# report the problem instead of crashing with a NameError.
model_loaded = load_model_optimized()
if not model_loaded:
    processor = None
    model = None

# ==================== Helper Functions ====================


def clear_gpu_memory():
    """Release cached GPU memory and run the garbage collector."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def estimate_tokens(text: str) -> int:
    """Rough token-count estimate: ~1.3 tokens per whitespace-separated word."""
    # Cast to int so the return value matches the annotated type
    # (the original returned a float).
    return int(len(text.split()) * 1.3)


# ==================== Main Generation Function ====================


@spaces.GPU(duration=120)
def generate_response(
    text_input: str,
    image_input: Optional[Image.Image] = None,
    audio_input: Optional[str] = None,
    temperature: float = 1.0,
    max_new_tokens: int = 512,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """
    توليد استجابة من النموذج
    Generate response from the model
    """
    # Bail out early if the model failed to load at startup.
    if model is None or processor is None:
        return "❌ النموذج غير متاح. يرجى التحقق من السجلات.\n❌ Model not available. Please check logs."
# تنظيف الذاكرة قبل البدء clear_gpu_memory() try: # التحقق من المدخلات if not text_input and image_input is None and audio_input is None: return "⚠️ يرجى إدخال نص أو صورة أو صوت على الأقل.\n⚠️ Please provide at least text, image, or audio input." # بناء محتوى الرسالة content = [] # إضافة النص if text_input: content.append({"type": "text", "text": text_input}) # إضافة الصورة temp_image_path = None if image_input is not None: temp_image_path = tempfile.NamedTemporaryFile(delete=False, suffix=".jpg").name image_input.save(temp_image_path) content.append({"type": "image", "image": temp_image_path}) # إضافة الصوت if audio_input is not None: content.append({"type": "audio", "audio": audio_input}) # بناء الرسائل messages = [{"role": "user", "content": content}] # معالجة النص texts = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) # استبدال العلامات الخاصة texts = texts.replace( "", "<|vision_start|><|image_pad|><|vision_end|>" ).replace( "