nova committed on
Commit
0bdb813
·
verified ·
1 Parent(s): 0b329f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -23
app.py CHANGED
@@ -1,79 +1,118 @@
# Set thread-limit env vars BEFORE importing torch: the OpenMP/MKL runtimes
# read them once at native-library initialization, so setting them after the
# import has no effect.
import os

os.environ["OMP_NUM_THREADS"] = "4"  # limit threads to avoid CPU contention
os.environ["MKL_NUM_THREADS"] = "4"

import sys
import traceback
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model configuration: Qwen 2.5 0.5B (ultra fast — low RAM/CPU footprint).
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

torch.set_num_threads(4)

# Pick the compute device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Loading {MODEL_ID} on {device}...")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # stream weights in to reduce peak RAM
    )
except Exception as e:
    # Fail fast: without a model/tokenizer every later call would raise a
    # confusing NameError. Exiting surfaces the real load error instead of
    # leaving the server in a half-alive state.
    print(f"❌ Error loading model: {e}")
    print(traceback.format_exc())
    sys.exit(1)
def chat(message, history):
    """Stream an assistant reply for *message* given prior chat *history*.

    Args:
        message: Latest user message (str).
        history: List of (user_msg, bot_msg) pairs supplied by gr.ChatInterface.

    Yields:
        The accumulated reply text, growing as tokens are generated.
    """
    # Build the conversation in chat-template message format.
    messages = [{
        "role": "system",
        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
    }]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Tokenize with the model's chat template; fall back to manual ChatML.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception:
        # Manual ChatML fallback built from the SAME message list, so the
        # system prompt and conversation history are not silently dropped.
        # (A bare `except:` would also trap SystemExit/KeyboardInterrupt.)
        text = "".join(
            f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
        ) + "<|im_start|>assistant\n"

    inputs = tokenizer([text], return_tensors="pt").to(device)

    # Streamer yields decoded text as soon as tokens are produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,  # generous budget to avoid cut-off answers
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

    # model.generate blocks, so run it on a worker thread while this
    # generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()  # ensure the generation thread has fully finished
# Gradio chat UI wired to the streaming `chat` generator.
# (type="messages" deliberately omitted from the Chatbot for compatibility.)
_chat_window = gr.Chatbot(height=500)
_input_box = gr.Textbox(placeholder="Ask Lumin Flash...", container=False, scale=7)

demo = gr.ChatInterface(
    fn=chat,
    chatbot=_chat_window,
    textbox=_input_box,
    title="Lumin Flash (Smart Edition)",
)

if __name__ == "__main__":
    # Queueing is required for streamed (generator) responses; bind on all
    # interfaces so the hosting platform can route traffic to port 7860.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
# Set thread-limit env vars BEFORE importing torch: the OpenMP/MKL runtimes
# read them once at native-library initialization, so setting them after
# `import torch` has no effect. (Aggressive CPU/RAM tuning for the free tier.)
import os

os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"

import sys
import traceback
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model configuration.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

torch.set_num_threads(4)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Iniciando arranque de Lumin Flash ({MODEL_ID}) en {device}...")

# Sentinels so chat() can detect a failed load instead of hitting NameError.
model = None
tokenizer = None

try:
    print("⏳ Descargando y cargando Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    print("⏳ Descargando y cargando Modelo en RAM...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # stream weights in to reduce peak RAM
    )
    print("✅ ¡Modelo cargado correctamente en memoria!")

except Exception as e:
    print("❌" * 20)
    print(f"ERROR CRÍTICO FATAL AL CARGAR EL MODELO:\n{e}")
    print(traceback.format_exc())
    print("❌" * 20)
    # Force the container to die when there is no model, so the platform
    # reports the failure instead of a "running zombie" that NameErrors.
    sys.exit(1)
def chat(message, history):
    """Stream an assistant reply for *message* given Gradio chat *history*.

    Args:
        message: Latest user message (str).
        history: List of (user_msg, bot_msg) pairs supplied by gr.ChatInterface.

    Yields:
        The accumulated reply text, growing as tokens are generated.
    """
    # Runtime safety check: the loader leaves these as None if startup failed.
    if model is None or tokenizer is None:
        yield "⚠️ Error del servidor: El modelo de IA no está cargado correctamente. Contacta al administrador."
        return

    # Build the full conversation: system prompt, prior turns, new message.
    messages = [{
        "role": "system",
        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
    }]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Format the prompt with the official Qwen/ChatML template.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception as e:
        print(f"Aviso tokenizer: Falló el apply_chat_template, usando fallback manual. {e}")
        # Manual ChatML fallback built from the SAME message list, so the
        # system prompt and conversation history are not silently dropped.
        text = "".join(
            f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
        ) + "<|im_start|>assistant\n"

    # Move the tokenized prompt to the compute device.
    inputs = tokenizer([text], return_tensors="pt").to(device)

    # Streamer yields decoded text as soon as tokens are produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters tuned for coherent, non-repetitive answers.
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.1,
    )

    # model.generate blocks, so run it on a background thread while this
    # generator drains the streamer and yields partial text to the UI.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()  # ensure the worker has fully finished before returning
# Gradio chat UI wired to the streaming `chat` generator.
_chat_window = gr.Chatbot(height=500)
_input_box = gr.Textbox(placeholder="Pregúntale a Lumin Flash...", container=False, scale=7)

demo = gr.ChatInterface(
    fn=chat,
    chatbot=_chat_window,
    textbox=_input_box,
    title="Lumin Flash (High Performance)",
    description="Backend oficial de inferencia rápida para Lumin Web.",
)

if __name__ == "__main__":
    # Queueing is required for streamed (generator) responses; bind on all
    # interfaces so the hosting platform can route traffic to port 7860.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)