| import gradio as gr |
| import torch |
| import sys |
| import traceback |
| import os |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| from threading import Thread |
|
|
| |
# Hugging Face Hub identifier of the checkpoint this app serves.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# Cap the OpenMP / MKL thread pools and PyTorch's intra-op pool at four threads.
os.environ.update({"OMP_NUM_THREADS": "4", "MKL_NUM_THREADS": "4"})
torch.set_num_threads(4)

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Iniciando arranque de Lumin Flash ({MODEL_ID}) en {device}...")
|
|
# Module-level handles read by the chat handler; both stay None until loading
# succeeds.
model = None
tokenizer = None

# Load the tokenizer and the weights once at import time. Any failure is
# reported on stdout with a full traceback and the process exits with status 1.
try:
    print("⏳ Descargando y cargando Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    print("⏳ Descargando y cargando Modelo en RAM...")
    # Half precision on the GPU, full precision on the CPU.
    if device == "cuda":
        _weights_dtype = torch.float16
    else:
        _weights_dtype = torch.float32
    _load_options = {
        "dtype": _weights_dtype,
        "device_map": "auto",
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
    }
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_load_options)
    print("✅ ¡Modelo cargado correctamente en memoria!")
except Exception as load_error:
    _banner = "❌" * 20
    print(_banner)
    print(f"ERROR CRÍTICO FATAL AL CARGAR EL MODELO:\n{load_error}")
    print(traceback.format_exc())
    print(_banner)
    # Without a model the app cannot serve anything, so stop the process.
    sys.exit(1)
|
|
|
|
def _message_text(content):
    """Return the plain text carried by a history entry's content value.

    Strings are returned unchanged. A list is treated as a sequence of parts:
    string parts and dict parts with a string ``"text"`` value are joined with
    newlines. Anything else (``None``, file tuples, components) yields ``""``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): some Gradio versions appear to deliver content as a
        # list of {"type": "text", "text": ...} parts - confirm against the
        # installed version.
        pieces = []
        for part in content:
            if isinstance(part, str):
                pieces.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                pieces.append(part["text"])
        return "\n".join(pieces)
    return ""


def _history_to_messages(history):
    """Convert chat history into a list of role/content dicts.

    Accepts both history layouts: ``(user, assistant)`` pairs and dicts with
    ``"role"`` / ``"content"`` keys. Entries without usable text (for example
    a pending ``None`` reply) and roles other than user/assistant are skipped.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _message_text(turn.get("content"))
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            for role, content in zip(("user", "assistant"), turn):
                text = _message_text(content)
                if text:
                    messages.append({"role": role, "content": text})
    return messages


def chat(message, history):
    """Stream the assistant's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message.
        history: Previous turns, either as ``(user, assistant)`` pairs or as
            dicts with ``"role"`` and ``"content"`` keys.

    Yields:
        The reply accumulated so far, growing with every streamed chunk. If
        generation fails, a final item is yielded with an error notice
        appended to whatever text had been produced.
    """
    if model is None or tokenizer is None:
        yield "⚠️ Error del servidor: El modelo de IA no está cargado correctamente. Contacta al administrador."
        return

    messages = [{
        "role": "system",
        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
    }]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        # Best-effort fallback: a hand-written prompt holding only the newest
        # message (earlier turns are dropped).
        print(f"Aviso tokenizer: Falló el apply_chat_template, usando fallback manual. {e}")
        text = f"<|im_start|>system\nYou are Lumin Flash.<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    inputs = tokenizer([text], return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.1
    )

    failures = []

    def _generate():
        # Runs in the worker thread. If generate() raises, the streamer would
        # never receive its stop signal and the consumer loop below would
        # block forever, so the stream is ended explicitly on failure.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            print(traceback.format_exc())
            streamer.end()

    thread = Thread(target=_generate)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    # The stream is finished; wait for the worker so its outcome is known.
    thread.join()
    if failures:
        notice = "⚠️ Error del servidor: la generación se interrumpió inesperadamente."
        yield f"{partial_text}\n\n{notice}" if partial_text else notice
|
|
|
|
| |
# Build the chat UI: a 500px conversation pane and a borderless prompt box,
# wired to the streaming `chat` handler.
_chat_window = gr.Chatbot(height=500)
_prompt_box = gr.Textbox(
    placeholder="Pregúntale a Lumin Flash...",
    container=False,
    scale=7,
)
demo = gr.ChatInterface(
    fn=chat,
    chatbot=_chat_window,
    textbox=_prompt_box,
    title="⚡ Lumin Flash (High Performance)",
    description="Backend oficial de inferencia rápida para Lumin Web.",
)


if __name__ == "__main__":
    # Enable request queuing, then serve on every network interface at port 7860.
    _app = demo.queue()
    _app.launch(server_name="0.0.0.0", server_port=7860)
|
|