nova committed on
Commit
0bdb813
·
verified ·
1 Parent(s): 0b329f8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -23
app.py CHANGED
@@ -1,79 +1,118 @@
# Set thread-limit env vars BEFORE importing torch: the OpenMP/MKL runtimes
# read them once at native-library initialization, so setting them after the
# import has no effect.
import os

os.environ["OMP_NUM_THREADS"] = "4"  # limit threads to avoid CPU contention
os.environ["MKL_NUM_THREADS"] = "4"

import sys
import traceback
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model configuration: Qwen 2.5 0.5B (ultra fast — low RAM/CPU footprint).
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

torch.set_num_threads(4)

# Pick the compute device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Loading {MODEL_ID} on {device}...")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # stream weights in to reduce peak RAM
    )
except Exception as e:
    # Fail fast: without a model/tokenizer every later call would raise a
    # confusing NameError. Exiting surfaces the real load error instead of
    # leaving the server in a half-alive state.
    print(f"❌ Error loading model: {e}")
    print(traceback.format_exc())
    sys.exit(1)
def chat(message, history):
    """Stream an assistant reply for *message* given prior chat *history*.

    Args:
        message: Latest user message (str).
        history: List of (user_msg, bot_msg) pairs supplied by gr.ChatInterface.

    Yields:
        The accumulated reply text, growing as tokens are generated.
    """
    # Build the conversation in chat-template message format.
    messages = [{
        "role": "system",
        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
    }]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Tokenize with the model's chat template; fall back to manual ChatML.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception:
        # Manual ChatML fallback built from the SAME message list, so the
        # system prompt and conversation history are not silently dropped.
        # (A bare `except:` would also trap SystemExit/KeyboardInterrupt.)
        text = "".join(
            f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
        ) + "<|im_start|>assistant\n"

    inputs = tokenizer([text], return_tensors="pt").to(device)

    # Streamer yields decoded text as soon as tokens are produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,  # generous budget to avoid cut-off answers
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

    # model.generate blocks, so run it on a worker thread while this
    # generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()  # ensure the generation thread has fully finished
# Gradio chat UI wired to the streaming `chat` generator.
# (type="messages" deliberately omitted from the Chatbot for compatibility.)
_chat_window = gr.Chatbot(height=500)
_input_box = gr.Textbox(placeholder="Ask Lumin Flash...", container=False, scale=7)

demo = gr.ChatInterface(
    fn=chat,
    chatbot=_chat_window,
    textbox=_input_box,
    title="Lumin Flash (Smart Edition)",
)

if __name__ == "__main__":
    # Queueing is required for streamed (generator) responses; bind on all
    # interfaces so the hosting platform can route traffic to port 7860.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
# Set thread-limit env vars BEFORE importing torch: the OpenMP/MKL runtimes
# read them once at native-library initialization, so setting them after
# `import torch` has no effect. (Aggressive CPU/RAM tuning for the free tier.)
import os

os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"

import sys
import traceback
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Model configuration.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

torch.set_num_threads(4)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Iniciando arranque de Lumin Flash ({MODEL_ID}) en {device}...")

# Sentinels so chat() can detect a failed load instead of hitting NameError.
model = None
tokenizer = None

try:
    print("⏳ Descargando y cargando Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    print("⏳ Descargando y cargando Modelo en RAM...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto",
        trust_remote_code=True,
        low_cpu_mem_usage=True,  # stream weights in to reduce peak RAM
    )
    print("✅ ¡Modelo cargado correctamente en memoria!")

except Exception as e:
    print("❌" * 20)
    print(f"ERROR CRÍTICO FATAL AL CARGAR EL MODELO:\n{e}")
    print(traceback.format_exc())
    print("❌" * 20)
    # Force the container to die when there is no model, so the platform
    # reports the failure instead of a "running zombie" that NameErrors.
    sys.exit(1)
def chat(message, history):
    """Stream an assistant reply for *message* given Gradio chat *history*.

    Args:
        message: Latest user message (str).
        history: List of (user_msg, bot_msg) pairs supplied by gr.ChatInterface.

    Yields:
        The accumulated reply text, growing as tokens are generated.
    """
    # Runtime safety check: the loader leaves these as None if startup failed.
    if model is None or tokenizer is None:
        yield "⚠️ Error del servidor: El modelo de IA no está cargado correctamente. Contacta al administrador."
        return

    # Build the full conversation: system prompt, prior turns, new message.
    messages = [{
        "role": "system",
        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
    }]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # Format the prompt with the official Qwen/ChatML template.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    except Exception as e:
        print(f"Aviso tokenizer: Falló el apply_chat_template, usando fallback manual. {e}")
        # Manual ChatML fallback built from the SAME message list, so the
        # system prompt and conversation history are not silently dropped.
        text = "".join(
            f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n" for m in messages
        ) + "<|im_start|>assistant\n"

    # Move the tokenized prompt to the compute device.
    inputs = tokenizer([text], return_tensors="pt").to(device)

    # Streamer yields decoded text as soon as tokens are produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation parameters tuned for coherent, non-repetitive answers.
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.1,
    )

    # model.generate blocks, so run it on a background thread while this
    # generator drains the streamer and yields partial text to the UI.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    thread.join()  # ensure the worker has fully finished before returning
# Gradio chat UI wired to the streaming `chat` generator.
_chat_window = gr.Chatbot(height=500)
_input_box = gr.Textbox(placeholder="Pregúntale a Lumin Flash...", container=False, scale=7)

demo = gr.ChatInterface(
    fn=chat,
    chatbot=_chat_window,
    textbox=_input_box,
    title="Lumin Flash (High Performance)",
    description="Backend oficial de inferencia rápida para Lumin Web.",
)

if __name__ == "__main__":
    # Queueing is required for streamed (generator) responses; bind on all
    # interfaces so the hosting platform can route traffic to port 7860.
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)