albertoo85 commited on
Commit
8ca33f7
·
verified ·
1 Parent(s): e31f1df

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -17
app.py CHANGED
@@ -1,26 +1,49 @@
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
- import torch
 
 
4
 
5
- # Usamos la versión de 0.5B o 1.5B si el Space es gratuito,
6
- # ya que el de 7B con 'transformers' superará los 16GB de RAM y colapsará.
7
- model_id = "Qwen/Qwen2.5-1.5B-Instruct"
8
 
9
- print("Cargando tokenizer y modelo...")
10
- tokenizer = AutoTokenizer.from_pretrained(model_id)
11
- model = AutoModelForCausalLM.from_pretrained(
12
- model_id,
13
- torch_dtype="auto",
14
- device_map="cpu" # Forzamos CPU
15
  )
16
 
17
- def chat(message, history):
18
- inputs = tokenizer(message, return_tensors="pt")
19
- outputs = model.generate(**inputs, max_new_tokens=512)
20
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
21
- return response
22
 
23
- demo = gr.ChatInterface(fn=chat, type="messages")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  if __name__ == "__main__":
26
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
+ from llama_cpp import Llama
3
+ from huggingface_hub import hf_hub_download
4
+ import psutil
5
+ import os
6
 
7
# RAM monitor helper.
def get_ram():
    """Return a human-readable string with the current process RSS in MB."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return f"RAM usada: {rss_bytes / (1024 * 1024):.2f} MB"
10
 
11
# 1. Download the GGUF model (efficient for CPU-only inference).
# Q4_K_M is a 4-bit quantization, so the 7B model fits in the free-tier RAM
# where the full-precision 'transformers' load would not.
print("Descargando modelo GGUF...")
model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
    filename="qwen2.5-7b-instruct-q4_k_m.gguf"
)

# 2. Load the model into memory.
# n_ctx=2048 caps the context window; n_threads=2 — presumably sized for the
# free CPU Space's vCPU count (TODO confirm against the hardware tier).
print("Cargando modelo en memoria...")
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
 
 
21
 
22
def build_chatml_prompt(message, system_prompt, ram_info):
    """Return a ChatML-formatted prompt for the Qwen instruct model.

    Pure string construction, split out of predict() so the prompt format
    can be verified without loading the model. The RAM usage string is
    injected into the system turn (as the original code did inline).
    """
    return (
        f"<|im_start|>system\n{system_prompt} ({ram_info})<|im_end|>\n"
        f"<|im_start|>user\n{message}<|im_end|>\n"
        f"<|im_start|>assistant\n"
    )


def predict(message, system_prompt="Responde en español."):
    """Run one chat turn through the local GGUF model.

    Parameters:
        message: the user's message text.
        system_prompt: system instruction prepended in the ChatML header.

    Returns:
        The model's completion text (str), stopped at the next ChatML marker.
    """
    # ChatML prompt format expected by Qwen2.5-Instruct.
    prompt = build_chatml_prompt(message, system_prompt, get_ram())

    output = llm(
        prompt,
        max_tokens=1024,
        # Stop tokens keep the model from continuing past its own turn.
        stop=["<|im_end|>", "<|im_start|>"],
        echo=False,  # do not echo the prompt back in the completion
    )
    return output['choices'][0]['text']
33
+
34
# 3. Minimal, API-friendly UI (plain Blocks instead of ChatInterface).
with gr.Blocks() as demo:
    gr.Markdown("# Nodo API Qwen2.5-7B")
    with gr.Row():
        with gr.Column():
            user_box = gr.Textbox(label="Mensaje", placeholder="Escribe aquí...")
            system_box = gr.Textbox(label="System Prompt", value="Eres un asistente útil.")
            send_btn = gr.Button("Enviar", variant="primary")
        with gr.Column():
            answer_box = gr.Textbox(label="Respuesta")

    # Registered under api_name="query" — this endpoint name is what the
    # remote VM calls through the Gradio API, so it must not change.
    send_btn.click(
        predict,
        inputs=[user_box, system_box],
        outputs=answer_box,
        api_name="query",
    )
47
 
48
if __name__ == "__main__":
    # Bind on all interfaces at port 7860 — the standard HF Spaces port.
    demo.launch(server_name="0.0.0.0", server_port=7860)