import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# PRO: Qwen 2.5 14B Instruct (GGUF) - Q4_K_M
# Fits in 16GB RAM (tight, but it works)
REPO_ID = "bartowski/Qwen2.5-14B-Instruct-GGUF"
FILENAME = "Qwen2.5-14B-Instruct-Q4_K_M.gguf"

# Everything below runs at import time: the model file is resolved through
# hf_hub_download first, and the path it returns is what Llama loads.
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

print("Loading model...")
# Single module-level model instance shared by every call to generate_pro.
llm = Llama(
    model_path=model_path,
    n_ctx=8192,  # Moderate context size, limited by available RAM
    n_threads=2,  # CPU friendly
    verbose=False
)
def _content_to_text(content):
    """Return one history ``content`` value as plain text for the prompt."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): newer Gradio releases appear to deliver message content
        # as a list of parts such as {"type": "text", "text": "..."} -- confirm
        # against the installed Gradio version.
        return "".join(
            str(part.get("text", "")) if isinstance(part, dict) else str(part)
            for part in content
        )
    # Anything else is stringified, which is what f-string interpolation of
    # the raw value would have produced.
    return str(content)


def _history_to_turns(history):
    """Normalize chat history into a list of ``(role, text)`` pairs.

    Two entry shapes are accepted:

    * pair entries ``(user_msg, bot_msg)``; a half that is ``None`` is
      skipped instead of being rendered as the literal text "None";
    * dict entries carrying ``"role"`` and ``"content"`` keys.

    Dict entries with an unknown role or without content are ignored.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content")
            if role in ("system", "user", "assistant") and content is not None:
                turns.append((role, _content_to_text(content)))
            continue
        user_msg, bot_msg = entry
        if user_msg is not None:
            turns.append(("user", _content_to_text(user_msg)))
        if bot_msg is not None:
            turns.append(("assistant", _content_to_text(bot_msg)))
    return turns


def _build_chatml_prompt(message, history):
    """Render the past turns plus the new user message as a ChatML prompt."""
    segments = [
        f"<|im_start|>{role}\n{text}<|im_end|>\n"
        for role, text in _history_to_turns(history)
    ]
    # The prompt ends with an open assistant turn for the model to complete.
    segments.append(
        f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    )
    return "".join(segments)


def generate_pro(message, history):
    """Stream the model's reply to ``message`` given the chat ``history``.

    Args:
        message: The new user message.
        history: Previous turns, either as ``(user_msg, bot_msg)`` pairs or as
            dicts with ``"role"`` and ``"content"`` keys. ``None`` or an empty
            list means no previous turns.

    Yields:
        The reply accumulated so far, growing with every streamed chunk, so
        each yielded value can replace the previous one in the UI.

    Note:
        The history is not truncated here; a long enough conversation can
        exceed the context window configured on ``llm``.
    """
    # ChatML format (the standard chat format for Qwen)
    prompt = _build_chatml_prompt(message, history)
    stream = llm.create_completion(
        prompt,
        max_tokens=2048,
        stop=["<|im_end|>"],  # stop at the end of the assistant turn
        stream=True,
        temperature=0.7,
        top_p=0.9,
    )
    partial_text = ""
    for output in stream:
        partial_text += output["choices"][0]["text"]
        yield partial_text
# Chat UI that routes every user message (and the running history) through
# generate_pro and displays the text it yields.
chat_interface = gr.ChatInterface(
    fn=generate_pro,
    title="🌟 Lumin Pro (Qwen 14B)",
    description="Running Qwen2.5-14B-Instruct (GGUF). Balanced Power.",
)

if __name__ == "__main__":
    # Bind to all network interfaces on port 7860 so the UI is reachable from
    # outside the host/container, not only from localhost.
    chat_interface.launch(server_name="0.0.0.0", server_port=7860)