"""Gradio chat UI that serves Qwen2.5-14B-Instruct (GGUF) via llama-cpp-python.

Importing this module downloads the model file from the Hugging Face Hub and
loads it into memory; running it as a script additionally starts the web UI.
"""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# PRO: Qwen 2.5 14B Instruct (GGUF) - Q4_K_M
# Fits in 16 GB of RAM (tight, but it works).
REPO_ID = "bartowski/Qwen2.5-14B-Instruct-GGUF"
FILENAME = "Qwen2.5-14B-Instruct-Q4_K_M.gguf"

N_CTX = 8192  # Moderate context window, chosen to limit RAM use.
N_THREADS = 2  # CPU friendly.
MAX_NEW_TOKENS = 2048  # Upper bound on tokens generated per reply.

# Roles that are rendered into the ChatML prompt; anything else is ignored.
_CHATML_ROLES = ("system", "user", "assistant")

print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

print("Loading model...")
llm = Llama(
    model_path=model_path,
    n_ctx=N_CTX,
    n_threads=N_THREADS,
    verbose=False,
)


def _as_text(content):
    """Flatten a chat message payload into plain text.

    Plain strings are returned unchanged. Dicts contribute their ``"text"``
    value (empty if absent), lists/tuples are flattened recursively, and
    ``None`` becomes an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        return str(content.get("text") or "")
    if isinstance(content, (list, tuple)):
        return "".join(_as_text(part) for part in content)
    return "" if content is None else str(content)


def _chatml(role, text):
    """Render one completed ChatML turn for ``role``."""
    return f"<|im_start|>{role}\n{text}<|im_end|>\n"


def _normalize_history(history):
    """Convert chat history into a list of ``(role, text)`` pairs, oldest first.

    Accepts both history layouts: a list of ``(user, assistant)`` pairs and a
    list of ``{"role": ..., "content": ...}`` dicts.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            if role in _CHATML_ROLES:
                messages.append((role, _as_text(entry.get("content"))))
            continue
        user_msg, bot_msg = entry
        # A missing half of a pair must not be rendered as the text "None".
        if user_msg is not None:
            messages.append(("user", _as_text(user_msg)))
        if bot_msg is not None:
            messages.append(("assistant", _as_text(bot_msg)))
    return messages


def _token_count(text):
    """Return how many tokens ``text`` occupies, parsing ChatML markers as special tokens."""
    return len(llm.tokenize(text.encode("utf-8"), special=True))


def _build_prompt(message, history):
    """Assemble the ChatML prompt (Qwen's standard format) for the next reply.

    The oldest history messages are dropped until the prompt leaves room for
    ``MAX_NEW_TOKENS`` of output inside the ``N_CTX`` window. The new user
    message itself is never trimmed.
    """
    messages = _normalize_history(history)
    tail = _chatml("user", _as_text(message)) + "<|im_start|>assistant\n"
    budget = N_CTX - MAX_NEW_TOKENS

    def render():
        return "".join(_chatml(role, text) for role, text in messages) + tail

    prompt = render()
    while messages and _token_count(prompt) > budget:
        del messages[0]
        # Do not let the trimmed conversation open with an orphaned reply.
        while messages and messages[0][0] == "assistant":
            del messages[0]
        prompt = render()
    return prompt


def generate_pro(message, history):
    """Stream the assistant's reply to ``message``.

    Args:
        message: The new user message.
        history: Previous conversation turns, either as ``(user, assistant)``
            pairs or as ``{"role", "content"}`` dicts.

    Yields:
        The reply accumulated so far, growing with every generated chunk.
    """
    stream = llm.create_completion(
        _build_prompt(message, history),
        max_tokens=MAX_NEW_TOKENS,
        stop=["<|im_end|>"],
        stream=True,
        temperature=0.7,
        top_p=0.9,
    )

    partial_text = ""
    for output in stream:
        partial_text += output["choices"][0]["text"]
        yield partial_text


chat_interface = gr.ChatInterface(
    fn=generate_pro,
    title="🌟 Lumin Pro (Qwen 14B)",
    description="Running Qwen2.5-14B-Instruct (GGUF). Balanced Power.",
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    chat_interface.launch(server_name="0.0.0.0", server_port=7860)