import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# PRO tier: Qwen 2.5 14B Instruct (GGUF), Q4_K_M quantization.
# Fits in 16 GB of RAM (tight, but it works).
REPO_ID = "bartowski/Qwen2.5-14B-Instruct-GGUF"
FILENAME = "Qwen2.5-14B-Instruct-Q4_K_M.gguf"

# Resolve the weights file through the Hugging Face Hub; the returned value
# is handed to Llama below as the model path.
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)

# Single shared model instance used by the chat handler.
print("Loading model...")
llm = Llama(
    verbose=False,
    n_threads=2,    # keep the thread count low to stay CPU-friendly
    n_ctx=8192,     # moderate context window because of RAM limits
    model_path=model_path,
)

def _message_text(content):
    """Return the text of one chat payload, or None when there is none.

    Strings are returned unchanged. A dict contributes its ``"text"`` entry,
    a list contributes the concatenated text of its items, and anything else
    falls back to ``str()``.
    """
    if content is None or isinstance(content, str):
        return content
    if isinstance(content, dict):
        # NOTE(review): assumes structured payloads carry their text under
        # a "text" key -- confirm against the installed Gradio version.
        text = content.get("text")
        return text if isinstance(text, str) else None
    if isinstance(content, list):
        pieces = [_message_text(part) for part in content]
        return "".join(piece for piece in pieces if piece is not None)
    return str(content)


def _history_turns(history):
    """Yield ``(role, text)`` for every past message that carries text.

    Two history layouts are accepted:
      * pairs: ``[(user_msg, bot_msg), ...]``
      * messages: ``[{"role": ..., "content": ...}, ...]``
    Messages whose text is missing (``None``) are skipped instead of being
    rendered as the literal string "None".
    """
    for entry in history or ():
        if isinstance(entry, dict):
            role = entry.get("role")
            text = _message_text(entry.get("content"))
            if role in ("system", "user", "assistant") and text is not None:
                yield role, text
        else:
            user_msg, bot_msg = entry
            for role, content in (("user", user_msg), ("assistant", bot_msg)):
                text = _message_text(content)
                if text is not None:
                    yield role, text


def _build_chatml_prompt(message, history):
    """Render the past turns plus the new user message as a ChatML prompt."""
    segments = [
        f"<|im_start|>{role}\n{text}<|im_end|>\n"
        for role, text in _history_turns(history)
    ]
    new_text = _message_text(message)
    if new_text is None:
        new_text = ""
    # Leave the assistant turn open so the model continues from here.
    segments.append(
        f"<|im_start|>user\n{new_text}<|im_end|>\n<|im_start|>assistant\n"
    )
    return "".join(segments)


def generate_pro(message, history):
    """Stream the model's reply to ``message`` given the chat ``history``.

    Args:
        message: The new user message (normally a plain string).
        history: Previous turns, either as ``(user, bot)`` pairs or as
            ``{"role", "content"}`` dicts; which one arrives depends on the
            chat UI configuration, so both are handled.

    Yields:
        The reply accumulated so far, growing with every streamed chunk.
    """
    # ChatML format (the standard prompt format for Qwen).
    prompt = _build_chatml_prompt(message, history)

    stream = llm.create_completion(
        prompt,
        max_tokens=2048,
        stop=["<|im_end|>"],
        stream=True,
        temperature=0.7,
        top_p=0.9
    )

    partial_text = ""
    for output in stream:
        partial_text += output['choices'][0]['text']
        # Yield the cumulative text, not just the newest chunk.
        yield partial_text

# Chat UI that forwards each user message to generate_pro.
chat_interface = gr.ChatInterface(
    generate_pro,
    description="Running Qwen2.5-14B-Instruct (GGUF). Balanced Power.",
    title="🌟 Lumin Pro (Qwen 14B)",
)

if __name__ == "__main__":
    # Bind to every network interface on port 7860 when run as a script.
    chat_interface.launch(server_port=7860, server_name="0.0.0.0")