import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# PRO: Qwen 2.5 14B Instruct (GGUF) - Q4_K_M
# Fits in 16 GB of RAM (tight, but it works)
REPO_ID = "bartowski/Qwen2.5-14B-Instruct-GGUF"
FILENAME = "Qwen2.5-14B-Instruct-Q4_K_M.gguf"
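# NOTE: the Q4_K_M quantization of a 14B model is roughly 9 GB on disk
# (approximate figure), so the first download can take a while.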

print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
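# hf_hub_download caches the file locally, so later launches skip the download.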

print("Loading model...")
llm = Llama(
    model_path=model_path,
    n_ctx=8192,         # Moderate context window to keep RAM usage in check
    n_threads=2,        # Few threads to stay CPU-friendly
    verbose=False
)
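
# llama.cpp memory-maps the GGUF weights by default (use_mmap=True), so
# "loading" returns quickly while pages are actually faulted in on first use.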

def generate_pro(message, history):
    # Build the prompt in ChatML format (Qwen's standard); history arrives as
    # (user, assistant) pairs in Gradio's default tuple format
    prompt = ""
    for user_msg, bot_msg in history:
        prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{bot_msg}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    stream = llm.create_completion(
        prompt,
        max_tokens=2048,
        stop=["<|im_end|>"],
        stream=True,
        temperature=0.7,
        top_p=0.9
    )

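    # create_completion with stream=True yields chunks whose new text is in
    # choices[0]['text']; Gradio expects the cumulative reply on each yield,
    # so we accumulate before yielding.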
    partial_text = ""
    for output in stream:
        delta = output['choices'][0]['text']
        partial_text += delta
        yield partial_text
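
# Quick smoke test without the UI (hypothetical usage): the generator yields
# cumulative text, so the last value is the complete reply.
#   reply = ""
#   for reply in generate_pro("Hello!", []):
#       pass
#   print(reply)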

chat_interface = gr.ChatInterface(
    fn=generate_pro,
    title="🌟 Lumin Pro (Qwen 14B)",
    description="Running Qwen2.5-14B-Instruct (GGUF). Balanced Power.",
)

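# Binding to 0.0.0.0 on port 7860 makes the app reachable from outside the
# container, the standard setup for a Hugging Face Space.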
if __name__ == "__main__":
    chat_interface.launch(server_name="0.0.0.0", server_port=7860)