import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# ==========================================
# 🚀 LUMIN CODE: 3B SMART (Balanced)
# ==========================================
# The sweet spot: smarter than 1.5B, but still runs on CPU.
# Qwen 2.5 Coder 3B

# 1. "SMART" MODEL (3B)
REPO_ID = "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"

print(f"⬇️ Downloading {FILENAME}...")
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

print("🚀 Loading model into RAM...")
llm = Llama(
    model_path=model_path,
    n_ctx=16384,       # ✅ 16k context window
    n_threads=2,       # ✅ 2 CPU cores
    n_batch=512,       # ⚠️ Smaller batch to leave headroom for the 3B's logic
    f16_kv=False,
    flash_attn=False,  # OFF for CPU safety
    verbose=False
)

# 2. LOGIC
def generate_code(message, history):
    if history is None:
        history = []

    # Standard Qwen (ChatML) prompt
    prompt = (
        "<|im_start|>system\n"
        "You are Lumin Code, a programming expert. Think step by step and give correct solutions.<|im_end|>\n"
    )
    for item in history:
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            u, b = item[0], item[1]
            if u:
                prompt += f"<|im_start|>user\n{u}<|im_end|>\n"
            if b:
                prompt += f"<|im_start|>assistant\n{b}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    stream = llm.create_completion(
        prompt,
        max_tokens=2048,
        temperature=0.3,
        stream=True,
        stop=["<|im_end|>", "<|endoftext|>"]
    )

    partial_text = ""
    for output in stream:
        token = output["choices"][0]["text"]
        partial_text += token
        yield partial_text

# 3. INTERFACE (Blocks API)
with gr.Blocks(title="Lumin Code 3B Smart") as demo:
    gr.Markdown("# Lumin Code (3B Smart)\nSpace")
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(placeholder="Type here...")
    clear = gr.Button("Clear")

    # Visible chat
    def user(u, h):
        # Append the new user turn with an empty bot slot, then clear the textbox
        return "", h + [[u, None]]

    def bot(h):
        u = h[-1][0]
        h_prev = h[:-1]
        p = ""
        for chunk in generate_code(u, h_prev):
            p = chunk
            h[-1][1] = p
            yield h

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(bot, [chatbot], [chatbot])
    clear.click(lambda: None, None, chatbot, queue=False)

    # Hidden "chat" API
    api_msg = gr.Textbox(visible=False)
    api_hist = gr.State()
    api_out = gr.Textbox(visible=False)
    btn = gr.Button("API", visible=False)
    btn.click(fn=generate_code, inputs=[api_msg, api_hist], outputs=[api_out], api_name="chat")

if __name__ == "__main__":
    demo.queue().launch(server_name="0.0.0.0", server_port=7860)
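
# ------------------------------------------------------------------
# Usage sketch (assumption, not part of the app): one way to call the
# hidden "chat" endpoint from a separate process with gradio_client.
# Kept commented out so it never executes inside the Space itself.
# Notes on the assumptions:
#   - "http://localhost:7860/" assumes the server above is running
#     locally; swap in the Space URL once deployed.
#   - Because generate_code is a generator, client.predict() returns
#     the final streamed chunk; client.submit() yields intermediates.
#   - Depending on the Gradio version, the gr.State history input is
#     either managed automatically or must be passed explicitly
#     (e.g. as None) before the api_name keyword.
#
# from gradio_client import Client
#
# client = Client("http://localhost:7860/")
# reply = client.predict("Write FizzBuzz in Python", api_name="/chat")
# print(reply)
# ------------------------------------------------------------------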