import os
from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "novapixelentretaiment/Lumin-Haiku-4-Instruct"
# Filename corrected to match the official name of the GGUF file in the HF repo
gguf_file = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"
token = os.environ.get("HF_TOKEN")

print("Loading Lumin Nano 2.1 (GGUF Optimized)...")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=gguf_file,
    token=token,
    torch_dtype=torch.float32,  # CPU friendly
    trust_remote_code=True,
)


def parse_thought(text):
    """Relabel the model's <think>...</think> block as 'Pensamiento' / 'Respuesta'."""
    if "<think>" in text:
        if "</think>" in text:
            parts = text.split("</think>")
            return f"Pensamiento: {parts[0].replace('<think>', '').strip()}\n\nRespuesta: {parts[1].strip()}"
        # Thought block not closed yet: show the partial thought while streaming.
        return f"Pensamiento: {text.replace('<think>', '').strip()}"
    return text


def chat_stream(message, history, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message}]
    # History arrives as (user, assistant) pairs in Gradio's tuple format.
    for h in history:
        if h[0]:
            messages.append({"role": "user", "content": h[0]})
        if h[1]:
            messages.append({"role": "assistant", "content": h[1]})
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,  # required; temperature/top_p are ignored under greedy decoding
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|im_end|>")],
    )

    # Generate in a background thread so tokens can be streamed as they arrive.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        # Trim at the ChatML end-of-turn marker in case it leaks into the stream.
        if "<|im_end|>" in partial_text:
            partial_text = partial_text.split("<|im_end|>")[0]
            yield parse_thought(partial_text)
            break
        yield parse_thought(partial_text)


with gr.Blocks(title="Lumin Nano 2.1") as demo:
    gr.Markdown("Lumin Nano 2.1 - Spanish Only")
    gr.ChatInterface(
        chat_stream,
        additional_inputs=[
            gr.Textbox(
                value=(
                    "Eres Lumin Nano 2.1. UNICAMENTE puedes pensar y responder en ESPAÑOL. "
                    "Tienes PROHIBIDO usar el inglés. Sé directo, conciso y nunca uses emojis."
                ),
                label="System Message",
            ),
            gr.Slider(1, 1024, 256, label="Max Tokens"),
            gr.Slider(0.01, 1.0, 0.1, label="Temperature"),
            gr.Slider(0.1, 1.0, 0.9, label="Top-p"),
        ],
    )

if __name__ == "__main__":
    demo.launch()
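
# Usage sketch (an assumption, not from the original repo docs): run this file
# as a Hugging Face Space entrypoint or locally with
#
#   pip install gradio torch transformers gguf   # `gguf` backs transformers' GGUF loading
#   HF_TOKEN=<your-token> python app.py
#
# HF_TOKEN is only needed if the model repo is gated or private.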