| import gradio as gr |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| from threading import Thread |
| import os |
|
|
# Hugging Face repo id that hosts the weights.
# NOTE(review): the repo name says "Lumin-Haiku-4-Instruct" but the GGUF file
# below is a Qwen2.5-Coder-3B quant — presumably a rebranded fine-tune; confirm.
model_id = "novapixelentretaiment/Lumin-Haiku-4-Instruct"

# Quantized weights file inside the repo (Q4_K_M GGUF quantization).
gguf_file = "qwen2.5-coder-3b-instruct-q4_k_m.gguf"
# Access token for gated/private repos; None when HF_TOKEN is unset.
token = os.environ.get("HF_TOKEN")


print("Cargando Lumin Nano 2.1 (GGUF Optimized)...")




# Tokenizer is loaded from the repo's regular tokenizer files (not the GGUF).
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token, trust_remote_code=True)




# Loading a GGUF through transformers dequantizes the weights to the requested
# dtype (float32 here), so this is intended to run on CPU — TODO confirm the
# Space has enough RAM for the dequantized 3B model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    gguf_file=gguf_file,
    token=token,
    torch_dtype=torch.float32,
    trust_remote_code=True
)
|
|
def parse_thought(text):
    """Format a (possibly partial) model response containing a <think> block.

    Returns *text* unchanged when no ``<think>`` tag is present. When the
    thought block is closed, labels both the thought and the answer (labels
    are in Spanish, matching the UI language). When the closing tag has not
    been streamed yet, shows only the partial thought.
    """
    if "<think>" not in text:
        return text
    if "</think>" in text:
        # partition (not split) so a literal "</think>" appearing again later
        # in the answer does not truncate everything after it.
        thought, _, answer = text.partition("</think>")
        return f"Pensamiento: {thought.replace('<think>', '').strip()}\n\nRespuesta: {answer.strip()}"
    # Closing tag not received yet (mid-stream): show the thought as it grows.
    return f"Pensamiento: {text.replace('<think>', '').strip()}"
|
|
def chat_stream(message, history, system_message, max_tokens, temperature, top_p):
    """Stream a chat completion for gr.ChatInterface.

    Builds the chat-template prompt from *history* + *message*, launches
    model.generate on a background thread, and yields progressively longer
    formatted strings (via parse_thought) as tokens arrive so the UI updates
    live.
    """
    messages = [{"role": "system", "content": system_message}]
    for h in history:
        # history arrives as (user, assistant) pairs; either side may be None.
        if h[0]:
            messages.append({"role": "user", "content": h[0]})
        if h[1]:
            messages.append({"role": "assistant", "content": h[1]})
    messages.append({"role": "user", "content": message})

    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # timeout guards the consumer loop below against a stalled generation.
    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        # Fix: without do_sample=True, generate() uses greedy decoding and
        # silently ignores temperature/top_p, so the UI sliders had no effect.
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
        # Stop on either the model's EOS or Qwen's end-of-turn marker.
        eos_token_id=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|im_end|>")],
    )

    # Generation runs in a worker thread; the streamer hands tokens to us here.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        # Defensive: if the end-of-turn marker leaks through despite
        # skip_special_tokens, trim it and stop streaming.
        if "<|im_end|>" in partial_text:
            partial_text = partial_text.split("<|im_end|>")[0]
            yield parse_thought(partial_text)
            break
        yield parse_thought(partial_text)
|
|
# --- Gradio UI ----------------------------------------------------------
with gr.Blocks(title="Lumin Nano 2.1") as demo:
    gr.Markdown("Lumin Nano 2.1 - Spanish Only")

    # ChatInterface consumes chat_stream's generator for live streaming; the
    # additional_inputs map 1:1 onto chat_stream's trailing parameters
    # (system_message, max_tokens, temperature, top_p).
    gr.ChatInterface(
        chat_stream,
        additional_inputs=[
            gr.Textbox(value="Eres Lumin Nano 2.1. UNICAMENTE puedes pensar y responder en ESPAÑOL. Tienes PROHIBIDO usar el inglés. Sé directo, conciso y nunca uses emojis.", label="System Message"),
            gr.Slider(1, 1024, 256, label="Max Tokens"),
            gr.Slider(0.01, 1.0, 0.1, label="Temperature"),
            gr.Slider(0.1, 1.0, 0.9, label="Top-p"),
        ],
    )


# Launch the web server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
|