| import gradio as gr |
| from transformers import pipeline, AutoTokenizer |
| import torch |
|
|
| |
# Lazily-created text-generation pipeline, shared by every call to predecir().
generador = None


def _cargar_generador():
    """Build the text-generation pipeline used by predecir().

    Loads the tokenizer and model for "edusc182/Gemma_2B" on CPU in bfloat16.
    """
    model_id = "edusc182/Gemma_2B"
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, clean_up_tokenization_spaces=False
    )
    return pipeline(
        "text-generation",
        model=model_id,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
    )


def _limite_respuesta(tokens_prompt, context_window=8192, reserved_for_response=2048):
    """Return how many new tokens may be generated for a prompt of this size.

    The result never exceeds ``reserved_for_response`` nor the space left in
    ``context_window`` after the prompt, and is 0 when the prompt already
    fills (or overflows) the window.
    """
    restante = context_window - tokens_prompt
    return max(0, min(restante, reserved_for_response))


def predecir(pregunta: str) -> str:
    """Answer a single user question with the chat model.

    Args:
        pregunta: The user's question as plain text.

    Returns:
        The generated answer with surrounding whitespace stripped, or an
        empty string when the question is empty or only whitespace.

    Raises:
        gr.Error: If the templated prompt leaves no room in the context
            window for any generated token.
    """
    global generador

    # Nothing to answer: skip the (expensive) model load and generation.
    if not pregunta or not pregunta.strip():
        return ""

    # Load the model on first use only; later calls reuse the cached pipeline.
    if generador is None:
        generador = _cargar_generador()

    tokenizer = generador.tokenizer
    chat = [{"role": "user", "content": pregunta}]
    prompt = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    # NOTE(review): the chat template may already emit a BOS token while
    # encode() and the pipeline add special tokens by default -- confirm
    # whether the prompt ends up with a duplicated BOS.
    tokens_prompt = len(tokenizer.encode(prompt))

    # Clamp the budget to the space really left in the window. A fixed
    # minimum here would let prompt + response overflow the context window
    # for very long prompts.
    final_max_new_tokens = _limite_respuesta(tokens_prompt)

    print(f"Tokens prompt: {tokens_prompt} | Límite respuesta: {final_max_new_tokens}")

    if final_max_new_tokens <= 0:
        raise gr.Error(
            "La pregunta es demasiado larga para la ventana de contexto del modelo."
        )

    resultados = generador(
        prompt,
        max_new_tokens=final_max_new_tokens,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_full_text=False,  # return only the completion, not the prompt
    )

    return resultados[0]['generated_text'].strip()
|
|
# Input and output widgets for the single-question form.
caja_pregunta = gr.Textbox(lines=2, placeholder="Hola Gemma, ¿cómo estás?")
caja_respuesta = gr.Textbox(label="Respuesta")

# Wire predecir() to the widgets and start the web app.
iface = gr.Interface(
    fn=predecir,
    inputs=caja_pregunta,
    outputs=caja_respuesta,
    title="Asistente Gemma 2B",
    description="Asistente optimizado con plantillas de chat.",
)
iface.launch()