roneymatusp committed
Commit 1a5bc09 · verified · 1 Parent(s): 5eda67f

Upload app.py

Files changed (1):
  1. app.py +87 -59
app.py CHANGED
@@ -1,93 +1,121 @@
 import os
-import torch
 import gradio as gr
+import torch
+import spaces
+
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
-import spaces
+from huggingface_hub import login

+# --------- Config via Variables/Secrets ---------
 BASE_ID = os.getenv("BASE_ID", "mistralai/Mistral-7B-v0.1")
 ADAPTER_ID = os.getenv("ADAPTER_ID", "roneymatusp/british-optimizer-mistral-final")
 HF_TOKEN = os.getenv("HF_TOKEN")

-SYSTEM_PROMPT = (
-    "You are Paulean AI, a British educator. Be concise, courteous, and academically precise. "
-    "Use UK spelling and classroom vocabulary common in British schools."
-)
-
-# Lazy cache (only loads when a GPU is available)
-_tok = None
+if HF_TOKEN:
+    try:
+        login(HF_TOKEN)
+    except Exception:
+        pass
+
+# --------- Lazy globals (loaded only when needed) ---------
+_tokenizer = None
 _model = None

-def _get_model():
-    global _tok, _model
-    if _tok is None or _model is None:
-        compute_dtype = torch.bfloat16
-        bnb = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_compute_dtype=compute_dtype,
-        )
-
-        _tok = AutoTokenizer.from_pretrained(
-            BASE_ID, use_fast=True, token=HF_TOKEN
-        )
-
-        base = AutoModelForCausalLM.from_pretrained(
-            BASE_ID,
-            torch_dtype=compute_dtype,
-            device_map="auto",
-            quantization_config=bnb,
-            token=HF_TOKEN,
-        )
-        _model = PeftModel.from_pretrained(base, ADAPTER_ID, token=HF_TOKEN)
-
-    return _tok, _model
-
-def _build_prompt(message: str, history):
-    text = SYSTEM_PROMPT + "\n\n"
-    if history:
-        for user, bot in history:
-            text += f"User: {user}\nAssistant: {bot}\n"
-    text += f"User: {message}\nAssistant:"
-    return text
-
-@spaces.GPU(duration=120)  # requests a GPU on demand on ZeroGPU
+def _load_model():
+    """
+    Load the base model + LoRA adapter in 4-bit (when a GPU is available) and cache them.
+    On ZeroGPU this load happens INSIDE the function decorated with @spaces.GPU.
+    On a dedicated GPU it also works, and the weights stay in VRAM.
+    """
+    global _tokenizer, _model
+    if _model is not None and _tokenizer is not None:
+        return _tokenizer, _model
+
+    bnb = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+
+    _tokenizer = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)
+    base = AutoModelForCausalLM.from_pretrained(
+        BASE_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        quantization_config=bnb,
+    )
+
+    _model = PeftModel.from_pretrained(base, ADAPTER_ID)
+    _model.eval()
+    return _tokenizer, _model
+
+SYSTEM_PROMPT = (
+    "You are a British educator. Be concise, courteous, and academically precise. "
+    "Prefer UK spelling and classroom vocabulary used in British schools."
+)
+
+def _build_prompt(history_messages, user_message):
+    # history_messages: list of {"role": ..., "content": ...} dicts,
+    # the format ChatInterface passes to fn when type="messages".
+    lines = [SYSTEM_PROMPT, ""]
+    for msg in history_messages:
+        role = "User" if msg["role"] == "user" else "Assistant"
+        if msg.get("content"):
+            lines.append(f"{role}: {msg['content']}")
+    lines.append(f"User: {user_message}")
+    lines.append("Assistant:")
+    return "\n".join(lines)
+
+# --------- Response function (GPU on demand / ZeroGPU) ---------
+@spaces.GPU(duration=120)  # ignored when the hardware is not ZeroGPU
 def respond(message, history):
-    tok, model = _get_model()
-    prompt = _build_prompt(message, history)
-
+    """
+    ChatInterface (type="messages") calls this with:
+      - message: str
+      - history: list of {"role": ..., "content": ...} dicts
+    Returns the assistant's reply as a str.
+    """
+    tok, model = _load_model()
+
+    prompt = _build_prompt(history, message)
     inputs = tok(prompt, return_tensors="pt").to(model.device)
-    with torch.inference_mode():
+
+    with torch.no_grad():
         out = model.generate(
             **inputs,
+            max_new_tokens=256,
             do_sample=True,
             temperature=0.7,
-            max_new_tokens=220,
-            repetition_penalty=1.1,
+            top_p=0.95,
             pad_token_id=tok.eos_token_id,
         )

     text = tok.decode(out[0], skip_special_tokens=True)
-    answer = text.split("Assistant:")[-1].strip()
-    return answer
-
-# Build the UI using Gradio's high-level ChatInterface. This automatically
-# handles conversation state and displays the chat history. We specify
-# custom labels for the submit and clear buttons to localise them for
-# Portuguese-speaking teachers.
-# Instantiate the ChatInterface. Note: Gradio versions prior to 4.42
-# do not support a `clear_btn` keyword argument, so only the
-# `submit_btn` label is customised here. The default clear button
-# provided by ChatInterface will remain in English.
+
+    # Keep only the text after the LAST "Assistant:" marker (the newly generated turn).
+    if "Assistant:" in text:
+        text = text.rsplit("Assistant:", 1)[-1].strip()
+
+    return text
+
+# --------- Gradio UI ---------
 demo = gr.ChatInterface(
     fn=respond,
-    title="Paulean AI British Prompt Optimizer",
-    description="Escreva sua pergunta/prompt. O modelo responde em estilo British 🇬🇧.",
+    type="messages",  # openai-style message dicts for fn and history
+    title="Paulean AI British Prompt Optimiser",
+    description=(
+        "Demo escolar (Mistral-7B + LoRA). Evite dados sensíveis. "
+        "Em ZeroGPU a primeira resposta pode demorar para carregar os pesos."
+    ),
+    chatbot=gr.Chatbot(height=480, show_copy_button=True, label="Chat", type="messages"),
+    textbox=gr.Textbox(placeholder="Escreva sua pergunta…", label="Mensagem"),
     submit_btn="Enviar",
+    retry_btn="Refazer",
+    undo_btn="Voltar",
+    clear_btn="Limpar",  # clear_btn takes a label string, not a bool
 )

-# Launch the demo when run directly. Queuing is enabled to properly
-# manage GPU allocations on ZeroGPU.
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.launch()
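
For reference, a minimal sketch of the transcript that the new `_build_prompt` assembles for a short `type="messages"` history; the example messages are hypothetical, and it assumes the file is saved as app.py with its dependencies installed:

from app import _build_prompt

history = [
    {"role": "user", "content": "How should I phrase a maths prompt?"},
    {"role": "assistant", "content": "State the year group, topic, and desired format."},
]
print(_build_prompt(history, "Please optimise: 'explain fractions'"))
# Prints the system prompt, a blank line, then:
# User: How should I phrase a maths prompt?
# Assistant: State the year group, topic, and desired format.
# User: Please optimise: 'explain fractions'
# Assistant: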
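
The 4-bit NF4 configuration is what lets a 7B model fit ZeroGPU's on-demand slices. Back-of-envelope weight-memory arithmetic, with the parameter count approximate and KV cache plus activations excluded:

params = 7.2e9  # Mistral-7B has roughly 7.2B parameters
print(f"bf16 weights: {params * 2 / 1e9:.1f} GB")    # ~14.4 GB at 2 bytes/param
print(f"nf4 weights:  {params * 0.5 / 1e9:.1f} GB")  # ~3.6 GB at 4 bits/param, plus quantisation overhead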
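
A hedged local smoke test for the full generation path, assuming a CUDA GPU, bitsandbytes installed, and access to both model repos; the first call is slow because it triggers the 4-bit load:

from app import respond

# Per the commit's own comment, @spaces.GPU is ignored off ZeroGPU, so this runs directly.
reply = respond("Please improve this prompt: 'explain photosynthesis'", [])
print(reply)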