roneymatusp committed
Commit 1a5bc09 · verified · 1 Parent(s): 5eda67f

Upload app.py

Files changed (1):
  1. app.py +87 -59
app.py CHANGED
@@ -1,93 +1,121 @@
 import os
-import torch
 import gradio as gr
+import torch
+import spaces
+
 from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 from peft import PeftModel
-import spaces
+from huggingface_hub import login

+# --------- Config via Variables/Secrets ---------
 BASE_ID = os.getenv("BASE_ID", "mistralai/Mistral-7B-v0.1")
 ADAPTER_ID = os.getenv("ADAPTER_ID", "roneymatusp/british-optimizer-mistral-final")
 HF_TOKEN = os.getenv("HF_TOKEN")

-SYSTEM_PROMPT = (
-    "You are Paulean AI, a British educator. Be concise, courteous, and academically precise. "
-    "Use UK spelling and classroom vocabulary common in British schools."
-)
-
-# Lazy cache (only loads when a GPU is available)
-_tok = None
+if HF_TOKEN:
+    try:
+        login(HF_TOKEN)
+    except Exception:
+        pass
+
+# --------- Lazy globals (loaded only when needed) ---------
+_tokenizer = None
 _model = None

-def _get_model():
-    global _tok, _model
-    if _tok is None or _model is None:
-        compute_dtype = torch.bfloat16
-        bnb = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_compute_dtype=compute_dtype,
-        )
-
-        _tok = AutoTokenizer.from_pretrained(
-            BASE_ID, use_fast=True, token=HF_TOKEN
-        )
-
-        base = AutoModelForCausalLM.from_pretrained(
-            BASE_ID,
-            torch_dtype=compute_dtype,
-            device_map="auto",
-            quantization_config=bnb,
-            token=HF_TOKEN,
-        )
-        _model = PeftModel.from_pretrained(base, ADAPTER_ID, token=HF_TOKEN)
-
-    return _tok, _model
-
-def _build_prompt(message: str, history):
-    text = SYSTEM_PROMPT + "\n\n"
-    if history:
-        for user, bot in history:
-            text += f"User: {user}\nAssistant: {bot}\n"
-    text += f"User: {message}\nAssistant:"
-    return text
-
-@spaces.GPU(duration=120)  # requests a GPU on demand on ZeroGPU
+def _load_model():
+    """
+    Load the base model + LoRA adapter in 4-bit (when a GPU is available) and cache them.
+    On ZeroGPU this load happens INSIDE the function decorated with @spaces.GPU.
+    On a dedicated GPU it also works, and the weights stay in VRAM.
+    """
+    global _tokenizer, _model
+    if _model is not None and _tokenizer is not None:
+        return _tokenizer, _model
+
+    bnb = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+
+    _tokenizer = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)
+    base = AutoModelForCausalLM.from_pretrained(
+        BASE_ID,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        quantization_config=bnb,
+    )
+
+    _model = PeftModel.from_pretrained(base, ADAPTER_ID)
+    _model.eval()
+    return _tokenizer, _model
+
+SYSTEM_PROMPT = (
+    "You are a British educator. Be concise, courteous, and academically precise. "
+    "Prefer UK spelling and classroom vocabulary used in British schools."
+)
+
+def _build_prompt(history_messages, user_message):
+    # history_messages: list of {"role": ..., "content": ...} dicts,
+    # the format ChatInterface passes to fn when type="messages".
+    lines = [SYSTEM_PROMPT, ""]
+    for msg in history_messages:
+        role = "User" if msg["role"] == "user" else "Assistant"
+        if msg.get("content"):
+            lines.append(f"{role}: {msg['content']}")
+    lines.append(f"User: {user_message}")
+    lines.append("Assistant:")
+    return "\n".join(lines)
+
+# --------- Response function (GPU on demand / ZeroGPU) ---------
+@spaces.GPU(duration=120)  # ignored when the hardware is not ZeroGPU
 def respond(message, history):
-    tok, model = _get_model()
-    prompt = _build_prompt(message, history)
-
+    """
+    ChatInterface (type="messages") calls this with:
+      - message: str
+      - history: list of {"role": ..., "content": ...} dicts
+    Returns the assistant's reply as a str.
+    """
+    tok, model = _load_model()
+
+    prompt = _build_prompt(history, message)
     inputs = tok(prompt, return_tensors="pt").to(model.device)
-    with torch.inference_mode():
+
+    with torch.no_grad():
         out = model.generate(
             **inputs,
+            max_new_tokens=256,
             do_sample=True,
             temperature=0.7,
-            max_new_tokens=220,
-            repetition_penalty=1.1,
+            top_p=0.95,
             pad_token_id=tok.eos_token_id,
         )

     text = tok.decode(out[0], skip_special_tokens=True)
-    answer = text.split("Assistant:")[-1].strip()
-    return answer
-
-# Build the UI using Gradio's high-level ChatInterface. This automatically
-# handles conversation state and displays the chat history. We specify
-# custom labels for the submit and clear buttons to localise them for
-# Portuguese-speaking teachers.
-# Instantiate the ChatInterface. Note: Gradio versions prior to 4.42
-# do not support a `clear_btn` keyword argument, so only the
-# `submit_btn` label is customised here. The default clear button
-# provided by ChatInterface will remain in English.
+
+    # Keep only the text after the LAST "Assistant:" marker (the newly generated turn).
+    if "Assistant:" in text:
+        text = text.rsplit("Assistant:", 1)[-1].strip()
+
+    return text
+
+# --------- Gradio UI ---------
 demo = gr.ChatInterface(
     fn=respond,
-    title="Paulean AI British Prompt Optimizer",
-    description="Escreva sua pergunta/prompt. O modelo responde em estilo British 🇬🇧.",
+    type="messages",  # openai-style message dicts for fn and history
+    title="Paulean AI British Prompt Optimiser",
+    description=(
+        "Demo escolar (Mistral-7B + LoRA). Evite dados sensíveis. "
+        "Em ZeroGPU a primeira resposta pode demorar para carregar os pesos."
+    ),
+    chatbot=gr.Chatbot(height=480, show_copy_button=True, label="Chat", type="messages"),
+    textbox=gr.Textbox(placeholder="Escreva sua pergunta…", label="Mensagem"),
     submit_btn="Enviar",
+    retry_btn="Refazer",
+    undo_btn="Voltar",
+    clear_btn="Limpar",  # clear_btn takes a label string, not a bool
 )

-# Launch the demo when run directly. Queuing is enabled to properly
-# manage GPU allocations on ZeroGPU.
 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.launch()
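
For reference, a minimal sketch of the transcript that the new `_build_prompt` assembles for a short `type="messages"` history; the example messages are hypothetical, and it assumes the file is saved as app.py with its dependencies installed:

from app import _build_prompt

history = [
    {"role": "user", "content": "How should I phrase a maths prompt?"},
    {"role": "assistant", "content": "State the year group, topic, and desired format."},
]
print(_build_prompt(history, "Please optimise: 'explain fractions'"))
# Prints the system prompt, a blank line, then:
# User: How should I phrase a maths prompt?
# Assistant: State the year group, topic, and desired format.
# User: Please optimise: 'explain fractions'
# Assistant: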
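
The 4-bit NF4 configuration is what lets a 7B model fit ZeroGPU's on-demand slices. Back-of-envelope weight-memory arithmetic, with the parameter count approximate and KV cache plus activations excluded:

params = 7.2e9  # Mistral-7B has roughly 7.2B parameters
print(f"bf16 weights: {params * 2 / 1e9:.1f} GB")    # ~14.4 GB at 2 bytes/param
print(f"nf4 weights:  {params * 0.5 / 1e9:.1f} GB")  # ~3.6 GB at 4 bits/param, plus quantisation overhead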
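
A hedged local smoke test for the full generation path, assuming a CUDA GPU, bitsandbytes installed, and access to both model repos; the first call is slow because it triggers the 4-bit load:

from app import respond

# Per the commit's own comment, @spaces.GPU is ignored off ZeroGPU, so this runs directly.
reply = respond("Please improve this prompt: 'explain photosynthesis'", [])
print(reply)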