DavidBazaldua committed on
Commit
a48fbd7
verified
1 Parent(s): b6f0734

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -17
app.py CHANGED
@@ -8,19 +8,29 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
8
 
9
  MODEL_ID = "DavidBazaldua/llama3_finetuned_transformes"
10
 
11
- DEVICE = "cpu" # Space on CPU
12
- DTYPE = torch.float32 # safer on CPU
13
 
 
 
 
 
14
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
15
 
16
  model = AutoModelForCausalLM.from_pretrained(
17
  MODEL_ID,
18
  torch_dtype=DTYPE,
19
- low_cpu_mem_usage=True, # intenta optimizar memoria
20
  )
21
  model.to(DEVICE)
22
- model.eval() # modo evaluación
23
- torch.set_num_threads(2) # limitar número de threads (ajústalo si quieres)
 
 
 
 
 
 
24
  # ---------------------------------------------------------------------
25
  # Prompt building (using the chat template from the tokenizer)
26
  # ---------------------------------------------------------------------
@@ -75,6 +85,9 @@ def generate_answer(system_prompt, context, message, history, max_tokens, temper
75
  if not system_prompt or system_prompt.strip() == "":
76
  system_prompt = DEFAULT_SYSTEM_PROMPT
77
 
 
 
 
78
  prompt = build_prompt(system_prompt, context, history, message)
79
 
80
  inputs = tokenizer(
@@ -86,21 +99,22 @@ def generate_answer(system_prompt, context, message, history, max_tokens, temper
86
  with torch.no_grad():
87
  output_tokens = model.generate(
88
  **inputs,
89
- max_new_tokens=int(max_tokens),
90
  do_sample=True,
91
  temperature=float(temperature),
92
  top_p=float(top_p),
93
  pad_token_id=tokenizer.eos_token_id,
94
  )
95
 
96
- generated = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
97
- # Since we used the chat template, the decode already gives us the full conversation
98
- # but we only need the new assistant segment. For simplicity, take everything after the last user message.
99
- answer = generated[len(prompt):].strip()
100
 
101
- if not answer:
102
- # Fallback if slicing fails for any reason
103
- answer = generated.strip()
 
 
 
104
 
105
  history = history + [[message, answer]]
106
  return answer, history
@@ -120,6 +134,7 @@ def chat(message, history, system_prompt, context, max_tokens, temperature, top_
120
  top_p=top_p,
121
  )
122
 
 
123
  return "", updated_history
124
 
125
 
@@ -164,10 +179,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
164
 
165
  max_tokens_slider = gr.Slider(
166
  label="Max new tokens",
167
- minimum=64,
168
- maximum=512,
169
- value=256,
170
- step=32,
171
  )
172
  temperature_slider = gr.Slider(
173
  label="Temperature",
@@ -218,3 +233,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
218
 
219
  if __name__ == "__main__":
220
  demo.launch()
 
 
8
 
9
  MODEL_ID = "DavidBazaldua/llama3_finetuned_transformes"
10
 
11
+ DEVICE = "cpu" # Space on CPU
12
+ DTYPE = torch.float32 # safer on CPU
13
 
14
+ # Limit CPU threads (you can try 1, 2, 4, etc. depending on performance)
15
+ torch.set_num_threads(2)
16
+
17
+ # Load tokenizer and model
18
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
19
 
20
  model = AutoModelForCausalLM.from_pretrained(
21
  MODEL_ID,
22
  torch_dtype=DTYPE,
23
+ low_cpu_mem_usage=True, # optimize memory usage on CPU
24
  )
25
  model.to(DEVICE)
26
+ model.eval()
27
+
28
+ DEFAULT_SYSTEM_PROMPT = (
29
+ "You are a helpful, knowledgeable, and professional AI assistant. "
30
+ "You respond in English unless the user explicitly requests another language. "
31
+ "Provide clear, concise answers and reason step by step when it is useful."
32
+ )
33
+
34
  # ---------------------------------------------------------------------
35
  # Prompt building (using the chat template from the tokenizer)
36
  # ---------------------------------------------------------------------
 
85
  if not system_prompt or system_prompt.strip() == "":
86
  system_prompt = DEFAULT_SYSTEM_PROMPT
87
 
88
+ # Hard cap for safety on CPU
89
+ max_tokens = int(min(max_tokens, 128))
90
+
91
  prompt = build_prompt(system_prompt, context, history, message)
92
 
93
  inputs = tokenizer(
 
99
  with torch.no_grad():
100
  output_tokens = model.generate(
101
  **inputs,
102
+ max_new_tokens=max_tokens,
103
  do_sample=True,
104
  temperature=float(temperature),
105
  top_p=float(top_p),
106
  pad_token_id=tokenizer.eos_token_id,
107
  )
108
 
109
+ # Decode full text (prompt + completion)
110
+ full_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
 
 
111
 
112
+ # Try to extract only the assistant's new part
113
+ if full_text.startswith(prompt):
114
+ answer = full_text[len(prompt):].strip()
115
+ else:
116
+ # Fallback if for some reason the decoded text does not start with prompt
117
+ answer = full_text.strip()
118
 
119
  history = history + [[message, answer]]
120
  return answer, history
 
134
  top_p=top_p,
135
  )
136
 
137
+ # Return empty input and updated history for the Chatbot
138
  return "", updated_history
139
 
140
 
 
179
 
180
  max_tokens_slider = gr.Slider(
181
  label="Max new tokens",
182
+ minimum=32,
183
+ maximum=256,
184
+ value=128,
185
+ step=16,
186
  )
187
  temperature_slider = gr.Slider(
188
  label="Temperature",
 
233
 
234
  if __name__ == "__main__":
235
  demo.launch()
236
+