daniel-dona committed
Commit fd78eab · verified · 1 Parent(s): 9327797

Update app.py

Files changed (1): app.py +40 -4
app.py CHANGED
@@ -2,9 +2,9 @@ import spaces
 import gradio as gr
 from transformers import pipeline
 
-model = "daniel-dona/gemma-3-270m-it"
+model_name = "daniel-dona/gemma-3-270m-it"
 
-pipe = pipeline("text-generation", model=model, device="cuda")
+#pipe = pipeline("text-generation", model=model, device="cuda")
 
 @spaces.GPU
 def respond(
@@ -28,7 +28,7 @@ def respond(
 
     messages.append({"role": "user", "content": message})
 
-    response = pipe(
+    """response = pipe(
         messages,
         max_new_tokens=max_tokens,
         temperature=temperature,
@@ -38,7 +38,43 @@ def respond(
 
     generated_text = response[0]['generated_text']
 
-    yield generated_text
+    yield generated_text"""
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        torch_dtype="auto",
+        device_map="auto"
+    )
+
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=model_thinking
+    )
+
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+    sample = True
+
+    if model_temperature == 0:
+        sample = False
+
+
+    # conduct text completion
+    generated_ids = model.generate(
+        **model_inputs,
+        max_new_tokens=max_tokens,
+        do_sample=sample,
+        top_p=top_p,
+        temperature=model_temperature
+    )
+    output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+    content = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
+
+    return content
 
 
 """