shuarya2011 committed on
Commit
4ddd7a9
·
verified ·
1 Parent(s): 0be43d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -19
app.py CHANGED
@@ -1,21 +1,41 @@
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import torch
 
 
 
 
 
 
4
 
5
  MODEL_ID = "google/gemma-4-31B-it-assistant"
6
 
 
7
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 
 
8
  model = AutoModelForCausalLM.from_pretrained(
9
  MODEL_ID,
10
  torch_dtype=torch.bfloat16,
11
  device_map="auto",
12
  )
 
 
 
13
 
14
  def chat(message, history):
15
  messages = []
16
  for user_msg, bot_msg in history:
17
- messages.append({"role": "user", "content": user_msg})
18
- messages.append({"role": "assistant","content": bot_msg})
19
  messages.append({"role": "user", "content": message})
20
 
21
  inputs = tokenizer.apply_chat_template(
@@ -24,25 +44,23 @@ def chat(message, history):
24
  add_generation_prompt=True,
25
  ).to(model.device)
26
 
27
- from transformers import TextIteratorStreamer
28
- from threading import Thread
29
-
30
  streamer = TextIteratorStreamer(
31
  tokenizer,
32
  skip_prompt=True,
33
  skip_special_tokens=True,
34
  )
35
 
36
- gen_kwargs = dict(
37
- input_ids=inputs,
38
- streamer=streamer,
39
- max_new_tokens=512,
40
- do_sample=True,
41
- temperature=0.7,
42
- top_p=0.9,
 
 
 
43
  )
44
-
45
- thread = Thread(target=model.generate, kwargs=gen_kwargs)
46
  thread.start()
47
 
48
  partial = ""
@@ -50,13 +68,16 @@ def chat(message, history):
50
  partial += token
51
  yield partial
52
 
 
53
  demo = gr.ChatInterface(
54
  fn=chat,
55
  title="Gemma 4 Assistant",
56
- description="Powered by google/gemma-4-31B-it-assistant with streaming",
57
- examples=["Explain quantum computing simply",
58
- "Write a Python function to reverse a string",
59
- "What is the capital of France?"],
 
 
60
  theme=gr.themes.Soft(),
61
  )
62
 
 
1
+ import subprocess, sys
2
+
3
+ subprocess.check_call([
4
+ sys.executable, "-m", "pip", "install", "--quiet",
5
+ "transformers>=4.45.0",
6
+ "accelerate>=0.26.0",
7
+ "sentencepiece>=0.1.99",
8
+ ])
9
+
10
  import gradio as gr
 
11
  import torch
12
+ from transformers import (
13
+ AutoTokenizer,
14
+ AutoModelForCausalLM,
15
+ TextIteratorStreamer,
16
+ )
17
+ from threading import Thread
18
 
19
  MODEL_ID = "google/gemma-4-31B-it-assistant"
20
 
21
+ print("Loading tokenizer...")
22
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
23
+
24
+ print("Loading model...")
25
  model = AutoModelForCausalLM.from_pretrained(
26
  MODEL_ID,
27
  torch_dtype=torch.bfloat16,
28
  device_map="auto",
29
  )
30
+ model.eval()
31
+ print("Model ready.")
32
+
33
 
34
  def chat(message, history):
35
  messages = []
36
  for user_msg, bot_msg in history:
37
+ messages.append({"role": "user", "content": user_msg})
38
+ messages.append({"role": "assistant", "content": bot_msg})
39
  messages.append({"role": "user", "content": message})
40
 
41
  inputs = tokenizer.apply_chat_template(
 
44
  add_generation_prompt=True,
45
  ).to(model.device)
46
 
 
 
 
47
  streamer = TextIteratorStreamer(
48
  tokenizer,
49
  skip_prompt=True,
50
  skip_special_tokens=True,
51
  )
52
 
53
+ thread = Thread(
54
+ target=model.generate,
55
+ kwargs=dict(
56
+ input_ids=inputs,
57
+ streamer=streamer,
58
+ max_new_tokens=512,
59
+ do_sample=True,
60
+ temperature=0.7,
61
+ top_p=0.9,
62
+ ),
63
  )
 
 
64
  thread.start()
65
 
66
  partial = ""
 
68
  partial += token
69
  yield partial
70
 
71
+
72
  demo = gr.ChatInterface(
73
  fn=chat,
74
  title="Gemma 4 Assistant",
75
+ description="google/gemma-4-31B-it-assistant streaming enabled",
76
+ examples=[
77
+ "Explain quantum computing in simple terms",
78
+ "Write a Python function to reverse a string",
79
+ "What is photosynthesis?",
80
+ ],
81
  theme=gr.themes.Soft(),
82
  )
83