HedronCreeper committed on
Commit
9928aed
·
verified ·
1 Parent(s): 0b6ff79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -27
app.py CHANGED
@@ -1,47 +1,56 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
4
 
5
  model_id = "google/gemma-4-E2B"
6
 
 
7
  tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
8
  model = AutoModelForCausalLM.from_pretrained(
9
  model_id,
10
- device_map={"": "cpu"},
11
- dtype=torch.float32,
12
  low_cpu_mem_usage=True
13
  )
14
 
15
- pipe = pipeline(
16
- "text-generation",
17
- model=model,
18
- tokenizer=tokenizer,
19
- device="cpu"
20
- )
21
 
22
  def predict(message, history):
23
- messages = [
24
- {"role": "user", "content": message},
25
- ]
26
-
27
- prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
28
-
29
- outputs = pipe(
30
- prompt,
31
- max_new_tokens=512,
32
- do_sample=True,
33
- temperature=0.7,
34
- top_k=50,
35
- top_p=0.95,
36
  )
37
-
38
- return outputs[0]["generated_text"][len(prompt):]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  demo = gr.ChatInterface(
41
  fn=predict,
42
- title="Gemma-4-E2B Chatbot",
43
- description="Running on Free CPU Space - No Memory",
44
  )
45
 
46
  if __name__ == "__main__":
47
- demo.launch()
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer
4
 
# Hub repository the tokenizer and the weights are both loaded from.
model_id = "google/gemma-4-E2B"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model in LOW MEMORY MODE
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # let accelerate place the weights on available devices
    # 16-bit weights take half the memory of float32. bfloat16 is used rather
    # than float16 because it keeps float32's exponent range.
    # NOTE(review): float16 is presumed risky here (overflow on Gemma-family
    # models, weak CPU kernel coverage) - confirm against the model card.
    # `dtype` is the current keyword name; `torch_dtype` is its deprecated alias.
    dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

# Inference only: make sure dropout and similar training-time layers are off.
model.eval()
19
+
 
 
 
 
20
 
def predict(message, history):
    """Generate one chat reply for ``message``.

    Args:
        message: Text of the latest user message.
        history: Earlier turns of the conversation as passed by the chat UI.
            It is not used: every reply is generated from ``message`` alone.

    Returns:
        The newly generated reply text, without the prompt and without
        special tokens.
    """
    messages = [{"role": "user", "content": message}]

    # Tokenize through the chat template in one step. Rendering the template
    # to a string and tokenizing that string again can prepend a second BOS
    # token, because the template output already contains one.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Number of prompt tokens; generate() returns prompt + continuation.
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=256,  # lowered to reduce RAM spikes
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            use_cache=True,
        )

    # Slice by token count, not by character count: the decoded text has its
    # special tokens stripped, so it does not line up with the length of the
    # templated prompt string and a character slice would eat into the reply.
    new_tokens = output[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
47
+
48
 
# Chat front end: `predict` is registered as the callback that produces replies.
demo = gr.ChatInterface(
    predict,
    description="Low RAM CPU-optimized version ",
    title="Gemma-4-E2B Chatbot (Optimized)",
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()