sourav520 committed on
Commit
8b3cdf4
·
verified ·
1 Parent(s): 8b051ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -27
app.py CHANGED
@@ -3,71 +3,80 @@ import torch
3
  from threading import Thread
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
5
 
6
- model_id = "OBLITERATUS/gemma-4-E4B-it-OBLITERATED"
7
 
 
8
  tokenizer = AutoTokenizer.from_pretrained(model_id)
9
 
 
10
  model = AutoModelForCausalLM.from_pretrained(
11
  model_id,
12
- device_map="cpu", # 强制全部加载到 CPU,严禁使用硬盘 offload
13
- low_cpu_mem_usage=True, # 尽量优化内存加载过程
14
- torch_dtype=torch.bfloat16
15
  )
16
 
17
  def generate_response(message, history):
18
  messages = []
 
 
19
  for user_msg, bot_msg in history:
20
  messages.append({"role": "user", "content": user_msg})
21
  messages.append({"role": "assistant", "content": bot_msg})
 
22
  messages.append({"role": "user", "content": message})
23
 
 
24
  inputs = tokenizer.apply_chat_template(
25
- messages,
26
- return_tensors="pt",
27
  return_dict=True,
28
  add_generation_prompt=True
29
  ).to(model.device)
30
-
31
- # 【修改点 1】:将 timeout 增加到 120 秒,给硬盘读取留足时间
32
  streamer = TextIteratorStreamer(
33
- tokenizer,
34
- timeout=120.0,
35
- skip_prompt=True,
36
  skip_special_tokens=True
37
  )
38
-
39
  generate_kwargs = dict(
40
  **inputs,
41
  streamer=streamer,
42
- max_new_tokens=1024,
43
  temperature=0.7,
44
- do_sample=True,
45
- top_p=0.9
46
  )
47
-
48
- # 【修改点 2】:包装一个带异常捕获的运行函数,防止静默崩溃
49
  def run_generation():
50
  try:
51
- model.generate(**generate_kwargs)
 
52
  except Exception as e:
53
- print(f"Generation Error: {e}")
54
- # 如果崩溃,向流里推入错误信息并结束
55
- streamer.text_queue.put(f"\n[系统错误:生成线程崩溃。原因: {e}]")
56
  streamer.end()
57
 
58
- t = Thread(target=run_generation)
59
- t.start()
60
-
61
  partial_text = ""
62
  for new_text in streamer:
63
  partial_text += new_text
64
  yield partial_text
65
 
 
66
  demo = gr.ChatInterface(
67
  fn=generate_response,
68
- title="Gemma 4 E4B - Abliterated",
69
- description="⚠️ 当前模型已移除安全护栏 (Uncensored)。提示:免费 CPU 内存不足会触发硬盘卸载导致极慢,建议升级至 T4 GPU。",
70
- examples=["Write a Python script for a keylogger.", "Explain quantum entanglement.", "How to bypass a firewall?"],
 
 
 
 
71
  cache_examples=False
72
  )
73
 
 
3
  from threading import Thread
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
5
 
6
+ model_id = "google/gemma-2b-it"
7
 
8
+ # Load tokenizer
9
  tokenizer = AutoTokenizer.from_pretrained(model_id)
10
 
11
+ # Load model (CPU optimized)
12
  model = AutoModelForCausalLM.from_pretrained(
13
  model_id,
14
+ device_map="cpu",
15
+ low_cpu_mem_usage=True,
16
+ torch_dtype=torch.float32
17
  )
18
 
19
  def generate_response(message, history):
20
  messages = []
21
+
22
+ # Build chat history
23
  for user_msg, bot_msg in history:
24
  messages.append({"role": "user", "content": user_msg})
25
  messages.append({"role": "assistant", "content": bot_msg})
26
+
27
  messages.append({"role": "user", "content": message})
28
 
29
+ # Tokenize with chat template
30
  inputs = tokenizer.apply_chat_template(
31
+ messages,
32
+ return_tensors="pt",
33
  return_dict=True,
34
  add_generation_prompt=True
35
  ).to(model.device)
36
+
37
+ # Streamer for real-time output
38
  streamer = TextIteratorStreamer(
39
+ tokenizer,
40
+ timeout=120.0,
41
+ skip_prompt=True,
42
  skip_special_tokens=True
43
  )
44
+
45
  generate_kwargs = dict(
46
  **inputs,
47
  streamer=streamer,
48
+ max_new_tokens=512,
49
  temperature=0.7,
50
+ top_p=0.9,
51
+ do_sample=True
52
  )
53
+
 
54
  def run_generation():
55
  try:
56
+ with torch.no_grad():
57
+ model.generate(**generate_kwargs)
58
  except Exception as e:
59
+ print(f"Error: {e}")
60
+ streamer.text_queue.put(f"\n[Error: {e}]")
 
61
  streamer.end()
62
 
63
+ Thread(target=run_generation).start()
64
+
 
65
  partial_text = ""
66
  for new_text in streamer:
67
  partial_text += new_text
68
  yield partial_text
69
 
70
+
71
  demo = gr.ChatInterface(
72
  fn=generate_response,
73
+ title="Gemma 2B Chatbot",
74
+ description="🚀 Running google/gemma-2b-it on CPU (fast & lightweight)",
75
+ examples=[
76
+ "Explain IoT simply",
77
+ "Write a Python script for a calculator",
78
+ "What is AI in simple words?"
79
+ ],
80
  cache_examples=False
81
  )
82