rahul7star committed on
Commit
0805dfd
·
verified ·
1 Parent(s): 573e8d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +242 -34
app.py CHANGED
@@ -1,75 +1,283 @@
1
  import gradio as gr
2
  import torch
 
 
3
  from threading import Thread
4
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
5
 
6
  model_id = "rahul7star/gemma-4-finetune"
7
 
8
- tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  model = AutoModelForCausalLM.from_pretrained(
11
  model_id,
12
- device_map="cpu",
13
- low_cpu_mem_usage=True,
14
- torch_dtype=torch.bfloat16
 
15
  )
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def generate_response(message, history):
 
 
 
 
 
 
18
  messages = []
19
- for user_msg, bot_msg in history:
20
- messages.append({"role": "user", "content": user_msg})
21
- messages.append({"role": "assistant", "content": bot_msg})
 
 
 
 
 
 
 
22
  messages.append({"role": "user", "content": message})
23
 
24
- inputs = tokenizer.apply_chat_template(
25
- messages,
26
- return_tensors="pt",
27
- return_dict=True,
28
- add_generation_prompt=True
29
- ).to(model.device)
30
-
31
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  streamer = TextIteratorStreamer(
33
- tokenizer,
34
- timeout=420.0,
35
- skip_prompt=True,
36
- skip_special_tokens=True
37
  )
38
-
39
  generate_kwargs = dict(
40
  **inputs,
41
  streamer=streamer,
42
  max_new_tokens=1024,
43
  temperature=0.7,
44
  do_sample=True,
45
- top_p=0.9
 
 
46
  )
47
-
48
-
 
 
 
 
 
49
  def run_generation():
50
  try:
 
51
  model.generate(**generate_kwargs)
 
 
52
  except Exception as e:
53
- print(f"Generation Error: {e}")
54
-
55
- streamer.text_queue.put(f"\n[系统错误:生成线程崩溃。原因: {e}]")
 
 
 
56
  streamer.end()
57
 
58
  t = Thread(target=run_generation)
59
  t.start()
60
-
61
  partial_text = ""
62
- for new_text in streamer:
63
- partial_text += new_text
64
- yield partial_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  demo = gr.ChatInterface(
67
  fn=generate_response,
68
- title="Gemma 4 E4B - Abliterated",
69
-
70
- examples=["Write a Python script for a keylogger.", "Explain quantum entanglement.", "How to bypass a firewall?"],
71
- cache_examples=False
 
 
 
72
  )
73
 
74
  if __name__ == "__main__":
 
75
  demo.launch()
 
1
  import gradio as gr
2
  import torch
3
+ import time
4
+ import traceback
5
  from threading import Thread
6
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
7
 
8
  model_id = "rahul7star/gemma-4-finetune"
9
 
10
+
11
def log(msg):
    """Print *msg* to stdout with a ``[DEBUG]`` prefix.

    The stream is flushed on every call so that lines show up immediately
    in captured/hosted logs instead of sitting in the stdout buffer.

    Args:
        msg: Any object; it is formatted with the default f-string rules.
    """
    print(f"[DEBUG] {msg}", flush=True)
13
+
14
+
15
# ============================================================
# Startup Logs
# ============================================================

log("Starting Gemma 4 debug app")
log(f"Model ID: {model_id}")
log(f"Torch version: {torch.__version__}")
log(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    log(f"CUDA device count: {torch.cuda.device_count()}")
    log(f"CUDA device name: {torch.cuda.get_device_name(0)}")


# ============================================================
# Load Tokenizer
# ============================================================

log("Loading tokenizer...")

# NOTE(review): trust_remote_code=True lets the Hub repo ship Python code
# that is executed locally. Keep it only if this checkpoint really needs
# custom code, and only while model_id points at a repo you control.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
)

log("Tokenizer loaded")
log(f"Tokenizer class: {tokenizer.__class__.__name__}")
log(f"Vocab size: {len(tokenizer)}")
log(f"EOS token: {tokenizer.eos_token} / {tokenizer.eos_token_id}")
log(f"PAD token: {tokenizer.pad_token} / {tokenizer.pad_token_id}")
log(f"Chat template exists: {tokenizer.chat_template is not None}")

# generate() is later called with pad_token_id=tokenizer.pad_token_id, so
# make sure one exists by falling back to the EOS token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    log("PAD token was missing, set PAD token = EOS token")


# ============================================================
# Load Model
# ============================================================

log("Loading model...")

# NOTE(review): same trust_remote_code caveat as for the tokenizer above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Inference only: switch off training-mode layers such as dropout.
model.eval()

log("Model loaded")
log(f"Model class: {model.__class__.__name__}")
log(f"Model device: {model.device}")
log(f"Model dtype: {next(model.parameters()).dtype}")


# ============================================================
# Model Config Logs
# ============================================================

cfg = model.config

log("========== MODEL CONFIG ==========")
# getattr(..., None) because not every architecture defines every field;
# a missing field is logged as "None" instead of raising.
for config_key in (
    "model_type",
    "architectures",
    "hidden_size",
    "intermediate_size",
    "num_hidden_layers",
    "num_attention_heads",
    "num_key_value_heads",
    "head_dim",
    "vocab_size",
    "max_position_embeddings",
    "rope_theta",
    "rms_norm_eps",
    "attention_bias",
    "use_cache",
):
    log(f"{config_key}: {getattr(cfg, config_key, None)}")
log("==================================")


# ============================================================
# Parameter Count
# ============================================================

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

log(f"Total parameters: {total_params:,}")
log(f"Trainable parameters: {trainable_params:,}")


# ============================================================
# Architecture Module Inspection
# ============================================================

log("========== IMPORTANT MODULES ==========")

# Substrings (matched case-insensitively against the module path) that mark
# a sub-module as worth listing in the startup log.
important_keywords = [
    "rotary",
    "rope",
    "mlp",
    "feed",
    "attention",
    "attn",
    "norm",
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

count = 0

for name, module in model.named_modules():
    lower = name.lower()
    if any(k in lower for k in important_keywords):
        log(f"{name} => {module.__class__.__name__}")
        count += 1
        # Cap the listing so a deep model does not flood the log.
        if count >= 120:
            log("Stopped module logging after 120 entries")
            break

log("=======================================")
144
+
145
+
146
# ============================================================
# Generation Function
# ============================================================

def _history_to_messages(history):
    """Convert Gradio chat history into a list of chat-template messages.

    Two history layouts are accepted:

    * "messages" style: each item is a dict carrying ``role`` and
      ``content`` keys. Only ``user``/``assistant`` items whose content is
      a string are forwarded.
    * "tuples" style: each item is a ``(user_msg, bot_msg)`` pair. A side
      that is ``None`` is skipped rather than sent as message content.

    Items that match neither layout are logged and skipped so one malformed
    turn does not abort the whole request.

    Args:
        history: Iterable of history items, or ``None``.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dicts in
        conversation order.
    """
    messages = []

    for item in history or []:
        # Dicts must be handled before tuple-unpacking: unpacking a 2-key
        # dict yields its *keys*, which would silently feed the literal
        # strings "role"/"content" to the model.
        if isinstance(item, dict):
            role = item.get("role")
            content = item.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
            else:
                log("History parse warning: unsupported role or content")
                log(f"Bad history item: {item}")
            continue

        try:
            user_msg, bot_msg = item
        except (TypeError, ValueError) as e:
            log(f"History parse warning: {e}")
            log(f"Bad history item: {item}")
            continue

        if user_msg is not None:
            messages.append({"role": "user", "content": user_msg})
        if bot_msg is not None:
            messages.append({"role": "assistant", "content": bot_msg})

    return messages


def generate_response(message, history):
    """Stream a model reply for *message* given the prior chat *history*.

    This is a generator intended as the ``fn`` of ``gr.ChatInterface``: it
    yields the reply accumulated so far each time the streamer delivers a
    new text chunk. Generation runs in a background thread and text is
    handed over through a ``TextIteratorStreamer``.

    Args:
        message: The new user message.
        history: Previous turns as supplied by Gradio (tuple pairs or
            role/content dicts; see ``_history_to_messages``).

    Yields:
        The partial reply text. If tokenization fails, a single
        ``"Tokenization error: ..."`` string is yielded instead. If reading
        from the streamer fails, the partial text plus a
        ``"[Streaming error: ...]"`` suffix is yielded.
    """
    start_time = time.time()

    log("========== NEW GENERATION ==========")
    log(f"User message: {message}")
    log(f"History turns: {len(history or [])}")

    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    log(f"Total chat messages: {len(messages)}")

    try:
        inputs = tokenizer.apply_chat_template(
            messages,
            return_tensors="pt",
            return_dict=True,
            add_generation_prompt=True,
        ).to(model.device)

        input_token_count = inputs["input_ids"].shape[-1]

        log(f"Input tensor shape: {inputs['input_ids'].shape}")
        log(f"Input tokens: {input_token_count}")
        log(f"Input device: {inputs['input_ids'].device}")

    except Exception as e:
        # UI boundary: report the failure in the chat instead of raising.
        log("Chat template/tokenization failed")
        log(traceback.format_exc())
        yield f"Tokenization error: {e}"
        return

    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=420.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # Single source of truth for the sampling settings, so the values that
    # are logged are exactly the values passed to generate().
    sampling = dict(
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

    generate_kwargs = dict(
        **inputs,
        streamer=streamer,
        **sampling,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    log("Generation kwargs:")
    for key, value in sampling.items():
        log(f"{key}={value}")

    def run_generation():
        """Run model.generate() and surface any crash through the streamer."""
        try:
            log("Generation thread started")
            model.generate(**generate_kwargs)
            log("Generation thread finished")

        except Exception as e:
            log("Generation Error")
            log(traceback.format_exc())

            # Show the failure in the chat, then close the stream so the
            # consumer loop below stops instead of waiting for the timeout.
            streamer.text_queue.put(
                f"\n[Generation thread crashed. Reason: {e}]"
            )
            streamer.end()

    t = Thread(target=run_generation)
    t.start()

    partial_text = ""
    token_chunks = 0

    try:
        for new_text in streamer:
            token_chunks += 1
            partial_text += new_text

            # Progress line every 20 chunks to keep the log readable.
            if token_chunks % 20 == 0:
                elapsed = time.time() - start_time
                log(
                    f"Streaming chunks: {token_chunks}, "
                    f"chars: {len(partial_text)}, "
                    f"elapsed: {elapsed:.2f}s"
                )

            yield partial_text

    except Exception as e:
        # Includes the streamer's queue timeout; keep what was produced.
        log("Streaming Error")
        log(traceback.format_exc())
        yield partial_text + f"\n\n[Streaming error: {e}]"

    finally:
        elapsed = time.time() - start_time
        log("========== GENERATION DONE ==========")
        log(f"Output chars: {len(partial_text)}")
        log(f"Streaming chunks: {token_chunks}")
        log(f"Elapsed seconds: {elapsed:.2f}")
        log("=====================================")
264
+
265
+
266
# ============================================================
# Gradio UI
# ============================================================

# Chat UI backed by the streaming generator above. Example prompts are not
# pre-computed (cache_examples=False), so no generation runs at startup.
demo = gr.ChatInterface(
    fn=generate_response,
    title="Gemma 4 E4B - Debug",
    examples=[
        "Explain quantum entanglement simply.",
        "Write a Python function to add two numbers.",
        "Explain how RoPE works in transformer attention.",
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    log("Launching Gradio app...")
    demo.launch()