anaspro committed
Commit 238300f · 1 Parent(s): 6d60e00
Files changed (1)
  1. app.py +48 -73
app.py CHANGED
@@ -1,9 +1,7 @@
- # -*- coding: utf-8 -*-
-
  import os
  import torch
  import transformers
- from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+ from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
  import gradio as gr
  import spaces

@@ -22,64 +20,31 @@ model_path = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
  # If an HF_TOKEN is present in the environment
  hf_token = os.getenv("HF_TOKEN")

- # Initialize model and tokenizer separately for better control
- print("Loading model and tokenizer...")
- try:
-     tokenizer = AutoTokenizer.from_pretrained(
-         model_path,
-         token=hf_token,
-         trust_remote_code=True
-     )
-
-     # Load model with proper quantization config
-     from transformers import BitsAndBytesConfig
-
-     bnb_config = BitsAndBytesConfig(
-         load_in_4bit=True,
-         bnb_4bit_use_double_quant=True,
-         bnb_4bit_quant_type="nf4",
-         bnb_4bit_compute_dtype=torch.bfloat16
-     )
-
-     model = AutoModelForCausalLM.from_pretrained(
-         model_path,
-         quantization_config=bnb_config,
-         device_map="auto",
-         token=hf_token,
-         trust_remote_code=True,
-         torch_dtype=torch.bfloat16,
-         low_cpu_mem_usage=True
-     )
-
-     # Create pipeline with the loaded model
-     pipeline_model = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer
-     )
-
-     print("Model loaded successfully!")
-
- except Exception as e:
-     print(f"Error loading model: {e}")
-     # Fallback to direct pipeline loading
-     print("Trying alternative loading method...")
-     pipeline_model = pipeline(
-         "text-generation",
-         model=model_path,
-         token=hf_token,
-         trust_remote_code=True,
-         model_kwargs={
-             "torch_dtype": torch.bfloat16,
-             "low_cpu_mem_usage": True,
-         }
-     )
-     tokenizer = pipeline_model.tokenizer
-     print("Model loaded with fallback method!")
+ # Initialize model and tokenizer for ZeroGPU
+ print("Loading model and tokenizer for ZeroGPU...")
+
+ # Load tokenizer first
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_path,
+     token=hf_token,
+     trust_remote_code=True
+ )
+
+ # For ZeroGPU, load the model without specifying device_map;
+ # the @spaces.GPU() decorator will handle GPU allocation.
+ model = AutoModelForCausalLM.from_pretrained(
+     model_path,
+     token=hf_token,
+     trust_remote_code=True,
+     torch_dtype=torch.float16,  # Use float16 for ZeroGPU
+     low_cpu_mem_usage=True
+ )
+
+ print("Model loaded successfully!")


  def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=0.9, top_k=50, repetition_penalty=1.0):
-     """Generate response using the pipeline with messages format"""
+     """Generate response using the model with messages format"""
      # Gemma expects messages in format: [{"role": "user", "content": "..."}, {"role": "model", "content": "..."}]
      # Convert 'assistant' to 'model' for Gemma
      gemma_messages = []
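Note on the new loading path: under ZeroGPU the model is loaded at startup without a device_map, and a GPU is attached only while a @spaces.GPU()-decorated function runs. A minimal sketch of that flow, reusing the model and tokenizer objects loaded above (the helper name run_inference and its prompt are illustrative, not part of this commit):

import spaces
import torch

@spaces.GPU()  # requests a GPU for the duration of this call on ZeroGPU Spaces
def run_inference(prompt: str) -> str:  # illustrative helper, not in the commit
    # model and tokenizer are the module-level objects created above
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=32, do_sample=False)
    # drop the prompt tokens, keep only the newly generated ones
    return tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)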
@@ -109,7 +74,7 @@ def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=
          )
      except Exception as template_error:
          print(f"Template application error: {template_error}")
-         # Fallback: manually format messages
+         # Fallback: manually format messages for Gemma
          prompt = ""
          for msg in gemma_messages:
              if msg['role'] == 'user':
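For context on the fallback branch above: Gemma's chat template wraps each turn in <start_of_turn>/<end_of_turn> markers and ends with an open model turn. The manual formatting the comment refers to typically looks like the sketch below (a reconstruction under that assumption; the actual fallback body sits outside this hunk):

def format_gemma_prompt(messages):
    # Approximate Gemma's chat template by hand, for when
    # tokenizer.apply_chat_template() raises. Roles are assumed to be
    # already mapped to "user"/"model", as in gemma_messages above.
    prompt = ""
    for msg in messages:
        prompt += f"<start_of_turn>{msg['role']}\n{msg['content']}<end_of_turn>\n"
    prompt += "<start_of_turn>model\n"  # leave the model turn open for generation
    return prompt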
@@ -121,20 +86,29 @@ def generate_with_pipeline(messages, max_new_tokens=256, temperature=0.7, top_p=
      # Debug: print final prompt
      print(f"Final prompt preview: {prompt[:200]}...")

-     outputs = pipeline_model(
-         prompt,
-         max_new_tokens=max_new_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         top_k=top_k,
-         repetition_penalty=repetition_penalty,
-         do_sample=True,
-         return_full_text=False
-     )
-     return outputs[0]["generated_text"]
+     # Tokenize the prompt and move it to the model's device
+     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+     # Generate
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k,
+             repetition_penalty=repetition_penalty,
+             do_sample=True,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+
+     # Decode only the newly generated tokens
+     response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
+     return response


- @spaces.GPU()
+ @spaces.GPU()  # This decorator handles GPU allocation for ZeroGPU
  def generate_response(message, history, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
      """
      Generate response with full conversation history
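One caveat in the new model.generate() call: it passes pad_token_id=tokenizer.pad_token_id, which is None for checkpoints that ship without a pad token, in which case generate() falls back with a warning. A common defensive guard (not part of this commit) is to reuse the EOS token:

# Defensive guard, not in this commit: reuse EOS as the pad token when
# the tokenizer defines none, so model.generate() pads silently and correctly.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id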
@@ -168,7 +142,8 @@ def generate_response(message, history, max_new_tokens, temperature, top_p, top_
      # Debug: print messages structure
      print(f"Messages sent to model: {len(messages)} messages")
      for i, msg in enumerate(messages):
-         print(f" Message {i}: role={msg['role']}, content_preview={msg['content'][:50]}...")
+         content_preview = msg['content'][:50] if len(msg['content']) > 50 else msg['content']
+         print(f" Message {i}: role={msg['role']}, content_preview={content_preview}...")

      # Generate response
      response = generate_with_pipeline(
@@ -234,4 +209,4 @@ demo = gr.ChatInterface(
  )

  if __name__ == "__main__":
-     demo.launch()
+     demo.launch()
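For reference, the generate_response(message, history, ...) signature above matches the gr.ChatInterface pattern in which the extra sampling controls are passed as additional_inputs. A minimal sketch of that wiring (slider ranges and labels are illustrative, not taken from this commit):

import gradio as gr

demo = gr.ChatInterface(
    fn=generate_response,
    additional_inputs=[
        gr.Slider(16, 1024, value=256, step=8, label="max_new_tokens"),
        gr.Slider(0.1, 2.0, value=0.7, step=0.05, label="temperature"),
        gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="top_p"),
        gr.Slider(1, 100, value=50, step=1, label="top_k"),
        gr.Slider(1.0, 2.0, value=1.0, step=0.05, label="repetition_penalty"),
    ],
)

if __name__ == "__main__":
    demo.launch()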