anaspro committed
Commit 8bda143 · 1 Parent(s): a645494

Use original simple code structure with our customizations


- Back to the original GPT-OSS demo code (stable and tested)
- Keep our customizations: system_prompt.txt, Arabic examples, Arabic UI
- Model: unsloth/gpt-oss-20b-unsloth-bnb-4bit
- No complex caching: simple, and it works with ZeroGPU (see the sketch below)
- Arabic interface and NB TEL-specific prompts
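
The "no complex caching" bullet refers to the structure the diff below restores: the transformers pipeline is built once at module import time and the request handler is decorated with @spaces.GPU, so ZeroGPU attaches a GPU only while a request is being served and no lru_cache wrapper is needed. A minimal sketch of that pattern, using a small placeholder checkpoint rather than this Space's actual model:

import spaces
import gradio as gr
from transformers import pipeline

MODEL_ID = "distilgpt2"  # placeholder checkpoint for illustration only

# Build the pipeline once at import time; the GPU-bound work only needs
# to happen inside a @spaces.GPU-decorated function.
pipe = pipeline("text-generation", model=MODEL_ID)

@spaces.GPU  # a GPU is attached for the duration of each call
def generate(message: str, history: list) -> str:
    out = pipe(message, max_new_tokens=64, return_full_text=False)
    return out[0]["generated_text"]

demo = gr.ChatInterface(fn=generate)

if __name__ == "__main__":
    demo.launch()

The committed app.py follows the same shape, layering the Harmony prompt rendering, streaming output, and the Arabic system prompt on top.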

Files changed (1)
  1. app.py +25 -114
app.py CHANGED
@@ -1,10 +1,9 @@
 import os
+from transformers import pipeline, TextIteratorStreamer
+from threading import Thread
 import gradio as gr
 import spaces
 import re
-from threading import Thread
-from functools import lru_cache
-from transformers import pipeline, TextIteratorStreamer
 from huggingface_hub import login
 import logging
 from openai_harmony import (
@@ -27,29 +26,21 @@ if os.getenv("HF_TOKEN"):
     login(token=os.getenv("HF_TOKEN"))
     logger.info("🔐 Logged in to Hugging Face")
 
-# Regex config for parsing reasoning and output
+# regex config
 RE_REASONING = re.compile(r'(?i)Reasoning:\s*(low|medium|high)')
 RE_FINAL_MARKER = re.compile(r'(?i)assistantfinal')
 RE_ANALYSIS_PREFIX = re.compile(r'(?i)^analysis\s*')
 
-# ======================================================
-# Load System Prompt
-# ======================================================
+# Load System Prompt from file
 try:
     with open("system_prompt.txt", "r", encoding="utf-8") as f:
         DEFAULT_SYSTEM_PROMPT = f.read()
 except FileNotFoundError:
     logger.warning("system_prompt.txt not found, using default prompt")
-    DEFAULT_SYSTEM_PROMPT = """أنت مساعد ذكي متقدم يعتمد على نموذج GPT-OSS-20B من OpenAI مع دعم فني لشركة NB TEL.
-تحجي بالعراقي بأسلوب مهني ومحترف.
-
-Reasoning: high - استخدم مستوى تفكير عالي للتحليل المتعمق والحلول المتقدمة."""
-
-# ======================================================
-# Parse Reasoning Level from System Prompt
-# ======================================================
+    DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant. Reasoning: medium"
+
+# Parse reasoning level from system prompt
 def parse_reasoning_and_instructions(system_prompt: str):
-    """Parse reasoning effort level from system prompt"""
     instructions = system_prompt or "You are a helpful assistant."
     match = RE_REASONING.search(instructions)
     effort_key = match.group(1).lower() if match else 'medium'
@@ -61,84 +52,48 @@ def parse_reasoning_and_instructions(system_prompt: str):
     cleaned_instructions = RE_REASONING.sub('', instructions).strip()
     return effort, cleaned_instructions
 
-# ======================================================
-# Model Configuration
-# ======================================================
 model_id = "unsloth/gpt-oss-20b-unsloth-bnb-4bit"
 
-# Load harmony encoding (lightweight, can load outside GPU)
+pipe = pipeline(
+    "text-generation",
+    model=model_id,
+    torch_dtype="auto",
+    device_map="auto",
+    trust_remote_code=True,
+)
 enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 
-# ======================================================
-# Cached Model Loader (for ZeroGPU)
-# ======================================================
-@lru_cache(maxsize=1)
-def load_model():
-    """Load model with caching to avoid reloading"""
-    logger.info("🚀 Loading GPT-OSS-20B model on GPU...")
-    model_pipe = pipeline(
-        "text-generation",
-        model=model_id,
-        torch_dtype="auto",
-        device_map="auto",
-        trust_remote_code=True,
-    )
-    logger.info("✅ Model loaded successfully!")
-    return model_pipe
-
-# ======================================================
-# Format Conversation History
-# ======================================================
 def format_conversation_history(chat_history):
-    """Format Gradio chat history to standard message format"""
     messages = []
     for item in chat_history:
         role = item["role"]
        content = item["content"]
        if isinstance(content, list):
            content = content[0]["text"] if content and "text" in content[0] else str(content)
-        messages.append({"role": role, "content": content})
+        messages.append({"role": "role", "content": content})
     return messages
 
-# ======================================================
-# Generate Response with Harmony Format
-# ======================================================
-@spaces.GPU(duration=120)
+@spaces.GPU()
 def generate_response(input_data, chat_history, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
-    """Generate response using GPT-OSS with Harmony format"""
-
-    # Get cached model (loads only once)
-    pipe = load_model()
-
-    # Create new user message
     new_message = {"role": "user", "content": input_data}
     processed_history = format_conversation_history(chat_history)
-
-    # Parse reasoning effort from system prompt
     effort, instructions = parse_reasoning_and_instructions(system_prompt)
-
-    # Build harmony messages with proper system and developer roles
     system_content = SystemContent.new().with_reasoning_effort(effort)
     developer_content = DeveloperContent.new().with_instructions(instructions)
-
     harmony_messages = [
         Message.from_role_and_content(Role.SYSTEM, system_content),
         Message.from_role_and_content(Role.DEVELOPER, developer_content),
     ]
 
-    # Add conversation history
     for m in processed_history + [new_message]:
         role = Role.USER if m["role"] == "user" else Role.ASSISTANT
         harmony_messages.append(Message.from_role_and_content(role, m["content"]))
-
-    # Render conversation using harmony encoding
     conversation = Conversation.from_messages(harmony_messages)
     prompt_tokens = enc.render_conversation_for_completion(conversation, Role.ASSISTANT)
     prompt_text = pipe.tokenizer.decode(prompt_tokens, skip_special_tokens=False)
-
-    # Setup streaming
+
     streamer = TextIteratorStreamer(pipe.tokenizer, skip_prompt=True, skip_special_tokens=True)
-
+
     generation_kwargs = {
         "max_new_tokens": max_new_tokens,
         "do_sample": True,
@@ -149,16 +104,13 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
         "streamer": streamer,
         "return_full_text": False,
     }
-
-    # Generate in separate thread
     thread = Thread(target=pipe, args=(prompt_text,), kwargs=generation_kwargs)
     thread.start()
-
-    # Parse thinking process and final answer
+
+    # parsing thinking
     thinking = ""
     final = ""
     started_final = False
-
    for chunk in streamer:
        if not started_final:
            parts = RE_FINAL_MARKER.split(chunk, maxsplit=1)
@@ -168,66 +120,25 @@ def generate_response(input_data, chat_history, max_new_tokens, system_prompt, t
                started_final = True
        else:
            final += chunk
-
-    # Clean and format output
     clean_thinking = RE_ANALYSIS_PREFIX.sub('', thinking).strip()
     clean_final = final.strip()
-
-    # Format with collapsible thinking section
-    if clean_thinking:
-        formatted = f"<details open><summary>🧠 عرض عملية التفكير (Thinking Process)</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
-    else:
-        formatted = clean_final
-
+    formatted = f"<details open><summary>🧠 عرض عملية التفكير (Thinking Process)</summary>\n\n{clean_thinking}\n\n</details>\n\n{clean_final}"
     yield formatted
 
-# ======================================================
-# Create Gradio Interface
-# ======================================================
 demo = gr.ChatInterface(
     fn=generate_response,
     additional_inputs=[
-        gr.Slider(
-            label="Max New Tokens",
-            minimum=64,
-            maximum=4096,
-            step=1,
-            value=2048
-        ),
+        gr.Slider(label="Max new tokens", minimum=64, maximum=4096, step=1, value=2048),
        gr.Textbox(
            label="System Prompt",
            value=DEFAULT_SYSTEM_PROMPT,
            lines=6,
            placeholder="يمكنك تعديل التعليمات والمستوى: Reasoning: low/medium/high"
        ),
-        gr.Slider(
-            label="Temperature",
-            minimum=0.1,
-            maximum=2.0,
-            step=0.1,
-            value=0.7
-        ),
-        gr.Slider(
-            label="Top-p",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.9
-        ),
-        gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=100,
-            step=1,
-            value=50
-        ),
-        gr.Slider(
-            label="Repetition Penalty",
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            value=1.0
-        )
+        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
+        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
+        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0)
     ],
     examples=[
        [{"text": "النت عندي بطيء جداً رغم باقة 100 ميجا. شرحلي الأسباب المحتملة والحلول خطوة بخطوة."}],