ibrahimlasfar committed on
Commit
1c08614
·
1 Parent(s): 2621d75

Fix model availability, audio/image submission, and enhance UI

Browse files
Files changed (3) hide show
  1. api/endpoints.py +1 -1
  2. main.py +191 -107
  3. utils/generation.py +1 -1
api/endpoints.py CHANGED
@@ -12,7 +12,7 @@ router = APIRouter()
12
  HF_TOKEN = os.getenv("HF_TOKEN")
13
  BACKUP_HF_TOKEN = os.getenv("BACKUP_HF_TOKEN")
14
  API_ENDPOINT = os.getenv("API_ENDPOINT", "https://router.huggingface.co/v1")
15
- MODEL_NAME = os.getenv("MODEL_NAME", "openai/gpt-oss-120b:cerebras")
16
  SECONDARY_MODEL_NAME = os.getenv("SECONDARY_MODEL_NAME", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B:featherless-ai")
17
  TERTIARY_MODEL_NAME = os.getenv("TERTIARY_MODEL_NAME", "openai/gpt-oss-20b:together")
18
 
 
12
  HF_TOKEN = os.getenv("HF_TOKEN")
13
  BACKUP_HF_TOKEN = os.getenv("BACKUP_HF_TOKEN")
14
  API_ENDPOINT = os.getenv("API_ENDPOINT", "https://router.huggingface.co/v1")
15
+ MODEL_NAME = os.getenv("MODEL_NAME", "openai/gpt-oss-120b:together")
16
  SECONDARY_MODEL_NAME = os.getenv("SECONDARY_MODEL_NAME", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B:featherless-ai")
17
  TERTIARY_MODEL_NAME = os.getenv("TERTIARY_MODEL_NAME", "openai/gpt-oss-20b:together")
18
 
main.py CHANGED
@@ -31,74 +31,101 @@ CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
31
 
32
  # إعداد CSS
33
  css = """
34
- .gradio-container { max-width: 1200px; margin: auto; font-family: Arial, sans-serif; }
 
 
 
 
 
35
  .chatbot {
36
- border: 1px solid #ccc;
37
- border-radius: 15px;
38
- padding: 20px;
39
- background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
 
 
 
40
  }
41
- .input-textbox {
42
- font-size: 18px;
43
- padding: 12px;
44
- border-radius: 8px;
45
- border: 1px solid #aaa;
 
 
 
 
 
 
 
46
  }
47
- .upload-button, .audio-input-button, .audio-record-button {
48
- background: #4CAF50;
49
- color: white;
50
- border-radius: 8px;
51
- padding: 10px 20px;
52
  font-size: 16px;
53
- cursor: pointer;
 
 
54
  }
55
- .upload-button:hover, .audio-input-button:hover, .audio-record-button:hover {
56
- background: #45a049;
 
 
 
 
 
 
57
  }
58
- .upload-button::before {
59
- content: '📷 ';
60
- font-size: 20px;
61
  }
62
- .audio-input-button::before {
63
- content: '🎤 ';
64
- font-size: 20px;
 
 
 
 
 
 
 
 
 
65
  }
66
- .audio-record-button::before {
67
- content: '🔊 ';
68
- font-size: 20px;
69
  }
70
- .loading::after {
71
- content: '';
72
- display: inline-block;
73
- width: 18px;
74
- height: 18px;
75
- border: 3px solid #333;
76
- border-top-color: transparent;
77
- border-radius: 50%;
78
- animation: spin 1s linear infinite;
79
- margin-left: 10px;
80
  }
81
- @keyframes spin {
82
- to { transform: rotate(360deg); }
 
 
 
 
83
  }
84
- .output-container {
85
- margin-top: 25px;
86
- padding: 15px;
87
- border: 1px solid #ddd;
 
 
 
88
  border-radius: 10px;
89
- background: #fff;
90
  }
91
- .audio-output-container {
92
- display: flex;
93
- align-items: center;
94
- gap: 15px;
95
- margin-top: 15px;
96
  }
97
- .output-format-radio {
98
- margin-top: 10px;
99
  }
100
  """
101
-
102
  # دالة لمعالجة الإدخال
103
  def process_input(message, audio_input=None, image_input=None, history=None, system_prompt=None, temperature=0.7, reasoning_effort="medium", enable_browsing=True, max_new_tokens=128000, output_format="text"):
104
  input_type = "text"
@@ -106,62 +133,93 @@ def process_input(message, audio_input=None, image_input=None, history=None, sys
106
  image_data = None
107
  if audio_input:
108
  input_type = "audio"
109
- with open(audio_input, "rb") as f:
110
- audio_data = f.read()
111
- message = "Transcribe this audio"
 
 
 
 
112
  elif image_input:
113
  input_type = "image"
114
- with open(image_input, "rb") as f:
115
- image_data = f.read()
116
- message = f"Analyze this image"
 
 
 
 
117
 
118
  response_text = ""
119
  audio_response = None
120
- for chunk in generate(
121
- message=message,
122
- history=history,
123
- system_prompt=system_prompt,
124
- temperature=temperature,
125
- reasoning_effort=reasoning_effort,
126
- enable_browsing=enable_browsing,
127
- max_new_tokens=max_new_tokens,
128
- input_type=input_type,
129
- audio_data=audio_data,
130
- image_data=image_data,
131
- output_format=output_format
132
- ):
133
- if isinstance(chunk, bytes):
134
- audio_response = io.BytesIO(chunk)
135
- audio_response.name = "response.wav"
136
- else:
137
- response_text += chunk
138
- yield response_text, audio_response
 
 
 
 
139
 
140
  # دالة لمعالجة زر إرسال الصوت
141
  def submit_audio(audio_input, output_format):
142
  if not audio_input:
143
  return "Please upload or record an audio file.", None
144
- return process_input(message="", audio_input=audio_input, output_format=output_format)
 
 
 
 
 
 
 
 
 
145
 
146
  # دالة لمعالجة زر إرسال الصورة
147
  def submit_image(image_input, output_format):
148
  if not image_input:
149
  return "Please upload an image.", None
150
- return process_input(message="", image_input=image_input, output_format=output_format)
 
 
 
 
 
 
 
 
 
151
 
 
152
  # إعداد واجهة Gradio
153
  with gr.Blocks(css=css, theme="gradio/soft") as chatbot_ui:
154
  gr.Markdown(
155
  """
156
  # MGZon Chatbot 🤖
157
- A versatile chatbot powered by DeepSeek, GPT-OSS, CLIP, Whisper, and Parler-TTS. Supports text, audio, and image inputs with text or voice outputs. Upload files, record audio, or type your query and choose your output format!
158
  """
159
  )
160
  with gr.Row():
161
  with gr.Column(scale=3):
162
- chatbot = gr.Chatbot(label="Chat", height=500, latex_delimiters=LATEX_DELIMS)
163
  with gr.Column(scale=1):
164
- with gr.Accordion("⚙️ Settings", open=True):
165
  system_prompt = gr.Textbox(
166
  label="System Prompt",
167
  value="You are an expert assistant providing detailed, comprehensive, and well-structured responses. Support text, audio, image, and file inputs. For audio, transcribe using Whisper. For text-to-speech, use Parler-TTS. For images, analyze content appropriately. Respond in the requested output format (text or audio).",
@@ -171,40 +229,66 @@ with gr.Blocks(css=css, theme="gradio/soft") as chatbot_ui:
171
  reasoning_effort = gr.Radio(label="Reasoning Effort", choices=["low", "medium", "high"], value="medium")
172
  enable_browsing = gr.Checkbox(label="Enable DeepSearch (web browsing)", value=True)
173
  max_new_tokens = gr.Slider(label="Max New Tokens", minimum=50, maximum=128000, step=50, value=128000)
174
- output_format = gr.Radio(
175
- label="Output Format",
176
- choices=["text", "audio"],
177
- value="text",
178
- elem_classes="output-format-radio"
179
- )
180
- with gr.Row():
181
- message = gr.Textbox(label="Type your message", placeholder="Enter your query or describe your request...", lines=2, elem_classes="input-textbox")
182
- submit_btn = gr.Button("Send", variant="primary")
183
  with gr.Row():
184
- with gr.Column(scale=1):
185
- audio_input = gr.Audio(label="Record or Upload Audio", type="filepath", elem_classes="audio-input")
186
- audio_submit_btn = gr.Button("Send Audio", elem_classes="audio-input-button")
187
- with gr.Column(scale=1):
188
- image_input = gr.File(label="Upload Image", file_types=["image"], elem_classes="upload-button")
189
- image_submit_btn = gr.Button("Send Image", elem_classes="upload-button")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  output_text = gr.Textbox(label="Response", lines=10, elem_classes="output-container")
191
  output_audio = gr.Audio(label="Voice Output", type="filepath", elem_classes="audio-output-container", autoplay=True)
192
 
193
- # ربط الأزرار
 
 
 
 
 
 
 
 
194
  submit_btn.click(
195
  fn=process_input,
196
- inputs=[message, audio_input, image_input, chatbot, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens, output_format],
197
- outputs=[output_text, output_audio]
 
198
  )
199
- audio_submit_btn.click(
200
- fn=submit_audio,
201
- inputs=[audio_input, output_format],
202
- outputs=[output_text, output_audio]
 
203
  )
204
- image_submit_btn.click(
205
  fn=submit_image,
206
- inputs=[image_input, output_format],
207
- outputs=[output_text, output_audio]
 
 
 
 
 
 
 
208
  )
209
 
210
  # إعداد FastAPI
 
31
 
32
  # إعداد CSS
33
  css = """
34
+ .gradio-container {
35
+ max-width: 1000px;
36
+ margin: auto;
37
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
38
+ background: #f0f2f5;
39
+ }
40
  .chatbot {
41
+ border: none;
42
+ border-radius: 20px;
43
+ padding: 15px;
44
+ background: #fff;
45
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
46
+ height: 600px;
47
+ overflow-y: auto;
48
  }
49
+ .input-container {
50
+ display: flex;
51
+ align-items: center;
52
+ gap: 8px;
53
+ border: 1px solid #ddd;
54
+ border-radius: 25px;
55
+ padding: 8px;
56
+ background: #fff;
57
+ box-shadow: 0 1px 3px rgba(0,0,0,0.1);
58
+ position: sticky;
59
+ bottom: 10px;
60
+ margin: 10px;
61
  }
62
+ .input-textbox {
63
+ flex-grow: 1;
64
+ border: none;
65
+ outline: none;
 
66
  font-size: 16px;
67
+ padding: 10px 15px;
68
+ border-radius: 20px;
69
+ background: transparent;
70
  }
71
+ .input-icon {
72
+ background: none;
73
+ border: none;
74
+ cursor: pointer;
75
+ font-size: 22px;
76
+ padding: 8px;
77
+ color: #555;
78
+ transition: color 0.2s;
79
  }
80
+ .input-icon:hover {
81
+ color: #0084ff;
 
82
  }
83
+ .submit-btn {
84
+ background: #0084ff;
85
+ color: white;
86
+ border-radius: 50%;
87
+ width: 36px;
88
+ height: 36px;
89
+ display: flex;
90
+ align-items: center;
91
+ justify-content: center;
92
+ font-size: 18px;
93
+ cursor: pointer;
94
+ box-shadow: 0 1px 3px rgba(0,0,0,0.2);
95
  }
96
+ .submit-btn:hover {
97
+ background: #0066cc;
 
98
  }
99
+ .output-container {
100
+ margin: 15px 0;
101
+ padding: 15px;
102
+ border-radius: 10px;
103
+ background: #f9f9f9;
104
+ border: 1px solid #e0e0e0;
 
 
 
 
105
  }
106
+ .settings-accordion {
107
+ background: #fff;
108
+ border-radius: 10px;
109
+ padding: 15px;
110
+ box-shadow: 0 1px 5px rgba(0,0,0,0.1);
111
+ margin-bottom: 10px;
112
  }
113
+ .audio-output-container {
114
+ display: flex;
115
+ align-items: center;
116
+ gap: 10px;
117
+ margin-top: 10px;
118
+ background: #fff;
119
+ padding: 10px;
120
  border-radius: 10px;
 
121
  }
122
+ .gr-button {
123
+ transition: background-color 0.2s, transform 0.1s;
 
 
 
124
  }
125
+ .gr-button:hover {
126
+ transform: scale(1.05);
127
  }
128
  """
 
129
# Input handler: process a single user submission (text, audio, or image)
# and stream the model response.
def process_input(message, audio_input=None, image_input=None, history=None, system_prompt=None, temperature=0.7, reasoning_effort="medium", enable_browsing=True, max_new_tokens=128000, output_format="text"):
    """Yield (response_text, audio_response) tuples as generation progresses.

    Args:
        message: User text; replaced by a fixed instruction when an audio or
            image file is supplied.
        audio_input: Optional path to an audio file to transcribe.
        image_input: Optional path to an image file to analyze.
        history, system_prompt, temperature, reasoning_effort,
        enable_browsing, max_new_tokens, output_format: forwarded verbatim
            to `generate` (defined in utils/generation.py — not visible here).

    Yields:
        (text, audio) where `audio` is an in-memory WAV (io.BytesIO) when the
        model returned speech bytes, otherwise None.
    """
    input_type = "text"
    audio_data = None
    image_data = None
    if audio_input:
        input_type = "audio"
        try:
            with open(audio_input, "rb") as f:
                audio_data = f.read()
            message = "Transcribe this audio"
        except Exception as e:
            logger.error(f"Failed to read audio file: {e}")
            # BUG FIX: this function is a generator, so `return value` raises
            # StopIteration and the caller never receives the error message.
            # Yield the error tuple instead, then stop.
            yield f"Error: Failed to read audio file: {e}", None
            return
    elif image_input:
        input_type = "image"
        try:
            with open(image_input, "rb") as f:
                image_data = f.read()
            message = "Analyze this image"
        except Exception as e:
            logger.error(f"Failed to read image file: {e}")
            # BUG FIX: same generator-return defect as the audio branch.
            yield f"Error: Failed to read image file: {e}", None
            return

    response_text = ""
    audio_response = None
    try:
        for chunk in generate(
            message=message,
            history=history,
            system_prompt=system_prompt,
            temperature=temperature,
            reasoning_effort=reasoning_effort,
            enable_browsing=enable_browsing,
            max_new_tokens=max_new_tokens,
            input_type=input_type,
            audio_data=audio_data,
            image_data=image_data,
            output_format=output_format
        ):
            # Audio chunks arrive as raw bytes; everything else is text.
            if isinstance(chunk, bytes):
                audio_response = io.BytesIO(chunk)
                audio_response.name = "response.wav"
            else:
                response_text += chunk
            yield response_text or "Processing...", audio_response
    except Exception as e:
        logger.error(f"Generation failed: {e}")
        yield f"Error: Generation failed: {e}", None
178
 
179
# Audio submit handler: drain the streaming generator, return final result.
def submit_audio(audio_input, output_format):
    """Run the full pipeline for a recorded/uploaded audio file.

    Returns (response_text, audio_path_or_None) suitable for the
    output_text / output_audio components.
    """
    if not audio_input:
        return "Please upload or record an audio file.", None
    response_text = ""
    audio_response = None
    try:
        for text, audio in process_input(message="", audio_input=audio_input, output_format=output_format):
            response_text = text or "No text response generated."
            audio_response = audio
        # BUG FIX: the wired output is gr.Audio(type="filepath"), which needs
        # a real file path — process_input yields an io.BytesIO. Persist any
        # in-memory audio to a temporary .wav file before returning it.
        if audio_response is not None and not isinstance(audio_response, str):
            import tempfile
            payload = audio_response.read() if hasattr(audio_response, "read") else audio_response
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(payload)
                audio_response = tmp.name
        return response_text, audio_response
    except Exception as e:
        logger.error(f"Audio submission failed: {e}")
        return f"Error: Audio processing failed: {e}", None
193
 
194
# File submit handler. NOTE(review): the shared picker (file_input) also
# accepts .mp3/.wav, so audio uploads must be routed to the audio pipeline
# instead of being sent to the model as "Analyze this image".
def submit_image(image_input, output_format):
    """Run the full pipeline for an uploaded file.

    Returns (response_text, audio_path_or_None) suitable for the
    output_text / output_audio components.
    """
    if not image_input:
        return "Please upload an image.", None
    # gr.File may hand back a path string or a file-like object with .name.
    path = getattr(image_input, "name", image_input)
    if str(path).lower().endswith((".mp3", ".wav")):
        return submit_audio(path, output_format)
    response_text = ""
    audio_response = None
    try:
        for text, audio in process_input(message="", image_input=path, output_format=output_format):
            response_text = text or "No text response generated."
            audio_response = audio
        # BUG FIX: gr.Audio(type="filepath") needs a real path, not a
        # BytesIO — persist in-memory audio to a temporary .wav file.
        if audio_response is not None and not isinstance(audio_response, str):
            import tempfile
            payload = audio_response.read() if hasattr(audio_response, "read") else audio_response
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(payload)
                audio_response = tmp.name
        return response_text, audio_response
    except Exception as e:
        logger.error(f"Image submission failed: {e}")
        return f"Error: Image processing failed: {e}", None
208
 
209
+ # إعداد واجهة Gradio
210
  # إعداد واجهة Gradio
211
  with gr.Blocks(css=css, theme="gradio/soft") as chatbot_ui:
212
  gr.Markdown(
213
  """
214
  # MGZon Chatbot 🤖
215
+ A versatile chatbot powered by DeepSeek, GPT-OSS, CLIP, Whisper, and Parler-TTS. Type your query, upload images/files, or record audio in one sleek input form!
216
  """
217
  )
218
  with gr.Row():
219
  with gr.Column(scale=3):
220
+ chatbot = gr.Chatbot(label="Chat", height=600, latex_delimiters=LATEX_DELIMS, elem_classes="chatbot")
221
  with gr.Column(scale=1):
222
+ with gr.Accordion("⚙️ Settings", open=False, elem_classes="settings-accordion"):
223
  system_prompt = gr.Textbox(
224
  label="System Prompt",
225
  value="You are an expert assistant providing detailed, comprehensive, and well-structured responses. Support text, audio, image, and file inputs. For audio, transcribe using Whisper. For text-to-speech, use Parler-TTS. For images, analyze content appropriately. Respond in the requested output format (text or audio).",
 
229
  reasoning_effort = gr.Radio(label="Reasoning Effort", choices=["low", "medium", "high"], value="medium")
230
  enable_browsing = gr.Checkbox(label="Enable DeepSearch (web browsing)", value=True)
231
  max_new_tokens = gr.Slider(label="Max New Tokens", minimum=50, maximum=128000, step=50, value=128000)
232
+ output_format = gr.Radio(label="Output Format", choices=["text", "audio"], value="text")
 
 
 
 
 
 
 
 
233
  with gr.Row():
234
+ with gr.Column():
235
+ with gr.Group(elem_classes="input-container"):
236
+ message = gr.Textbox(
237
+ placeholder="Type your message, or use icons to upload files/audio...",
238
+ lines=1,
239
+ elem_classes="input-textbox",
240
+ show_label=False
241
+ )
242
+ file_input = gr.File(
243
+ file_types=["image", ".mp3", ".wav"],
244
+ show_label=False,
245
+ elem_classes="input-icon",
246
+ visible=False
247
+ )
248
+ audio_input = gr.Audio(
249
+ type="filepath",
250
+ show_label=False,
251
+ elem_classes="input-icon",
252
+ visible=False
253
+ )
254
+ file_btn = gr.Button("📎", elem_classes="input-icon")
255
+ audio_btn = gr.Button("🎤", elem_classes="input-icon")
256
+ submit_btn = gr.Button("➡️", elem_classes="submit-btn")
257
  output_text = gr.Textbox(label="Response", lines=10, elem_classes="output-container")
258
  output_audio = gr.Audio(label="Voice Output", type="filepath", elem_classes="audio-output-container", autoplay=True)
259
 
260
+ # ربط الأحداث
261
+ file_btn.click(
262
+ fn=lambda: gr.update(visible=True),
263
+ outputs=file_input
264
+ )
265
+ audio_btn.click(
266
+ fn=lambda: gr.update(visible=True),
267
+ outputs=audio_input
268
+ )
269
  submit_btn.click(
270
  fn=process_input,
271
+ inputs=[message, audio_input, file_input, chatbot, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens, output_format],
272
+ outputs=[output_text, output_audio, chatbot, message],
273
+ _js="() => { return ['', null, null, []]; }" # تنظيف المدخلات
274
  )
275
+ message.submit(
276
+ fn=process_input,
277
+ inputs=[message, audio_input, file_input, chatbot, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens, output_format],
278
+ outputs=[output_text, output_audio, chatbot, message],
279
+ _js="() => { return ['', null, null, []]; }" # تنظيف المدخلات
280
  )
281
+ file_input.change(
282
  fn=submit_image,
283
+ inputs=[file_input, output_format],
284
+ outputs=[output_text, output_audio, chatbot, message],
285
+ _js="() => { return ['', null, null, []]; }" # تنظيف المدخلات
286
+ )
287
+ audio_input.change(
288
+ fn=submit_audio,
289
+ inputs=[audio_input, output_format],
290
+ outputs=[output_text, output_audio, chatbot, message],
291
+ _js="() => { return ['', null, null, []]; }" # تنظيف المدخلات
292
  )
293
 
294
  # إعداد FastAPI
utils/generation.py CHANGED
@@ -35,7 +35,7 @@ HF_TOKEN = os.getenv("HF_TOKEN")
35
  BACKUP_HF_TOKEN = os.getenv("BACKUP_HF_TOKEN")
36
  API_ENDPOINT = os.getenv("API_ENDPOINT", "https://router.huggingface.co/v1")
37
  FALLBACK_API_ENDPOINT = "https://api-inference.huggingface.co/v1"
38
- MODEL_NAME = os.getenv("MODEL_NAME", "openai/gpt-oss-120b:cerebras")
39
  SECONDARY_MODEL_NAME = os.getenv("SECONDARY_MODEL_NAME", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B:featherless-ai")
40
  TERTIARY_MODEL_NAME = os.getenv("TERTIARY_MODEL_NAME", "openai/gpt-oss-20b:together")
41
  CLIP_BASE_MODEL = os.getenv("CLIP_BASE_MODEL", "openai/clip-vit-base-patch32")
 
35
  BACKUP_HF_TOKEN = os.getenv("BACKUP_HF_TOKEN")
36
  API_ENDPOINT = os.getenv("API_ENDPOINT", "https://router.huggingface.co/v1")
37
  FALLBACK_API_ENDPOINT = "https://api-inference.huggingface.co/v1"
38
+ MODEL_NAME = os.getenv("MODEL_NAME", "openai/gpt-oss-120b:together")
39
  SECONDARY_MODEL_NAME = os.getenv("SECONDARY_MODEL_NAME", "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B:featherless-ai")
40
  TERTIARY_MODEL_NAME = os.getenv("TERTIARY_MODEL_NAME", "openai/gpt-oss-20b:together")
41
  CLIP_BASE_MODEL = os.getenv("CLIP_BASE_MODEL", "openai/clip-vit-base-patch32")