anaspro committed on
Commit
4ecae52
·
1 Parent(s): 9fc653f
Files changed (1) hide show
  1. app.py +23 -133
app.py CHANGED
@@ -8,14 +8,11 @@ import av
8
  import gradio as gr
9
  import spaces
10
  import torch
11
- from gtts import gTTS
12
- import io
13
- import base64
14
  from transformers import AutoModelForImageTextToText, AutoProcessor
15
  from transformers.generation.streamers import TextIteratorStreamer
16
 
17
  # Model configuration
18
- model_id = "anaspro/Shako-4B-it-v2"
19
  processor = AutoProcessor.from_pretrained(model_id)
20
  model = AutoModelForImageTextToText.from_pretrained(
21
  model_id,
@@ -157,33 +154,9 @@ def process_history(history: list[dict]) -> list[dict]:
157
  return messages
158
 
159
 
160
- def generate_speech(text: str, lang: str = 'ar') -> tuple[str, str]:
161
- """Generate speech from text using Google TTS and return audio file path and base64 data."""
162
- try:
163
- # Create TTS object
164
- tts = gTTS(text=text, lang=lang, slow=False)
165
-
166
- # Save to temporary file
167
- temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
168
- temp_audio_file.close()
169
-
170
- tts.save(temp_audio_file.name)
171
-
172
- # Also create base64 version for direct playback
173
- audio_buffer = io.BytesIO()
174
- tts.write_to_fp(audio_buffer)
175
- audio_buffer.seek(0)
176
- audio_base64 = base64.b64encode(audio_buffer.read()).decode('utf-8')
177
-
178
- return temp_audio_file.name, f"data:audio/mp3;base64,{audio_base64}"
179
- except Exception as e:
180
- print(f"TTS Error: {e}")
181
- return None, None
182
-
183
-
184
  @spaces.GPU()
185
  @torch.inference_mode()
186
- def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512, enable_voice: bool = False) -> Iterator[tuple[str, str | None]]:
187
  if not validate_media_constraints(message):
188
  yield ""
189
  return
@@ -227,115 +200,32 @@ def generate(message: dict, history: list[dict], system_prompt: str = "", max_ne
227
  output += delta
228
  yield output
229
 
230
- # Generate voice if enabled
231
- if enable_voice and output.strip():
232
- _, audio_data = generate_speech(output.strip(), lang='ar')
233
- if audio_data:
234
- yield {"text": output, "audio": audio_data}
235
- else:
236
- yield output
237
- else:
238
- yield output
239
-
240
 
241
  # Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens)
242
  examples = [
243
- ["انت موديل عراقي تحكي هعراقي فقط وتكون ترفيهي", 700]
 
 
244
  ]
245
 
246
- # Create custom interface with voice recording
247
- def create_interface():
248
- with gr.Blocks(title="Shako IRAQI AI", theme=gr.themes.Soft()) as demo:
249
- gr.Markdown("# Shako IRAQI AI 🤖")
250
- gr.Markdown("تحدث مع الذكاء الاصطناعي العراقي - يدعم الصور والفيديو والصوت!")
251
-
252
- chatbot = gr.Chatbot(type="messages", height=500)
253
-
254
- with gr.Row():
255
- with gr.Column(scale=4):
256
- textbox = gr.MultimodalTextbox(
257
- file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES + AUDIO_FILE_TYPES),
258
- file_count="multiple",
259
- placeholder="اكتب رسالتك هنا أو ارفع ملف...",
260
- show_label=False,
261
- autofocus=True,
262
- )
263
-
264
- with gr.Column(scale=1):
265
- voice_input = gr.Audio(
266
- sources=["microphone"],
267
- type="filepath",
268
- label="🎤 تسجيل صوتي",
269
- show_label=True,
270
- )
271
-
272
- with gr.Accordion("⚙️ إعدادات متقدمة", open=False):
273
- system_prompt = gr.Textbox(
274
- label="System Prompt",
275
- value="انت ذكاء صناعي يتحدث باللهجة العراقية بس ما تستخدم فصحى ابدا",
276
- lines=2
277
- )
278
- max_tokens = gr.Slider(
279
- label="Max New Tokens",
280
- minimum=100,
281
- maximum=2000,
282
- step=10,
283
- value=700
284
- )
285
- enable_voice = gr.Checkbox(
286
- label="تفعيل الصوت في الردود",
287
- value=False
288
- )
289
-
290
- def process_input(message, voice_file, history, system_prompt, max_tokens, enable_voice):
291
- """Process both text and voice inputs"""
292
- if voice_file:
293
- # If voice input is provided, create a message with the audio file
294
- voice_message = {"files": [voice_file], "text": message.get("text", "")}
295
- else:
296
- voice_message = message
297
-
298
- # Generate response
299
- response_text = ""
300
- for partial_response in generate(voice_message, history, system_prompt, max_tokens, enable_voice):
301
- if isinstance(partial_response, dict):
302
- # Handle audio response
303
- response_text = partial_response["text"]
304
- yield partial_response
305
- else:
306
- response_text = partial_response
307
- yield partial_response
308
-
309
- # Handle submission
310
- textbox.submit(
311
- fn=process_input,
312
- inputs=[textbox, voice_input, chatbot, system_prompt, max_tokens, enable_voice],
313
- outputs=[chatbot]
314
- ).then(
315
- fn=lambda: None,
316
- inputs=[],
317
- outputs=[voice_input] # Clear voice input after submission
318
- )
319
-
320
- # Clear voice input when text is submitted
321
- textbox.submit(
322
- fn=lambda: None,
323
- inputs=[],
324
- outputs=[voice_input]
325
- )
326
-
327
- # Examples
328
- gr.Examples(
329
- examples=[
330
- "مرحبا، كيف حالك؟",
331
- "شرح لي عن الذكاء الاصطناعي",
332
- "أخبرني نكتة عراقية"
333
- ],
334
- inputs=[textbox]
335
- )
336
-
337
- return demo
338
 
339
  if __name__ == "__main__":
340
- demo = create_interface()
341
  demo.launch()
 
8
  import gradio as gr
9
  import spaces
10
  import torch
 
 
 
11
  from transformers import AutoModelForImageTextToText, AutoProcessor
12
  from transformers.generation.streamers import TextIteratorStreamer
13
 
14
  # Model configuration
15
+ model_id = "anaspro/Shako-4B-it"
16
  processor = AutoProcessor.from_pretrained(model_id)
17
  model = AutoModelForImageTextToText.from_pretrained(
18
  model_id,
 
154
  return messages
155
 
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  @spaces.GPU()
158
  @torch.inference_mode()
159
+ def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
160
  if not validate_media_constraints(message):
161
  yield ""
162
  return
 
200
  output += delta
201
  yield output
202
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  # Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens)
205
  examples = [
206
+ ["What is the capital of France?", "You are a helpful assistant.", 700],
207
+ ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512],
208
+ ["Write a short story about a robot learning to paint", "You are a helpful assistant.", 1000]
209
  ]
210
 
211
+ # Create the chat interface
212
+ demo = gr.ChatInterface(
213
+ fn=generate,
214
+ type="messages",
215
+ textbox=gr.MultimodalTextbox(
216
+ file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES + AUDIO_FILE_TYPES),
217
+ file_count="multiple",
218
+ autofocus=True,
219
+ ),
220
+ multimodal=True,
221
+ additional_inputs=[
222
+ gr.Textbox(label="System Prompt", value="انت ذكاء صناعي يتحدث باللهجة العراقية بس ما تستخدم فصحى ابدا"),
223
+ gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
224
+ ],
225
+ title="Shako IRAQI AI",
226
+ examples=examples,
227
+ stop_btn=False,
228
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
 
230
  if __name__ == "__main__":
 
231
  demo.launch()