anaspro
committed on
Commit
·
4ecae52
1
Parent(s):
9fc653f
update
Browse files
app.py
CHANGED
|
@@ -8,14 +8,11 @@ import av
|
|
| 8 |
import gradio as gr
|
| 9 |
import spaces
|
| 10 |
import torch
|
| 11 |
-
from gtts import gTTS
|
| 12 |
-
import io
|
| 13 |
-
import base64
|
| 14 |
from transformers import AutoModelForImageTextToText, AutoProcessor
|
| 15 |
from transformers.generation.streamers import TextIteratorStreamer
|
| 16 |
|
| 17 |
# Model configuration
|
| 18 |
-
model_id = "anaspro/Shako-4B-it
|
| 19 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 20 |
model = AutoModelForImageTextToText.from_pretrained(
|
| 21 |
model_id,
|
|
@@ -157,33 +154,9 @@ def process_history(history: list[dict]) -> list[dict]:
|
|
| 157 |
return messages
|
| 158 |
|
| 159 |
|
| 160 |
-
def generate_speech(text: str, lang: str = 'ar') -> tuple[str, str]:
|
| 161 |
-
"""Generate speech from text using Google TTS and return audio file path and base64 data."""
|
| 162 |
-
try:
|
| 163 |
-
# Create TTS object
|
| 164 |
-
tts = gTTS(text=text, lang=lang, slow=False)
|
| 165 |
-
|
| 166 |
-
# Save to temporary file
|
| 167 |
-
temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
|
| 168 |
-
temp_audio_file.close()
|
| 169 |
-
|
| 170 |
-
tts.save(temp_audio_file.name)
|
| 171 |
-
|
| 172 |
-
# Also create base64 version for direct playback
|
| 173 |
-
audio_buffer = io.BytesIO()
|
| 174 |
-
tts.write_to_fp(audio_buffer)
|
| 175 |
-
audio_buffer.seek(0)
|
| 176 |
-
audio_base64 = base64.b64encode(audio_buffer.read()).decode('utf-8')
|
| 177 |
-
|
| 178 |
-
return temp_audio_file.name, f"data:audio/mp3;base64,{audio_base64}"
|
| 179 |
-
except Exception as e:
|
| 180 |
-
print(f"TTS Error: {e}")
|
| 181 |
-
return None, None
|
| 182 |
-
|
| 183 |
-
|
| 184 |
@spaces.GPU()
|
| 185 |
@torch.inference_mode()
|
| 186 |
-
def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512
|
| 187 |
if not validate_media_constraints(message):
|
| 188 |
yield ""
|
| 189 |
return
|
|
@@ -227,115 +200,32 @@ def generate(message: dict, history: list[dict], system_prompt: str = "", max_ne
|
|
| 227 |
output += delta
|
| 228 |
yield output
|
| 229 |
|
| 230 |
-
# Generate voice if enabled
|
| 231 |
-
if enable_voice and output.strip():
|
| 232 |
-
_, audio_data = generate_speech(output.strip(), lang='ar')
|
| 233 |
-
if audio_data:
|
| 234 |
-
yield {"text": output, "audio": audio_data}
|
| 235 |
-
else:
|
| 236 |
-
yield output
|
| 237 |
-
else:
|
| 238 |
-
yield output
|
| 239 |
-
|
| 240 |
|
| 241 |
# Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens)
|
| 242 |
examples = [
|
| 243 |
-
["
|
|
|
|
|
|
|
| 244 |
]
|
| 245 |
|
| 246 |
-
# Create
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
with gr.Column(scale=1):
|
| 265 |
-
voice_input = gr.Audio(
|
| 266 |
-
sources=["microphone"],
|
| 267 |
-
type="filepath",
|
| 268 |
-
label="🎤 تسجيل صوتي",
|
| 269 |
-
show_label=True,
|
| 270 |
-
)
|
| 271 |
-
|
| 272 |
-
with gr.Accordion("⚙️ إعدادات متقدمة", open=False):
|
| 273 |
-
system_prompt = gr.Textbox(
|
| 274 |
-
label="System Prompt",
|
| 275 |
-
value="انت ذكاء صناعي يتحدث باللهجة العراقية بس ما تستخدم فصحى ابدا",
|
| 276 |
-
lines=2
|
| 277 |
-
)
|
| 278 |
-
max_tokens = gr.Slider(
|
| 279 |
-
label="Max New Tokens",
|
| 280 |
-
minimum=100,
|
| 281 |
-
maximum=2000,
|
| 282 |
-
step=10,
|
| 283 |
-
value=700
|
| 284 |
-
)
|
| 285 |
-
enable_voice = gr.Checkbox(
|
| 286 |
-
label="تفعيل الصوت في الردود",
|
| 287 |
-
value=False
|
| 288 |
-
)
|
| 289 |
-
|
| 290 |
-
def process_input(message, voice_file, history, system_prompt, max_tokens, enable_voice):
|
| 291 |
-
"""Process both text and voice inputs"""
|
| 292 |
-
if voice_file:
|
| 293 |
-
# If voice input is provided, create a message with the audio file
|
| 294 |
-
voice_message = {"files": [voice_file], "text": message.get("text", "")}
|
| 295 |
-
else:
|
| 296 |
-
voice_message = message
|
| 297 |
-
|
| 298 |
-
# Generate response
|
| 299 |
-
response_text = ""
|
| 300 |
-
for partial_response in generate(voice_message, history, system_prompt, max_tokens, enable_voice):
|
| 301 |
-
if isinstance(partial_response, dict):
|
| 302 |
-
# Handle audio response
|
| 303 |
-
response_text = partial_response["text"]
|
| 304 |
-
yield partial_response
|
| 305 |
-
else:
|
| 306 |
-
response_text = partial_response
|
| 307 |
-
yield partial_response
|
| 308 |
-
|
| 309 |
-
# Handle submission
|
| 310 |
-
textbox.submit(
|
| 311 |
-
fn=process_input,
|
| 312 |
-
inputs=[textbox, voice_input, chatbot, system_prompt, max_tokens, enable_voice],
|
| 313 |
-
outputs=[chatbot]
|
| 314 |
-
).then(
|
| 315 |
-
fn=lambda: None,
|
| 316 |
-
inputs=[],
|
| 317 |
-
outputs=[voice_input] # Clear voice input after submission
|
| 318 |
-
)
|
| 319 |
-
|
| 320 |
-
# Clear voice input when text is submitted
|
| 321 |
-
textbox.submit(
|
| 322 |
-
fn=lambda: None,
|
| 323 |
-
inputs=[],
|
| 324 |
-
outputs=[voice_input]
|
| 325 |
-
)
|
| 326 |
-
|
| 327 |
-
# Examples
|
| 328 |
-
gr.Examples(
|
| 329 |
-
examples=[
|
| 330 |
-
"مرحبا، كيف حالك؟",
|
| 331 |
-
"شرح لي عن الذكاء الاصطناعي",
|
| 332 |
-
"أخبرني نكتة عراقية"
|
| 333 |
-
],
|
| 334 |
-
inputs=[textbox]
|
| 335 |
-
)
|
| 336 |
-
|
| 337 |
-
return demo
|
| 338 |
|
| 339 |
if __name__ == "__main__":
|
| 340 |
-
demo = create_interface()
|
| 341 |
demo.launch()
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
import spaces
|
| 10 |
import torch
|
|
|
|
|
|
|
|
|
|
| 11 |
from transformers import AutoModelForImageTextToText, AutoProcessor
|
| 12 |
from transformers.generation.streamers import TextIteratorStreamer
|
| 13 |
|
| 14 |
# Model configuration
|
| 15 |
+
model_id = "anaspro/Shako-4B-it"
|
| 16 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 17 |
model = AutoModelForImageTextToText.from_pretrained(
|
| 18 |
model_id,
|
|
|
|
| 154 |
return messages
|
| 155 |
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
@spaces.GPU()
|
| 158 |
@torch.inference_mode()
|
| 159 |
+
def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
|
| 160 |
if not validate_media_constraints(message):
|
| 161 |
yield ""
|
| 162 |
return
|
|
|
|
| 200 |
output += delta
|
| 201 |
yield output
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
# Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens)
|
| 205 |
examples = [
|
| 206 |
+
["What is the capital of France?", "You are a helpful assistant.", 700],
|
| 207 |
+
["Explain quantum computing in simple terms", "You are a helpful assistant.", 512],
|
| 208 |
+
["Write a short story about a robot learning to paint", "You are a helpful assistant.", 1000]
|
| 209 |
]
|
| 210 |
|
| 211 |
+
# Create the chat interface
|
| 212 |
+
demo = gr.ChatInterface(
|
| 213 |
+
fn=generate,
|
| 214 |
+
type="messages",
|
| 215 |
+
textbox=gr.MultimodalTextbox(
|
| 216 |
+
file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES + AUDIO_FILE_TYPES),
|
| 217 |
+
file_count="multiple",
|
| 218 |
+
autofocus=True,
|
| 219 |
+
),
|
| 220 |
+
multimodal=True,
|
| 221 |
+
additional_inputs=[
|
| 222 |
+
gr.Textbox(label="System Prompt", value="انت ذكاء صناعي يتحدث باللهجة العراقية بس ما تستخدم فصحى ابدا"),
|
| 223 |
+
gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
|
| 224 |
+
],
|
| 225 |
+
title="Shako IRAQI AI",
|
| 226 |
+
examples=examples,
|
| 227 |
+
stop_btn=False,
|
| 228 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
if __name__ == "__main__":
|
|
|
|
| 231 |
demo.launch()
|