kemo2003 committed
Commit 5ea0a54 · verified · 1 Parent(s): 4f69bb3

Update app.py

Files changed (1)
  1. app.py +263 -84
app.py CHANGED
@@ -1,136 +1,315 @@
 
  import gradio as gr
  import torch
- from TTS.api import TTS
  import numpy as np
  from PIL import Image
  import fitz  # PyMuPDF
  import pandas as pd
  from huggingface_hub import login
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import os

- # Log in using the key from the environment variable
  token = os.getenv("HF_API_TOKEN")
  if token:
      login(token=token)
  else:
-     raise ValueError("لم يتم تعيين المفتاح البيئي HF_API_TOKEN.")

- # Model name
- mistral_model_name = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"

- # Load the tokenizer and the model
- mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_model_name)
- mistral_model = AutoModelForCausalLM.from_pretrained(mistral_model_name)

- # Initialize device
- device = "cuda" if torch.cuda.is_available() else "cpu"

- # Text model: Mistral 7B
- mistral_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
- mistral_tokenizer = AutoTokenizer.from_pretrained(mistral_model_name)
- mistral_model = AutoModelForCausalLM.from_pretrained(mistral_model_name, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)

- # Image model: BLIP-2
- blip_model_name = "Salesforce/blip2-opt-2.7b"
- blip_processor = Blip2Processor.from_pretrained(blip_model_name)
- blip_model = Blip2ForConditionalGeneration.from_pretrained(blip_model_name, torch_dtype=torch.float16 if device == "cuda" else torch.float32).to(device)

- # Speech-to-text: Whisper
- whisper_model_name = "openai/whisper-small"
- whisper_processor = WhisperProcessor.from_pretrained(whisper_model_name)
- whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_name).to(device)

- # Text-to-speech: Coqui TTS
- tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)

- # === Text generation ===
- def generate_text_response(prompt):
      try:
-         friendly_prompt = f"أجب على السؤال التالي بطريقة ودية وواضحة:\n{prompt}"
-         inputs = mistral_tokenizer(friendly_prompt, return_tensors="pt").to(device)
-         outputs = mistral_model.generate(**inputs, max_length=200, num_return_sequences=1)
-         return mistral_tokenizer.decode(outputs[0], skip_special_tokens=True)
      except Exception as e:
          return f"خطأ في معالجة النص: {str(e)}"

- # === Image analysis ===
- def analyze_image(image, question=None):
      try:
-         image = Image.fromarray(image).convert("RGB")
-         prompt = question if question else "صف محتوى الصورة بالتفصيل"
-         inputs = blip_processor(images=image, text=prompt, return_tensors="pt").to(device)
-         outputs = blip_model.generate(**inputs)
-         return blip_processor.decode(outputs[0], skip_special_tokens=True)
      except Exception as e:
          return f"خطأ في تحليل الصورة: {str(e)}"

- # === Audio processing ===
- def process_audio(audio):
      try:
-         sample_rate, audio_data = audio
-         audio_data = audio_data.astype(np.float32) / np.max(np.abs(audio_data))
-         input_features = whisper_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features.to(device)
-         predicted_ids = whisper_model.generate(input_features)
-         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
          text_response = generate_text_response(transcription)
-         audio_output = tts.tts(text_response)
-         audio_output = np.array(audio_output, dtype=np.float32)
-         return transcription, text_response, (16000, audio_output)
      except Exception as e:
-         return f"خطأ في معالجة الصوت: {str(e)}", "", None

- # === File processing ===
- def process_file(file):
      try:
-         if file.name.endswith(".pdf"):
-             with fitz.open(file.name) as doc:
-                 text = "\n".join(page.get_text() for page in doc)
-         elif file.name.endswith((".xlsx", ".xls")):
-             df = pd.read_excel(file.name)
-             text = df.to_string()
-         elif file.name.endswith(".csv"):
-             df = pd.read_csv(file.name)
-             text = df.to_string()
          else:
-             return "❌ نوع الملف غير مدعوم حالياً."

-         response = generate_text_response(f"الملف يحتوي على:\n{text}\n\nلخص المحتوى.")
          return response
      except Exception as e:
          return f"خطأ في قراءة الملف: {str(e)}"

- # === Gradio Interface ===
- with gr.Blocks(css=".gradio-container {background-color: #f0f4f8; font-family: Arial; color: #333; padding: 20px;}") as demo:
-     gr.Markdown("# 🤖 Kemo Chat - مساعد ذكي متعدد الوسائط")
-     gr.Markdown("🎯 تفاعل معي عبر النصوص، الصور، الصوت أو الملفات! يدعم العربية والإنجليزية.")
-     gr.Markdown("📁 يدعم الملفات: PDF، Excel، CSV\n🖼️ يدعم الوصف الذكي للصور\n🎙️ تحويل الصوت إلى نص والرد صوتياً")

      with gr.Tab("💬 المحادثة النصية"):
-         text_input = gr.Textbox(label="اكتب سؤالك أو رسالتك")
-         text_output = gr.Textbox(label="الرد")
-         text_submit = gr.Button("إرسال")
          text_submit.click(fn=generate_text_response, inputs=text_input, outputs=text_output)

      with gr.Tab("🖼️ تحليل الصور"):
-         image_input = gr.Image(label="ارفع صورة")
-         image_question = gr.Textbox(label="اختياري: اسأل سؤال عن الصورة")
-         image_output = gr.Textbox(label="الوصف أو الإجابة")
-         image_submit = gr.Button("تحليل الصورة")
          image_submit.click(fn=analyze_image, inputs=[image_input, image_question], outputs=image_output)

      with gr.Tab("🎤 التفاعل الصوتي"):
-         audio_input = gr.Audio(source="microphone", label="سجّل رسالتك")
-         audio_transcription = gr.Textbox(label="النص المستخرج")
-         audio_text_response = gr.Textbox(label="الرد النصي")
-         audio_output = gr.Audio(label="الرد الصوتي")
-         audio_submit = gr.Button("معالجة الصوت")
-         audio_submit.click(fn=process_audio, inputs=audio_input, outputs=[audio_transcription, audio_text_response, audio_output])

      with gr.Tab("📄 تحليل الملفات"):
-         file_input = gr.File(label="ارفع ملفك (PDF, Excel, CSV)")
-         file_output = gr.Textbox(label="الرد على محتوى الملف")
-         file_submit = gr.Button("تحليل الملف")
          file_submit.click(fn=process_file, inputs=file_input, outputs=file_output)

- demo.launch()

+ # coding: utf-8
  import gradio as gr
  import torch
  import numpy as np
  from PIL import Image
  import fitz  # PyMuPDF
  import pandas as pd
  from huggingface_hub import login
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     AutoProcessor,
+     LlavaForConditionalGeneration,
+     WhisperProcessor,
+     WhisperForConditionalGeneration,
+     SpeechT5Processor,
+     SpeechT5ForTextToSpeech,
+     SpeechT5HifiGan
+ )
+ from datasets import load_dataset
  import os
+ import scipy.io.wavfile as wavfile
+ import io
+
+ # --- Configuration & Model Loading ---
+
+ # Hugging Face Hub login
  token = os.getenv("HF_API_TOKEN")
  if token:
      login(token=token)
  else:
+     print("Warning: HF_API_TOKEN not set. Some models might require login.")
+
+ # Device configuration
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ print(f"Using device: {device}")
+
+ # 1. Text generation model: microsoft/Phi-4-reasoning-plus
+ text_model_name = "microsoft/Phi-4-reasoning-plus"
+ print(f"Loading text model: {text_model_name}")
+ text_tokenizer = AutoTokenizer.from_pretrained(text_model_name)
+ if text_tokenizer.pad_token is None:
+     text_tokenizer.pad_token = text_tokenizer.eos_token  # defensive: some causal-LM tokenizers ship without a pad token
+ text_model = AutoModelForCausalLM.from_pretrained(
+     text_model_name,
+     torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,  # bfloat16 for better performance if available
+     device_map="auto",  # automatically uses available GPUs or CPU (requires accelerate)
+     trust_remote_code=True  # Phi models may require this
+ )
+ print("Text model loaded.")
+
+ # 2. Image analysis model: llava-hf/llava-1.5-7b-hf
+ # LLaVA-1.5 checkpoints pair with the Llava classes; the LlavaNext classes target the v1.6 checkpoints.
+ image_model_name = "llava-hf/llava-1.5-7b-hf"
+ print(f"Loading image model: {image_model_name}")
+ image_processor = AutoProcessor.from_pretrained(image_model_name)
+ image_model = LlavaForConditionalGeneration.from_pretrained(
+     image_model_name,
+     torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+     device_map="auto"
+ )
+ print("Image model loaded.")
+
+ # 3. Speech-to-text model: openai/whisper-large-v3
+ stt_model_name = "openai/whisper-large-v3"
+ print(f"Loading STT model: {stt_model_name}")
+ stt_processor = WhisperProcessor.from_pretrained(stt_model_name)
+ stt_model = WhisperForConditionalGeneration.from_pretrained(stt_model_name).to(device)
+ stt_model.config.forced_decoder_ids = None  # let generate() choose language/task tokens (multilingual use)
+ print("STT model loaded.")
+
+ # 4. Text-to-speech models: microsoft/speecht5_tts & microsoft/speecht5_hifigan
+ tts_processor_name = "microsoft/speecht5_tts"
+ tts_model_name = "microsoft/speecht5_tts"
+ tts_vocoder_name = "microsoft/speecht5_hifigan"
+
+ print(f"Loading TTS processor: {tts_processor_name}")
+ tts_processor = SpeechT5Processor.from_pretrained(tts_processor_name)
+ print(f"Loading TTS model: {tts_model_name}")
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_model_name).to(device)
+ print(f"Loading TTS vocoder: {tts_vocoder_name}")
+ tts_vocoder = SpeechT5HifiGan.from_pretrained(tts_vocoder_name).to(device)
+
+ # Load speaker embeddings (x-vectors from the CMU ARCTIC dataset).
+ # Prefer the 'slt' (female) speaker if present; otherwise fall back to index 7306,
+ # a default commonly used in SpeechT5 examples.
+ print("Loading speaker embeddings for TTS...")
+ try:
+     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+     slt_speaker_embedding = None
+     for i in range(len(embeddings_dataset)):
+         if embeddings_dataset[i]["speaker"] == "slt":
+             slt_speaker_embedding = torch.tensor(embeddings_dataset[i]["xvector"]).unsqueeze(0).to(device)
+             print("Found 'slt' speaker embedding.")
+             break
+     if slt_speaker_embedding is None:
+         print("Could not find 'slt' speaker, using default index 7306.")
+         slt_speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(device)
+     speaker_embeddings = slt_speaker_embedding
+
+ except Exception as e:
+     print(f"Could not load speaker embeddings: {e}. TTS might not work optimally or at all.")
+     speaker_embeddings = torch.randn((1, 512)).to(device)  # fallback to random embeddings
+ print("TTS components loaded.")

+ # --- Helper Functions for Model Inference ---
+
+ # 1. Text generation
+ def generate_text_response(prompt_text):
      try:
+         # Phi-4-reasoning-plus expects a chat-formatted prompt. The model card's
+         # reasoning-oriented system prompt is simplified here for general chat.
+         system_prompt = "You are a helpful AI assistant. Please provide a clear and friendly answer."
+         messages = [
+             {"role": "system", "content": system_prompt},
+             {"role": "user", "content": f"أجب على السؤال التالي بطريقة ودية وواضحة: {prompt_text}"}
+         ]
+
+         # Render the chat with the tokenizer's chat template, then tokenize.
+         prompt_for_model = text_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = text_tokenizer(prompt_for_model, return_tensors="pt", padding=True, truncation=True, max_length=2048).to(text_model.device)
+
+         outputs = text_model.generate(
+             **inputs,
+             max_new_tokens=300,  # allow longer answers than the previous 200-token cap
+             # The Phi-4 model card suggests temperature=0.8 / top_p=0.95 for reasoning;
+             # slightly more conservative sampling is used here for chat.
+             temperature=0.7,
+             top_k=50,
+             do_sample=True,
+             pad_token_id=text_tokenizer.eos_token_id
+         )
+         # Decode only the newly generated tokens, skipping the prompt.
+         response_text = text_tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+         return response_text.strip()
      except Exception as e:
+         print(f"Error in text generation: {e}")
          return f"خطأ في معالجة النص: {str(e)}"

+ # 2. Image analysis
+ def analyze_image(pil_image, question_text=None):
      try:
+         if pil_image is None:
+             return "الرجاء رفع صورة أولاً."
+         # Convert numpy array from Gradio to PIL Image if necessary
+         if isinstance(pil_image, np.ndarray):
+             pil_image = Image.fromarray(pil_image).convert("RGB")
+         else:  # assuming it's already a PIL image object from gr.Image
+             pil_image = pil_image.convert("RGB")
+
+         if not question_text or question_text.strip() == "":
+             prompt_for_model = "USER: <image>\nصف محتوى الصورة بالتفصيل\nASSISTANT:"
+         else:
+             prompt_for_model = f"USER: <image>\n{question_text}\nASSISTANT:"
+
+         inputs = image_processor(text=prompt_for_model, images=pil_image, return_tensors="pt").to(image_model.device)
+         outputs = image_model.generate(**inputs, max_new_tokens=100)
+         response_text = image_processor.decode(outputs[0], skip_special_tokens=True)
+         # LLaVA responses often include the prompt; the answer starts after "ASSISTANT:".
+         assistant_marker = "ASSISTANT:"
+         if assistant_marker in response_text:
+             response_text = response_text.split(assistant_marker, 1)[-1].strip()
+         return response_text
      except Exception as e:
+         print(f"Error in image analysis: {e}")
          return f"خطأ في تحليل الصورة: {str(e)}"

+ # 3. Audio processing (STT and TTS)
+ def process_audio(audio_input):
      try:
+         if audio_input is None:
+             return "الرجاء تسجيل الصوت أولاً.", "", (16000, np.array([], dtype=np.int16))
+
+         sample_rate, audio_data = audio_input
+
+         # Ensure audio_data is float32 and normalized for Whisper
+         if audio_data.dtype != np.float32:
+             audio_data = audio_data.astype(np.float32)
+         if np.max(np.abs(audio_data)) > 0:
+             audio_data = audio_data / np.max(np.abs(audio_data))
+         else:  # handle silent audio
+             return "تم استقبال صوت صامت.", "", (16000, np.array([], dtype=np.int16))
+
+         # Speech-to-text
+         # Note: Whisper's feature extractor expects 16 kHz input; microphone audio at other
+         # rates should be resampled first (see the note below the diff).
+         input_features = stt_processor(audio_data, sampling_rate=sample_rate, return_tensors="pt").input_features.to(device)
+         # Generate token ids, specifying the language for better accuracy when it is known
+         predicted_ids = stt_model.generate(input_features, language="ar")
+         # Decode token ids to text
+         transcription = stt_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+         transcription = transcription.strip()
+
+         if not transcription:
+             return "لم يتمكن النموذج من استخراج نص من الصوت.", "", (16000, np.array([], dtype=np.int16))
+
+         # Text generation (response to the transcription)
          text_response = generate_text_response(transcription)
+
+         # Text-to-speech
+         # Note: SpeechT5 is trained on English text, so Arabic responses may not synthesize well.
+         inputs = tts_processor(text=text_response, return_tensors="pt")
+         speech_values = tts_model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=tts_vocoder)
+
+         # Ensure output is a NumPy array; SpeechT5 outputs 16 kHz audio
+         audio_output_np = speech_values.cpu().numpy()
+         tts_sample_rate = 16000
+
+         return transcription, text_response, (tts_sample_rate, audio_output_np)
      except Exception as e:
+         print(f"Error in audio processing: {e}")
+         # Return empty audio with a valid sample rate for Gradio
+         empty_audio_data = np.array([], dtype=np.float32)
+         return f"خطأ في معالجة الصوت: {str(e)}", "", (16000, empty_audio_data)

+ # 4. File processing
+ def process_file(file_obj):
      try:
+         if file_obj is None:
+             return "الرجاء رفع ملف أولاً."
+         file_path = file_obj.name  # Gradio File objects expose the temp path via .name
+         text_content = ""
+         if file_path.endswith(".pdf"):
+             with fitz.open(file_path) as doc:
+                 text_content = "\n".join(page.get_text() for page in doc)
+         elif file_path.endswith((".xlsx", ".xls")):
+             df = pd.read_excel(file_path)
+             text_content = df.to_string()
+         elif file_path.endswith(".csv"):
+             df = pd.read_csv(file_path)
+             text_content = df.to_string()
          else:
+             return "❌ نوع الملف غير مدعوم حالياً (يدعم PDF, Excel, CSV)."
+
+         if not text_content.strip():
+             return "الملف فارغ أو لا يمكن قراءة محتواه النصي."

+         # Summarize the content; limit the context size for the model if necessary
+         max_context_len = 1500  # adjust based on model limits and typical file sizes
+         if len(text_content) > max_context_len:
+             text_content = text_content[:max_context_len] + "... [المحتوى تم اختصاره]"
+
+         response = generate_text_response(f"لخص المحتوى التالي من الملف: \n\n{text_content}")
          return response
      except Exception as e:
+         print(f"Error processing file: {e}")
          return f"خطأ في قراءة الملف: {str(e)}"

+ # --- Gradio Interface ---
+ with gr.Blocks(css=".gradio-container {background-color: #f0f4f8; font-family: Arial; color: #333; padding: 20px;}", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🤖 Kemo Chat V2 - مساعد ذكي متعدد الوسائط (بأدوات Hugging Face الأحدث)")
+     gr.Markdown("🎯 تفاعل معي عبر النصوص، الصور، الصوت أو الملفات! يدعم العربية والإنجليزية (النماذج قد تتفاوت في دعم اللغات).")
+     gr.Markdown("📁 يدعم الملفات: PDF، Excel، CSV\n🖼️ يدعم الوصف الذكي للصور والسؤال عنها\n🎙️ تحويل الصوت إلى نص والرد صوتياً")

      with gr.Tab("💬 المحادثة النصية"):
+         text_input = gr.Textbox(label="اكتب سؤالك أو رسالتك هنا", lines=3)
+         text_output = gr.Textbox(label="الرد", lines=5, interactive=False)
+         text_submit = gr.Button("إرسال", variant="primary")
          text_submit.click(fn=generate_text_response, inputs=text_input, outputs=text_output)

      with gr.Tab("🖼️ تحليل الصور"):
+         gr.Markdown("ارفع صورة واسأل عنها أو اطلب وصفًا لها.")
+         with gr.Row():
+             image_input = gr.Image(type="pil", label="ارفع صورة")  # type="pil" for a PIL Image object
+             with gr.Column():
+                 image_question = gr.Textbox(label="اختياري: اسأل سؤال عن الصورة", lines=2)
+                 image_output = gr.Textbox(label="الوصف أو الإجابة", lines=5, interactive=False)
+         image_submit = gr.Button("تحليل الصورة", variant="primary")
          image_submit.click(fn=analyze_image, inputs=[image_input, image_question], outputs=image_output)

      with gr.Tab("🎤 التفاعل الصوتي"):
+         gr.Markdown("سجّل رسالة صوتية، سيتم تحويلها إلى نص، ثم الرد عليها نصيًا وصوتيًا.")
+         audio_input = gr.Audio(sources=["microphone"], type="numpy", label="سجّل رسالتك الصوتية")  # type="numpy" for (sr, data)
+         with gr.Row():
+             audio_transcription = gr.Textbox(label="النص المستخرج من صوتك", interactive=False, lines=2)
+             audio_text_response = gr.Textbox(label="الرد النصي على رسالتك", interactive=False, lines=3)
+         audio_output_player = gr.Audio(label="الرد الصوتي من المساعد", type="numpy", interactive=False)  # type="numpy" for (sr, data)
+         audio_submit = gr.Button("معالجة الصوت", variant="primary")
+         audio_submit.click(fn=process_audio,
+                            inputs=audio_input,
+                            outputs=[audio_transcription, audio_text_response, audio_output_player])

      with gr.Tab("📄 تحليل الملفات"):
+         gr.Markdown("ارفع ملف (PDF, Excel, CSV) وسأقوم بتلخيص محتواه أو الإجابة على أسئلتك حوله.")
+         file_input = gr.File(label="ارفع ملفك (PDF, Excel, CSV)", file_types=[".pdf", ".xls", ".xlsx", ".csv"])
+         file_output = gr.Textbox(label="الرد على محتوى الملف", lines=5, interactive=False)
+         file_submit = gr.Button("تحليل الملف", variant="primary")
          file_submit.click(fn=process_file, inputs=file_input, outputs=file_output)

+ if __name__ == "__main__":
+     print("Launching Gradio Demo...")
+     demo.launch(share=True)  # share=True for a public link, remove if not needed
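
Two notes on the updated audio path, each with a minimal sketch that is not part of this commit.

Whisper's feature extractor expects 16 kHz input, while Gradio microphones typically record at 44.1 or 48 kHz, so `process_audio` may fail or transcribe poorly unless the audio is resampled first. A sketch using SciPy (already imported in app.py); the helper name `resample_to_16k` is illustrative:

```python
# Minimal resampling sketch (assumption: SciPy is available; helper name is illustrative).
import numpy as np
from scipy.signal import resample_poly

WHISPER_SR = 16000

def resample_to_16k(audio_data: np.ndarray, sample_rate: int) -> np.ndarray:
    """Resample mono float32 audio to the 16 kHz rate Whisper expects."""
    if sample_rate == WHISPER_SR:
        return audio_data
    # resample_poly takes integer up/down factors and reduces by their gcd internally.
    return resample_poly(audio_data, WHISPER_SR, sample_rate).astype(np.float32)

# Possible usage inside process_audio, before building input_features:
# audio_data = resample_to_16k(audio_data, sample_rate)
# input_features = stt_processor(audio_data, sampling_rate=WHISPER_SR, return_tensors="pt").input_features
```

The TTS stage uses microsoft/speecht5_tts, which is trained on English, so Arabic replies from the text model may not synthesize intelligibly. One possible swap, not used by this commit, is an Arabic VITS checkpoint such as facebook/mms-tts-ara; a sketch of the usual transformers usage, assuming that checkpoint fits the Space's hardware:

```python
# Sketch of an Arabic-capable TTS swap (assumption: facebook/mms-tts-ara via VitsModel).
import torch
from transformers import VitsModel, AutoTokenizer

mms_model = VitsModel.from_pretrained("facebook/mms-tts-ara")
mms_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-ara")

def synthesize_arabic(text: str):
    """Return (sample_rate, waveform) for an Arabic string."""
    inputs = mms_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = mms_model(**inputs).waveform  # shape: (1, num_samples)
    # The output rate is reported in the model config (typically 16 kHz for MMS-TTS).
    return mms_model.config.sampling_rate, waveform.squeeze(0).numpy()
```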