Waqas167 commited on
Commit
5bebbc8
·
verified ·
1 Parent(s): 1d0a9b2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +460 -0
  2. requirements.txt +14 -3
app.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import streamlit as st
2
+ # from transformers import AutoProcessor, Wav2Vec2ForCTC
3
+ # import torch
4
+ # import librosa
5
+ # import os
6
+ # from pydub import AudioSegment
7
+ # from moviepy.editor import VideoFileClip
8
+ # from google import genai
9
+ # from google.genai import types
10
+
11
+ # # ----------- Configuration -----------
12
+ # model_id = "facebook/mms-1b-l1107"
13
+ # lang_code = "urd-script_arabic"
14
+ # api_key = "<REDACTED-API-KEY>" # ⚠️ Replace with st.secrets for production — leaked key removed from history; rotate it
15
+
16
+ # # ----------- Load Processor and Model -----------
17
+ # @st.cache_resource
18
+ # def load_model_and_processor():
19
+ # processor = AutoProcessor.from_pretrained(model_id, target_lang=lang_code)
20
+ # model = Wav2Vec2ForCTC.from_pretrained(
21
+ # model_id,
22
+ # target_lang=lang_code,
23
+ # ignore_mismatched_sizes=True
24
+ # )
25
+ # model.load_adapter(lang_code)
26
+ # return processor, model
27
+
28
+ # processor, model = load_model_and_processor()
29
+
30
+ # # ----------- Audio Conversion -----------
31
+ # def get_wav_from_input(file_path, output_path="converted.wav"):
32
+ # ext = os.path.splitext(file_path)[-1].lower()
33
+ # if ext in [".mp4", ".mkv", ".avi", ".mov"]:
34
+ # video = VideoFileClip(file_path)
35
+ # video.audio.write_audiofile(output_path, fps=16000)
36
+ # elif ext in [".mp3", ".aac", ".flac", ".ogg", ".m4a"]:
37
+ # audio = AudioSegment.from_file(file_path)
38
+ # audio = audio.set_frame_rate(16000).set_channels(1)
39
+ # audio.export(output_path, format="wav")
40
+ # elif ext == ".wav":
41
+ # audio = AudioSegment.from_wav(file_path)
42
+ # audio.export(output_path, format="wav")
43
+ # else:
44
+ # raise ValueError("Unsupported file format.")
45
+ # return output_path
46
+
47
+ # # ----------- Transcription -----------
48
+ # def transcribe(file_path):
49
+ # wav_path = get_wav_from_input(file_path)
50
+ # audio, sr = librosa.load(wav_path, sr=16000)
51
+ # inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
52
+ # with torch.no_grad():
53
+ # logits = model(**inputs).logits
54
+ # pred_ids = torch.argmax(logits, dim=-1)
55
+ # return processor.batch_decode(pred_ids)[0]
56
+
57
+ # # ----------- Gemini Analysis -----------
58
+ # def analyze_transcript(transcript):
59
+ # client = genai.Client(api_key=api_key)
60
+
61
+ # system_instr = """
62
+ # You are a speech analyst. The following transcription is in Urdu and contains no punctuation — your first task is to correct the transcript by segmenting it into grammatically correct sentences.
63
+
64
+ # Then:
65
+ # 1. Translate the corrected Urdu transcript into English.
66
+ # 2. Determine whether the transcript involves a single speaker or multiple speakers.
67
+ # 3. If multiple speakers are detected, perform diarization by segmenting the transcript with clear speaker labels.
68
+
69
+ # ⚠️ Format the segmented transcript *exactly* like this:
70
+
71
+ # **Segmented Transcript**
72
+
73
+ # **Urdu:**
74
+ # Person 01:
75
+ # [Urdu line here]
76
+
77
+ # Person 02:
78
+ # [Urdu line here]
79
+
80
+ # ...
81
+
82
+ # **English:**
83
+ # Person 01:
84
+ # [English line here]
85
+
86
+ # Person 02:
87
+ # [English line here]
88
+
89
+ # ...
90
+
91
+ # After that, provide your analysis in the following format:
92
+
93
+ # **Speaker-wise Analysis**
94
+ # [One or two sentences per speaker about tone, emotion, behavior]
95
+
96
+ # **Sentiment and Communication Style**
97
+ # [Concise overall tone: e.g., friendly, formal, tense, etc.]
98
+
99
+ # **Summary of Discussion**
100
+ # [A 2–3 line summary of what the speakers talked about, in English]
101
+ # """
102
+
103
+ # response = client.models.generate_content(
104
+ # model="gemini-2.5-flash",
105
+ # contents=[transcript],
106
+ # config=types.GenerateContentConfig(
107
+ # system_instruction=system_instr,
108
+ # temperature=0.0
109
+ # )
110
+ # )
111
+ # return response.text
112
+
113
+ # # ----------- Format Display Helper -----------
114
+ # def format_transcript_block(text: str) -> str:
115
+ # lines = text.split("Person ")
116
+ # formatted = ""
117
+ # for line in lines:
118
+ # line = line.strip()
119
+ # if not line:
120
+ # continue
121
+ # if line.startswith("01:") or line.startswith("02:"):
122
+ # formatted += f"\n**Person {line[:2]}**:\n{line[3:].strip()}\n\n"
123
+ # else:
124
+ # formatted += f"{line.strip()}\n\n"
125
+ # return formatted
126
+
127
+ # # ----------- Streamlit UI -----------
128
+ # # Styled Header
129
+ # st.markdown("""
130
+ # <div style="text-align: left; padding-bottom: 1rem;">
131
+ # <h1 style='color:#1f77b4; font-size: 2.5em; font-weight: 800; margin-bottom: 0.2em;'>
132
+ # 🎙️ Urdu Audio & Video Speech Analyzer
133
+ # </h1>
134
+ # <p style='color: #CCCCCC; font-size: 1.05em; margin-top: 0;'>
135
+ # Upload Urdu audio or video to get structured transcription, speaker diarization, and smart AI analysis.
136
+ # </p>
137
+ # </div>
138
+ # """, unsafe_allow_html=True)
139
+
140
+ # # File Upload
141
+ # st.markdown("### 📂 Upload an audio or video file")
142
+ # with st.container():
143
+ # uploaded_file = st.file_uploader(
144
+ # label="",
145
+ # type=["mp3", "mp4", "wav", "mkv", "aac", "ogg", "m4a", "flac"],
146
+ # label_visibility="collapsed"
147
+ # )
148
+
149
+ # if uploaded_file is not None:
150
+ # with st.spinner("⏳ Transcribing..."):
151
+ # file_name = uploaded_file.name
152
+ # temp_path = f"temp_input{os.path.splitext(file_name)[-1]}"
153
+ # with open(temp_path, "wb") as f:
154
+ # f.write(uploaded_file.read())
155
+ # transcript = transcribe(temp_path)
156
+
157
+ # st.markdown("### 📝 Raw Urdu Transcription")
158
+ # st.text(transcript)
159
+
160
+ # with st.spinner("🔍 Analyzing with Gemini..."):
161
+ # report = analyze_transcript(transcript)
162
+
163
+ # # Extract Segmented Urdu and English
164
+ # segmented_urdu = ""
165
+ # segmented_english = ""
166
+ # analysis_only = ""
167
+
168
+ # if "Urdu:" in report and "English:" in report:
169
+ # urdu_start = report.find("Urdu:")
170
+ # english_start = report.find("English:")
171
+ # segmented_urdu = report[urdu_start + len("Urdu:"):english_start].strip()
172
+
173
+ # english_section = report[english_start + len("English:"):].strip()
174
+ # if "**Speaker-wise Analysis**" in english_section:
175
+ # parts = english_section.split("**Speaker-wise Analysis**")
176
+ # segmented_english = parts[0].strip()
177
+ # analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
178
+ # else:
179
+ # segmented_english = english_section.strip()
180
+ # analysis_only = "⚠️ Could not extract structured analysis."
181
+
182
+ # # Show Segmented Transcript
183
+ # if segmented_urdu and segmented_english:
184
+ # st.markdown("### 🗣️ Segmented Transcript")
185
+ # col1, col2 = st.columns(2)
186
+
187
+ # with col1:
188
+ # st.markdown("#### Urdu")
189
+ # st.markdown(format_transcript_block(segmented_urdu))
190
+
191
+ # with col2:
192
+ # st.markdown("#### English")
193
+ # st.markdown(format_transcript_block(segmented_english))
194
+
195
+ # # Show Gemini Analysis Only (No transcript repeat)
196
+ # if analysis_only:
197
+ # st.markdown("### 🧠 Gemini Analysis Summary")
198
+ # st.markdown(analysis_only)
199
+ # app.py
200
+
201
+ # api_key = "<REDACTED-API-KEY>"  # leaked key removed — rotate the compromised key
202
+
# ---------------- Imports & Config ----------------
import io, os, numpy as np, streamlit as st, librosa, torch, soundfile as sf
from transformers import AutoProcessor, Wav2Vec2ForCTC
from pydub import AudioSegment
from moviepy.editor import VideoFileClip
from google import genai
from google.genai import types

# ✅ programmatic Start/Stop mic (no WebRTC)
from streamlit_mic_recorder import mic_recorder

st.set_page_config(page_title="Urdu Speech Analyzer", page_icon="🎙️", layout="wide")
PAGE_TITLE = "🎙️ Urdu Audio & Video Speech Analyzer"

# MMS multilingual ASR checkpoint + Urdu (Arabic-script) adapter code.
model_id = "facebook/mms-1b-l1107"
lang_code = "urd-script_arabic"

# SECURITY FIX: the key was previously hard-coded in source (and committed).
# Read it from the environment or Streamlit secrets instead, and stop the app
# with a clear message if it is missing. The leaked key must be rotated.
api_key = os.environ.get("GEMINI_API_KEY", "")
if not api_key:
    try:
        api_key = st.secrets["GEMINI_API_KEY"]
    except Exception:
        st.error("GEMINI_API_KEY is not configured — set it in the environment or .streamlit/secrets.toml.")
        st.stop()
# ---------------- Model ----------------
@st.cache_resource
def load_model_and_processor():
    """Load the MMS processor and CTC model once per session.

    The checkpoint is multilingual; ``target_lang`` plus ``load_adapter``
    swap in the Urdu (Arabic-script) adapter weights.
    """
    proc = AutoProcessor.from_pretrained(model_id, target_lang=lang_code)
    asr_model = Wav2Vec2ForCTC.from_pretrained(
        model_id,
        target_lang=lang_code,
        # adapter head sizes differ from the base checkpoint
        ignore_mismatched_sizes=True,
    )
    asr_model.load_adapter(lang_code)
    return proc, asr_model

processor, model = load_model_and_processor()
# ---------------- Helpers ----------------
def get_wav_from_input(file_path, output_path="converted.wav"):
    """Convert a supported audio/video file to a 16 kHz mono WAV.

    Returns the path of the converted file.
    Raises ValueError for unsupported extensions or a video with no
    audio track.
    """
    ext = os.path.splitext(file_path)[-1].lower()
    if ext in [".mp4", ".mkv", ".avi", ".mov"]:
        video = VideoFileClip(file_path)
        try:
            # FIX: previously `video.audio` was dereferenced unchecked
            # (AttributeError on silent videos) and the clip was never
            # closed, leaking the underlying ffmpeg reader process.
            if video.audio is None:
                raise ValueError("Video file contains no audio track.")
            video.audio.write_audiofile(output_path, fps=16000)
        finally:
            video.close()
    elif ext in [".mp3", ".aac", ".flac", ".ogg", ".m4a", ".wav"]:
        # pydub handles .wav via from_file too — the separate wav branch
        # did exactly the same resample/mono/export steps.
        audio = AudioSegment.from_file(file_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(output_path, format="wav")
    else:
        raise ValueError("Unsupported file format.")
    return output_path
def save_wav_resampled(audio_f32: np.ndarray, sr_in: int, path: str):
    """Resample to 16 kHz if needed, peak-normalize, and write a float32 WAV."""
    samples = audio_f32
    if sr_in != 16000:
        samples = librosa.resample(samples, orig_sr=sr_in, target_sr=16000)
    # Peak normalization keeps quiet mic captures in a usable range.
    samples = librosa.util.normalize(samples)
    sf.write(path, samples.astype(np.float32), 16000)
def transcribe(wav_path) -> str:
    """Run MMS CTC inference on a 16 kHz WAV and return the decoded Urdu text."""
    waveform, rate = librosa.load(wav_path, sr=16000, mono=True)
    features = processor(waveform, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**features).logits
    predicted = logits.argmax(dim=-1)  # greedy CTC decode
    return processor.batch_decode(predicted)[0]
def analyze_transcript(transcript: str) -> str:
    """Send the raw Urdu transcript to Gemini and return its markdown report.

    The report contains segmented Urdu/English transcripts plus analysis
    sections. NOTE: the prompt wording is load-bearing — the UI later parses
    the response for the "Urdu:", "English:" and "**Speaker-wise Analysis**"
    markers, so do not reword the format instructions casually.
    """
    client = genai.Client(api_key=api_key)
    system_instr = """
You are a speech analyst. The following transcription is in Urdu and contains no punctuation — your first task is to correct the transcript by segmenting it into grammatically correct sentences.

Then:
1. Translate the corrected Urdu transcript into English.
2. Determine whether the transcript involves a single speaker or multiple speakers.
3. If multiple speakers are detected, perform diarization by segmenting the transcript with clear speaker labels.

⚠️ Format the segmented transcript *exactly* like this:

**Segmented Transcript**

**Urdu:**
Person 01:
[Urdu line here]

Person 02:
[Urdu line here]

...

**English:**
Person 01:
[English line here]

Person 02:
[English line here]

...

After that, provide your analysis in the following format:

**Speaker-wise Analysis**
[One or two sentences per speaker about tone, emotion, behavior]

**Sentiment and Communication Style**
[Concise overall tone: e.g., friendly, formal, tense, etc.]

**Summary of Discussion**
[A 2–3 line summary of what the speakers talked about, in English]
"""
    resp = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=[transcript],
        # temperature=0.0 keeps the structured output as deterministic as possible
        config=types.GenerateContentConfig(system_instruction=system_instr, temperature=0.0)
    )
    return resp.text
def format_transcript_block(text: str) -> str:
    """Render a "Person NN:"-segmented transcript as markdown.

    Splits on the literal "Person " marker and bolds each speaker label,
    placing the spoken line underneath. Text before the first marker (or
    without markers) is passed through as plain paragraphs.

    FIX: the original only recognized the literal prefixes "01:" and "02:",
    so "Person 03" and above were silently emitted without their label.
    Any two-digit "NN:" prefix is now treated as a speaker turn.
    """
    lines = text.split("Person ")
    out = ""
    for line in lines:
        line = line.strip()
        if not line:
            continue
        if len(line) >= 3 and line[:2].isdigit() and line[2] == ":":
            out += f"\n**Person {line[:2]}**:\n{line[3:].strip()}\n\n"
        else:
            out += f"{line}\n\n"
    return out
# ---------------- Header ----------------
# Styled page header; unsafe_allow_html is required because the title/subtitle
# are rendered as inline-styled HTML rather than plain markdown.
st.markdown(f"""
<div style="text-align: left; padding-bottom: 1rem;">
<h1 style='color:#1f77b4; font-size: 2.5em; font-weight: 800; margin-bottom: 0.2em;'>
{PAGE_TITLE}
</h1>
<p style='color: #7c8a98; font-size: 1.05em; margin-top: 0;'>
Record or upload Urdu speech for structured transcription, diarization, and smart AI analysis.
</p>
</div>
""", unsafe_allow_html=True)
# ================= Mic: true Start/Stop + narrow Analyze =================
st.markdown("### 🎤 Live recording")

# The component renders **Start** and **Stop** buttons and keeps recording
# until you press Stop; it returns the captured audio on the next rerun.
rec = mic_recorder(
    start_prompt="▶️ Start",
    stop_prompt="⏹️ Stop",
    just_once=False,  # allow multiple recordings in a session
    key="recorder",
    format="wav"  # returns WAV bytes
)

# `rec` returns after Stop. Different versions of the component return raw
# bytes or a dict — handle both shapes defensively.
audio_bytes, sr_in = None, 44100
if rec is not None:
    if isinstance(rec, dict) and "bytes" in rec:
        audio_bytes = rec["bytes"]
        sr_in = int(rec.get("sample_rate", 44100))
    elif isinstance(rec, (bytes, bytearray)):
        audio_bytes = rec
        sr_in = 44100  # component default sample rate — TODO confirm against installed version
    else:
        # fallback: try to extract .get("audio") etc if lib changes
        audio_bytes = rec.get("audio") if isinstance(rec, dict) else None

if audio_bytes:
    st.success("Audio captured.")
    # Decode the WAV bytes to mono float32 samples.
    data, sr_read = sf.read(io.BytesIO(audio_bytes), dtype="float32", always_2d=False)
    if data.ndim > 1:
        data = data.mean(axis=1)  # downmix stereo to mono
    if sr_read:  # prefer the rate embedded in the WAV header
        sr_in = sr_read

    # Save as 16 kHz mono for the model.
    tmp_wav = "mic_recording.wav"
    save_wav_resampled(data, sr_in, tmp_wav)

    # Minimal playback (no waveform).
    st.audio(audio_bytes, format="audio/wav")
    st.caption(f"Duration: {data.size / sr_in:.2f} s")

    # Slim Analyze button (not full width).
    if st.button("🔍 Analyze", type="primary"):
        with st.spinner("⏳ Transcribing & analyzing..."):
            transcript = transcribe(tmp_wav)  # raw transcript not displayed here
            report = analyze_transcript(transcript)

        # Parse Gemini's report by the section markers requested in the prompt
        # ("Urdu:", "English:", "**Speaker-wise Analysis**").
        segmented_urdu = segmented_english = analysis_only = ""
        if "Urdu:" in report and "English:" in report:
            u0 = report.find("Urdu:")
            e0 = report.find("English:")
            segmented_urdu = report[u0 + len("Urdu:"):e0].strip()
            english_section = report[e0 + len("English:"):].strip()
            if "**Speaker-wise Analysis**" in english_section:
                parts = english_section.split("**Speaker-wise Analysis**")
                segmented_english = parts[0].strip()
                analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
            else:
                segmented_english = english_section.strip()
                analysis_only = "⚠️ Could not extract structured analysis."

        # Two-column side-by-side Urdu/English rendering.
        if segmented_urdu or segmented_english:
            st.markdown("### 🗣️ Segmented Transcript")
            c1, c2 = st.columns(2)
            with c1:
                st.markdown("#### Urdu")
                st.markdown(format_transcript_block(segmented_urdu) if segmented_urdu else "_(none)_")
            with c2:
                st.markdown("#### English")
                st.markdown(format_transcript_block(segmented_english) if segmented_english else "_(none)_")
        if analysis_only:
            st.markdown("### 🧠 Gemini Analysis Summary")
            st.markdown(analysis_only)

st.markdown("---")
# ================= Upload (unchanged) =================
st.markdown("### 📂 Or upload an audio/video file")
uploaded_file = st.file_uploader(
    label="",
    type=["mp3", "mp4", "wav", "mkv", "aac", "ogg", "m4a", "flac"],
    label_visibility="collapsed"
)
if uploaded_file is not None:
    with st.spinner("⏳ Transcribing..."):
        # Persist the upload to disk so ffmpeg/pydub can read it by path.
        file_name = uploaded_file.name
        temp_path = f"temp_input{os.path.splitext(file_name)[-1]}"
        with open(temp_path, "wb") as f:
            f.write(uploaded_file.read())
        wav_path = get_wav_from_input(temp_path)
        transcript = transcribe(wav_path)

    with st.spinner("🔍 Analyzing with Gemini..."):
        report = analyze_transcript(transcript)

    # Same marker-based report parsing as the live-recording flow above.
    segmented_urdu = segmented_english = analysis_only = ""
    if "Urdu:" in report and "English:" in report:
        u0 = report.find("Urdu:")
        e0 = report.find("English:")
        segmented_urdu = report[u0 + len("Urdu:"):e0].strip()
        english_section = report[e0 + len("English:"):].strip()
        if "**Speaker-wise Analysis**" in english_section:
            parts = english_section.split("**Speaker-wise Analysis**")
            segmented_english = parts[0].strip()
            analysis_only = "**Speaker-wise Analysis**" + parts[1].strip()
        else:
            segmented_english = english_section.strip()
            analysis_only = "⚠️ Could not extract structured analysis."

    if segmented_urdu or segmented_english:
        st.markdown("### 🗣️ Segmented Transcript")
        c1, c2 = st.columns(2)
        with c1:
            st.markdown("#### Urdu")
            st.markdown(format_transcript_block(segmented_urdu) if segmented_urdu else "_(none)_")
        with c2:
            st.markdown("#### English")
            st.markdown(format_transcript_block(segmented_english) if segmented_english else "_(none)_")
    if analysis_only:
        st.markdown("### 🧠 Gemini Analysis Summary")
        st.markdown(analysis_only)
requirements.txt CHANGED
@@ -1,3 +1,14 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
+ streamlit
+ torch
+ torchaudio
+ accelerate
+ datasets
+ transformers>=4.41.0
+ moviepy==1.0.3
+ pydub
+ librosa
+ google-genai  # app.py uses the new SDK (`from google import genai`), not the legacy google-generativeai
+ streamlit-webrtc
+ av
+ soundfile
+ audio-recorder-streamlit
+ streamlit-mic-recorder  # required: app.py imports `streamlit_mic_recorder`