prasanacodes committed on
Commit
262ccb4
·
verified ·
1 Parent(s): 54304bd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -22
app.py CHANGED
@@ -6,6 +6,9 @@ import ffmpeg
6
  import nltk
7
  import re
8
  from deep_translator import MyMemoryTranslator
 
 
 
9
 
10
  # You only need to run this download command once
11
  nltk.download('punkt_tab')
@@ -54,6 +57,34 @@ def transcribe_audio(audio_path):
54
  """
55
  # Initialize the ASR pipeline from Hugging Face Transformers
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  transcriber = pipeline(
58
  "automatic-speech-recognition",
59
  model="openai/whisper-large-v3-turbo",
@@ -66,16 +97,19 @@ def transcribe_audio(audio_path):
66
  return "No audio file provided. Please upload or record an audio file."
67
 
68
  print(f"Transcribing audio file: {audio_path}")
69
- try:
70
- # The pipeline handles all the complex steps of loading and processing the audio
71
- result = transcriber(audio_path)
72
- # The result is a dictionary, and we need the 'text' key
73
- transcription = result["text"]
74
- print(f"Transcription successful: {transcription}")
75
- return transcription
76
- except Exception as e:
77
- print(f"An error occurred during transcription: {e}")
78
- return f"Sorry, an error occurred. Please try again. Details: {str(e)}"
 
 
 
79
 
80
  def lang_select(target_lang):
81
  LANGUAGE_NAME_TO_CODE = {
@@ -128,17 +162,38 @@ def translate_local(text_to_translate, target_lang='ta-IN', device=None):
128
 
129
  return translated_text
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  def main_run(video_path,target_lang):
132
  original_audio_file = extract_audio_from_video(video_path)
133
- original_text = transcribe_audio(original_audio_file)
134
  translated_text = translate_local(original_text,target_lang)
135
- (original_text,target_lang)
136
- return translated_text
137
 
138
  def audio_pipeline_run(audio_path,target_lang):
139
- original_text = transcribe_audio(audio_path)
140
  translated_text = translate_local(original_text,target_lang)
141
- return translated_text
 
142
 
143
 
144
  # --- Gradio Interface Definition ---
@@ -176,7 +231,7 @@ with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface:
176
  submit_btn_vid = gr.Button("Translate Video", variant="primary")
177
 
178
  with gr.Column():
179
- video_out = gr.Textbox(label="Translation")
180
 
181
  '''
182
  with gr.Column():
@@ -191,15 +246,10 @@ with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface:
191
  audio_in_pipe = gr.Audio(type="filepath", label="Input Audio")
192
  lang_radio_aud = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
193
  submit_btn_aud = gr.Button("Translate Audio", variant="primary")
194
-
195
- with gr.Column():
196
- audio_out_pipe = gr.Textbox(label="Translation")
197
 
198
- '''
199
  with gr.Column():
200
  audio_out_pipe = gr.Audio(label="Output Audio", interactive=False)
201
- '''
202
-
203
  submit_btn_aud.click(fn=audio_pipeline_run, inputs=[audio_in_pipe, lang_radio_aud], outputs=[audio_out_pipe])
204
 
205
  # --- Launch the App ---
 
6
  import nltk
7
  import re
8
  from deep_translator import MyMemoryTranslator
9
+ import soundfile as sf
10
+ from gradio_client_api import Client, handle_file
11
+
12
 
13
  # You only need to run this download command once
14
  nltk.download('punkt_tab')
 
57
  """
58
  # Initialize the ASR pipeline from Hugging Face Transformers
59
 
60
# Exclusive upper WPM bound for each speaking-pace label, in ascending order.
# Keyword arguments keep their order, so the table iterates from slowest to
# fastest; the final infinite bound acts as the catch-all bucket.
THRESHOLDS = dict(
    very_slow=80,
    slow=110,
    normal=150,
    fast=200,
    very_fast=float("inf"),
)
67
+
68
def get_audio_duration(path: str) -> float:
    """Return the length, in seconds, of the audio file at *path*.

    The file is opened with ``soundfile``; the duration is its frame count
    divided by its sample rate. The handle is closed before returning.
    """
    with sf.SoundFile(path) as audio:
        frame_count = len(audio)
        sample_rate = audio.samplerate
    return frame_count / sample_rate
72
+
73
def compute_wpm(transcript: str, duration_s: float) -> float:
    """Return the speaking rate of *transcript* in words per minute.

    Words are whitespace-separated tokens. An empty transcript or a zero
    duration yields ``0.0`` instead of dividing by zero.
    """
    has_input = bool(transcript) and duration_s != 0
    if not has_input:
        return 0.0
    word_count = len(transcript.strip().split())
    minutes = duration_s / 60.0
    return word_count / minutes
79
+
80
def categorize_wpm(wpm: float) -> str:
    """Return the pace label for *wpm*.

    Walks ``THRESHOLDS`` in insertion order and picks the first label whose
    bound is strictly greater than *wpm*. Falls back to ``"unknown"`` when no
    bound compares greater.
    """
    candidates = (label for label, upper in THRESHOLDS.items() if wpm < upper)
    return next(candidates, "unknown")
86
+
87
+
88
  transcriber = pipeline(
89
  "automatic-speech-recognition",
90
  model="openai/whisper-large-v3-turbo",
 
97
  return "No audio file provided. Please upload or record an audio file."
98
 
99
  print(f"Transcribing audio file: {audio_path}")
100
+
101
+ # The pipeline handles all the complex steps of loading and processing the audio
102
+ result = transcriber(audio_path)
103
+ # The result is a dictionary, and we need the 'text' key
104
+ transcription = result["text"]
105
+ print(f"Transcription successful: {transcription}")
106
+
107
+ duration_s = get_audio_duration(audio_path)
108
+ wpm = compute_wpm(transcribed_text, duration_s)
109
+ pace = categorize_wpm(wpm)
110
+ print(f" > Pace detected: {pace.upper()} ({wpm:.1f} WPM)")
111
+
112
+ return transcription, pace
113
 
114
  def lang_select(target_lang):
115
  LANGUAGE_NAME_TO_CODE = {
 
162
 
163
  return translated_text
164
 
165
def synthesize_speech(synth_text, target_lang, gender="Male", pace="normal", output_path="temp_audio_synthesized.wav", device=None):
    """Synthesize *synth_text* through the ``ai4bharat/IndicF5`` Space.

    A reference voice clip and its transcript are read from
    ``reference/<TARGET_LANG>/<GENDER>_<PACE>.wav`` and the matching ``.txt``
    file (all three name parts upper-cased).

    Args:
        synth_text: Text to be spoken.
        target_lang: Language name; selects the reference sub-folder.
        gender: Voice gender part of the reference file name.
        pace: Pace label part of the reference file name.
        output_path: Not used by this function; kept for compatibility.
        device: Not used by this function; kept for compatibility.

    Returns:
        The value returned by ``client.predict`` for ``/synthesize_speech``
        (presumably a path to the generated audio; verify against the Space).

    Raises:
        FileNotFoundError: If the reference clip or its transcript is missing.
            Previously this case surfaced as an ``UnboundLocalError`` on
            ``ref_text`` when building the request.
    """
    ref_stem = f"reference/{target_lang.upper()}/{gender.upper()}_{pace.upper()}"
    ref_audio_path = ref_stem + ".wav"
    ref_text_path = ref_stem + ".txt"

    # Fail early with an actionable message instead of crashing later on an
    # unbound ``ref_text``.
    missing = [p for p in (ref_audio_path, ref_text_path) if not os.path.exists(p)]
    if missing:
        raise FileNotFoundError(
            "Missing reference file(s) for speech synthesis: " + ", ".join(missing)
        )

    with open(ref_text_path, encoding="utf-8") as f:
        ref_text = f.read()

    # The remote call is made only after the transcript file has been closed.
    client = Client("ai4bharat/IndicF5")
    result = client.predict(
        text=synth_text,
        ref_audio=handle_file(ref_audio_path),
        ref_text=ref_text,
        api_name="/synthesize_speech",
    )
    print("synthesize_speech saved to ", result)
    return result
184
+
185
  def main_run(video_path,target_lang):
186
  original_audio_file = extract_audio_from_video(video_path)
187
+ original_text , pace = transcribe_audio(original_audio_file)
188
  translated_text = translate_local(original_text,target_lang)
189
+ translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
190
+ return translated_audio
191
 
192
  def audio_pipeline_run(audio_path,target_lang):
193
+ original_text , pace = transcribe_audio(audio_path)
194
  translated_text = translate_local(original_text,target_lang)
195
+ translated_audio = synthesize_speech(translated_text, target_lang, "Male", pace)
196
+ return translated_audio
197
 
198
 
199
  # --- Gradio Interface Definition ---
 
231
  submit_btn_vid = gr.Button("Translate Video", variant="primary")
232
 
233
  with gr.Column():
234
+ video_out = gr.Audio(label="Output Audio", interactive=False)
235
 
236
  '''
237
  with gr.Column():
 
246
  audio_in_pipe = gr.Audio(type="filepath", label="Input Audio")
247
  lang_radio_aud = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
248
  submit_btn_aud = gr.Button("Translate Audio", variant="primary")
 
 
 
249
 
 
250
  with gr.Column():
251
  audio_out_pipe = gr.Audio(label="Output Audio", interactive=False)
252
+
 
253
  submit_btn_aud.click(fn=audio_pipeline_run, inputs=[audio_in_pipe, lang_radio_aud], outputs=[audio_out_pipe])
254
 
255
  # --- Launch the App ---