# app.py import gradio as gr from transformers import pipeline, AutoModel import torch import ffmpeg import nltk import re from deep_translator import MyMemoryTranslator import num2words import soundfile as sf from gradio_client import Client, handle_file from openvoice_cli.__main__ import tune_one import pyrubberband as rb import librosa import os import numpy as np # You only need to run this download command once nltk.download('punkt_tab') # --- Model Loading --- # We load the model once when the app starts, not on every function call. # This makes the app much more efficient. # We also check for GPU availability to speed things up if possible. device = "cuda:0" if torch.cuda.is_available() else "cpu" torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 print(f"Using device: {device}") # --- Transcription Function --- def extract_audio_from_video(video_path, output_audio_path="temp_extracted_audio.wav"): """ Extracts audio from a video file using python-ffmpeg. """ print(f"\n[STEP 1/9] Extracting audio from video: {video_path}") try: ( ffmpeg .input(video_path) .output( output_audio_path, vn=None, # Disable video acodec='mp3', # Audio codec ab='192k', # Audio bitrate ar='44100', # Sample rate ac=2, # Audio channels f='wav' # Output format ) .run(overwrite_output=True, quiet=True) ) print(f"✅ Audio extracted successfully to: {output_audio_path}") return output_audio_path except ffmpeg.Error as e: print(f"Error: Failed to extract audio from video. stderr: {e.stderr.decode('utf8')}") return None def transcribe_audio(audio_path): """ This function takes an audio file path, transcribes it using the Whisper model, and returns the transcribed text. """ # Initialize the ASR pipeline from Hugging Face Transformers THRESHOLDS = { "very_slow": 80, "slow": 110, "normal": 150, "fast": 200, "very_fast": float("inf") } def get_audio_duration(path: str) -> float: """Return duration of audio file in seconds.""" with sf.SoundFile(path) as f: return len(f) / f.samplerate def compute_wpm(transcript: str, duration_s: float) -> float: """Compute words per minute.""" if not transcript or duration_s == 0: return 0.0 words = transcript.strip().split() return len(words) / (duration_s / 60.0) def categorize_wpm(wpm: float) -> str: """Map a WPM value to one of the pace categories.""" for label, threshold in THRESHOLDS.items(): if wpm < threshold: return label return "unknown" transcriber = pipeline( "automatic-speech-recognition", model="openai/whisper-large-v3-turbo", torch_dtype=torch_dtype, device=device, generate_kwargs={"language": "english"}, ) if audio_path is None: return "No audio file provided. Please upload or record an audio file." print(f"Transcribing audio file: {audio_path}") # The pipeline handles all the complex steps of loading and processing the audio result = transcriber(audio_path) # The result is a dictionary, and we need the 'text' key transcription = result["text"] print(f"✅ Transcription successful: {transcription}") duration_s = get_audio_duration(audio_path) wpm = compute_wpm(transcription, duration_s) pace = categorize_wpm(wpm) print(f"✅ > Pace detected: {pace.upper()} ({wpm:.1f} WPM)") return transcription, pace def lang_select(target_lang): LANGUAGE_NAME_TO_CODE = { "Bengali": "bn-IN", "English": "en-IN", "Gujarati": "gu-IN", "Hindi": "hi-IN", "Kannada": "kn-IN", "Malayalam": "ml-IN", "Marathi": "mr-IN", "Odia": "or-IN", "Punjabi": "pa-IN", "Tamil": "ta-IN", "Telugu": "te-IN" } return LANGUAGE_NAME_TO_CODE[target_lang] def translate_local(text_to_translate, target_lang='ta-IN', device=None): """ Translates text from English to a target language, handling texts longer than 500 characters by splitting them into sentence-based chunks. """ # 1. Pre-process the text (same as your original code) text_to_translate = re.sub(r'\d+', lambda match: num2words(int(match.group(0))), text_to_translate) target_lang=lang_select(target_lang.capitalize()) # 2. Split the entire text into individual sentences sentences = nltk.sent_tokenize(text_to_translate) # 3. Group sentences into chunks under 500 characters chunks = [] current_chunk = "" for sentence in sentences: # Check if adding the next sentence exceeds the limit if len(current_chunk) + len(sentence) + 1 < 500: current_chunk += sentence + " " else: # If it exceeds, add the current chunk to the list and start a new one chunks.append(current_chunk.strip()) current_chunk = sentence + " " # Add the last remaining chunk to the list if current_chunk: chunks.append(current_chunk.strip()) # 4. Translate each chunk and combine the results translator = MyMemoryTranslator(source='en-GB', target=target_lang) translated_chunks = [] for chunk in chunks: try: translated_chunks.append(translator.translate(chunk)) except Exception as e: print(f"Could not translate chunk: {chunk}\nError: {e}") translated_chunks.append("") # Add an empty string on error translated_text = " ".join(translated_chunks) print(f"✅ Translated Text to {target_lang} Successfully") return translated_text def synthesize_speech(synth_text, target_lang, pace="normal", output_path="temp_audio_synthesized.wav", device="cpu"): ref_audio_path = str('reference/TAMIL/MALE_'+pace.upper()+'.wav') ref_text_path = str('reference/TAMIL/MALE_'+pace.upper()+'.txt') ref_audio_path = ref_audio_path with open(ref_text_path, encoding='utf-8') as f: ref_text = f.read() print("> Loading IndicF5 TTS model (ai4bharat/IndicF5)...") indicf5_repo_id = "ai4bharat/IndicF5" token = os.environ.get("HF_TOKEN") tts_model = AutoModel.from_pretrained(indicf5_repo_id, trust_remote_code=True).to(device) audio = tts_model(synth_text, ref_audio_path=ref_audio_path, ref_text=ref_text) if audio.dtype == np.int16: audio = audio.astype(np.float32) / 32768.0 sf.write(output_path, np.array(audio, dtype=np.float32), samplerate=24000) print(f"✅ Speech synthesis complete.") print(f"> Final audio saved to: {output_path}") return output_path def match_audio_duration(original_path, translated_path, output_path="temp_audio_synced.wav"): """ Matches Synthesized Audio duration to Original Audio duration """ print("\n[STEP 7/9] Syncing Audio durations") # Load original audio original_audio, original_sr = librosa.load(original_path, sr=None) original_duration = librosa.get_duration(y=original_audio, sr=original_sr) print(f"Original audio duration: {original_duration:.2f} seconds") # Load translated audio translated_audio, translated_sr = librosa.load(translated_path, sr=None) translated_duration = librosa.get_duration(y=translated_audio, sr=translated_sr) print(f"Translated audio duration: {translated_duration:.2f} seconds") # Compute the speed-up/slow-down rate # If rate > 1.0, audio is sped up. If rate < 1.0, audio is slowed down. rate = translated_duration / original_duration print(f"Stretch rate: {rate:.4f}") # Apply time-stretch using the high-quality rubberband library # The parameters are: audio_data, sample_rate, and the desired rate adjusted_audio = rb.time_stretch(translated_audio, translated_sr, rate=rate) # Save output # The sample rate remains the same as the translated audio's original rate sf.write(output_path, adjusted_audio, translated_sr) print(f"✅ Duration Adjusted audio saved as: {output_path}") return output_path def clone_voice(translated_audio_path, original_audio_path, output_path="temp_audio_cloned.wav", device="cpu"): print("Cloning Voice") # Convert the tone color of a single audio file tune_one(input_file=translated_audio_path, ref_file=original_audio_path, output_file=output_path, device=device) print(f"✅ Voice cloned audio saved to {output_path}") return output_path def merge_audio_video(video_path, audio_path, output_path="temp_merged.mp4"): """ Merges an audio file with a video file into a single output video. """ print("\n[STEP] Merging audio and video...") video_input = ffmpeg.input(video_path) audio_input = ffmpeg.input(audio_path) ( ffmpeg.output(video_input.video, audio_input.audio, output_path, vcodec='copy', acodec='aac', shortest=None) .run(overwrite_output=True, quiet=True) ) print(f"✅ Merged video saved to {output_path}") return output_path def main_run(video_path,target_lang,user_transcript=None, user_translation=None): original_audio_file = extract_audio_from_video(video_path) if user_transcript: original_text , pace = transcribe_audio(original_audio_file) original_text = user_transcript print(f"Using provided transcript: {original_text}") else: original_text , pace = transcribe_audio(original_audio_file) if user_translation: translated_text = user_translation print(f"Using provided translation: {translated_text}") else: translated_text = translate_local(original_text,target_lang) print(f"Translated Text: {translated_text}") translated_audio = synthesize_speech(translated_text, target_lang, pace) synced_translated_audio = match_audio_duration(original_audio_file, translated_audio) cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file) final_video_nobgm = merge_audio_video(video_path, cloned_synced_translated_audio) print(f"✅ Pipeline finished") return final_video_nobgm def audio_pipeline_run(audio_path,target_lang,user_transcript=None, user_translation=None): if user_transcript: original_text , pace = transcribe_audio(audio_path) original_text = user_transcript print(f"Using provided transcript: {original_text}") else: original_text , pace = transcribe_audio(audio_path) if user_translation: translated_text = user_translation print(f"Using provided translation: {translated_text}") else: translated_text = translate_local(original_text,target_lang) print(f"Translated Text: {translated_text}") translated_audio = synthesize_speech(translated_text, target_lang, pace) synced_translated_audio = match_audio_duration(audio_path, translated_audio) cloned_synced_translated_audio = clone_voice(synced_translated_audio, audio_path) print(f"✅ Pipeline finished") return cloned_synced_translated_audio # --- Gradio Interface Definition --- # Title and description for the new Space title = "Custom Whisper Transcription App" description = """ This is a custom Gradio app that uses the openai/whisper-large-v2 model from the Hugging Face Hub for transcription. Upload an audio file or record directly from your microphone to get the transcript. """ article = "

Model Card

" # Create the Gradio interface with our custom function # We define the input as an Audio component and the output as a Textbox ''' app_interface = gr.Interface( fn=main_run, inputs=gr.Video(label="Upload Video"),gr. outputs=gr.Textbox(label="Translation Result"),gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil") title=title, description=description, article=article, allow_flagging="never" ) ''' with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface: gr.Markdown("# 🚀 Audio/Video Translation Toolkit") gr.Markdown("This might take a while to generate as it's running on the free tier.") gr.Markdown("Please input only English Audio/Video under 30secs.") gr.Markdown("Time taken for 10secs of audio/video is 5-10 mins.") with gr.Tabs(): with gr.Tab("🎬 Translate Video"): with gr.Column(): with gr.Row(): video_in = gr.Video(label="Input Video", height=500) video_out = gr.Video(label="Output Video", interactive=False, height=500) with gr.Row(): # Radio buttons for selecting target language # This allows users to choose one of the mutually exclusive options lang_radio_vid = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil") # Single-select option for mutually exclusive choices option_select = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input") # Textboxes for user input, initially hidden user_transcript_vid = gr.Textbox(label="Your English Transcript", lines=3, visible=False) user_translation_vid = gr.Textbox(label="Your Translated Text", lines=3, visible=False) submit_btn_vid = gr.Button("Translate Video", variant="primary") # Toggle visibility based on selected option (only one can be active) option_select.change( fn=lambda choice: ( gr.update(visible=(choice == "Use my Transcript")), gr.update(visible=(choice == "Use my Translation")), ), inputs=option_select, outputs=[user_transcript_vid, user_translation_vid], ) # Include the optional transcript/translation textboxes as inputs (they may be hidden) submit_btn_vid.click(fn=main_run, inputs=[video_in, lang_radio_vid, user_transcript_vid, user_translation_vid], outputs=[video_out]) with gr.Tab("🎵 Translate Audio"): with gr.Column(): with gr.Row(): audio_in = gr.Audio(label="Input Audio") audio_out = gr.Audio(label="Output Audio", interactive=False) with gr.Row(): # Radio buttons for selecting target language # This allows users to choose one of the mutually exclusive options lang_radio_aud = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil") # Single-select option for mutually exclusive choices option_select = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input") # Textboxes for user input, initially hidden user_transcript_aud = gr.Textbox(label="Your English Transcript", lines=3, visible=False) user_translation_aud = gr.Textbox(label="Your Translated Text", lines=3, visible=False) submit_btn_aud = gr.Button("Translate Audio", variant="primary") # Toggle visibility based on selected option (only one can be active) option_select.change( fn=lambda choice: ( gr.update(visible=(choice == "Use my Transcript")), gr.update(visible=(choice == "Use my Translation")), ), inputs=option_select, outputs=[user_transcript_aud, user_translation_aud], ) submit_btn_aud.click(fn=audio_pipeline_run, inputs=[audio_in, lang_radio_aud, user_transcript_aud, user_translation_aud], outputs=[audio_out]) with gr.Tab("✂️ Extract Audio"): with gr.Row(): video_in_ext = gr.Video(label="Input Video", height=500) audio_out_ext = gr.Audio(label="Extracted Audio") btn_ext = gr.Button("Extract", variant="secondary") btn_ext.click(fn=extract_audio_from_video, inputs=video_in_ext, outputs=audio_out_ext) with gr.Tab("✍️ Transcribe"): with gr.Row(): audio_in_trans = gr.Audio(type="filepath", label="Input Audio") with gr.Column(): text_out_trans = gr.Textbox(label="Transcription") text_out_pace = gr.Textbox(label="Detected Pace") btn_trans = gr.Button("Transcribe", variant="secondary") btn_trans.click(lambda aud: transcribe_audio(aud), inputs=audio_in_trans, outputs=[text_out_trans, text_out_pace]) with gr.Tab("🌐 Translate Text"): with gr.Row(): with gr.Column(): text_in_tran = gr.Textbox(label="Text to Translate", lines=5) lang_radio_tran = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil") btn_tran = gr.Button("Translate", variant="secondary") text_out_tran = gr.Textbox(label="Translated Text", lines=5, interactive=False) btn_tran.click(fn=translate_local, inputs=[text_in_tran, lang_radio_tran], outputs=text_out_tran) with gr.Tab("🔊 Synthesize Speech"): with gr.Column(): with gr.Row(): text_in_synth = gr.Textbox(label="Text to Synthesize", lines=5) audio_out_synth = gr.Audio(label="Synthesized Speech") with gr.Row(): lang_radio_tran = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil") gender_radio_tran = gr.Radio(choices=["Male", "Female"], label="Speaker Gender", value="Male") pace_radio_tran = gr.Radio(choices=["Very_Slow", "Slow", "Normal", "Fast", "Very_Fast"], label="Target Language", value="Normal") btn_synth = gr.Button("Synthesize", variant="secondary") btn_synth.click(fn=synthesize_speech, inputs=[text_in_synth,lang_radio_tran,pace_radio_tran], outputs=audio_out_synth) with gr.Tab("⏱️ Sync Duration"): with gr.Row(): audio_in_sync1 = gr.Audio(type="filepath", label="Original Audio (for duration reference)") audio_in_sync2 = gr.Audio(type="filepath", label="Translated Audio (to be resized)") audio_out_sync = gr.Audio(label="Duration-Synced Audio") btn_sync = gr.Button("Sync Duration", variant="secondary") btn_sync.click(fn=match_audio_duration, inputs=[audio_in_sync1, audio_in_sync2], outputs=audio_out_sync) with gr.Tab("🧬 Clone Voice"): with gr.Row(): audio_in_clone1 = gr.Audio(type="filepath", label="Target Audio (e.g., Synthesized Speech)") audio_in_clone2 = gr.Audio(type="filepath", label="Reference Audio (Original Speaker's Voice)") audio_out_clone = gr.Audio(label="Cloned Voice Audio") btn_clone = gr.Button("Clone Voice", variant="secondary") btn_clone.click(fn=clone_voice, inputs=[audio_in_clone1, audio_in_clone2], outputs=audio_out_clone) with gr.Tab("🎞️ Replace Audio"): with gr.Row(): video_in_rep = gr.Video(label="Input Video", height=500) audio_in_rep = gr.Audio(type="filepath", label="New Audio") video_out_rep = gr.Video(label="Video with Replaced Audio", height=500) btn_rep = gr.Button("Replace Audio", variant="secondary") btn_rep.click(fn=merge_audio_video, inputs=[video_in_rep, audio_in_rep], outputs=video_out_rep) # --- Launch the App --- if __name__ == "__main__": # The launch() method creates a web server and makes the interface accessible. app_interface.launch()