import gradio as gr
from transformers import pipeline, AutoModel
import torch
import ffmpeg
import nltk
import re
from deep_translator import MyMemoryTranslator
from num2words import num2words
import soundfile as sf
from gradio_client import Client, handle_file
from openvoice_cli.__main__ import tune_one
import pyrubberband as rb
import librosa
import os
import numpy as np

# Sentence tokenizer data needed by nltk.sent_tokenize (used when chunking text for translation).
nltk.download('punkt_tab')

# Use the GPU (with float16) when available, otherwise fall back to CPU with float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"Using device: {device}")


def extract_audio_from_video(video_path, output_audio_path="temp_extracted_audio.wav"):
    """
    Extracts the audio track from a video file using ffmpeg-python.
    """
    print(f"\n[STEP 1/9] Extracting audio from video: {video_path}")

    try:
        (
            ffmpeg
            .input(video_path)
            .output(
                output_audio_path,
                vn=None,             # drop the video stream
                acodec='pcm_s16le',  # 16-bit PCM so soundfile/librosa can read the WAV directly
                ar='44100',
                ac=2,
                f='wav'
            )
            .run(overwrite_output=True, quiet=True)
        )
        print(f"✅ Audio extracted successfully to: {output_audio_path}")
        return output_audio_path
    except ffmpeg.Error as e:
        print(f"Error: Failed to extract audio from video. stderr: {e.stderr.decode('utf8')}")
        return None
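
# For reference, the ffmpeg call above is roughly equivalent to the command line
#   ffmpeg -y -i <video> -vn -acodec pcm_s16le -ar 44100 -ac 2 -f wav temp_extracted_audio.wav
# Example usage (hypothetical path, commented out so importing this module has no side effects):
# extracted = extract_audio_from_video("sample_input.mp4")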


def transcribe_audio(audio_path):
    """
    Transcribes an audio file with Whisper and returns the transcript together with
    a rough speaking-pace label.
    """

    # Upper WPM bound for each pace category (checked in order).
    THRESHOLDS = {
        "very_slow": 80,
        "slow": 110,
        "normal": 150,
        "fast": 200,
        "very_fast": float("inf")
    }

    def get_audio_duration(path: str) -> float:
        """Return duration of audio file in seconds."""
        with sf.SoundFile(path) as f:
            return len(f) / f.samplerate

    def compute_wpm(transcript: str, duration_s: float) -> float:
        """Compute words per minute."""
        if not transcript or duration_s == 0:
            return 0.0
        words = transcript.strip().split()
        return len(words) / (duration_s / 60.0)

    def categorize_wpm(wpm: float) -> str:
        """Map a WPM value to one of the pace categories."""
        for label, threshold in THRESHOLDS.items():
            if wpm < threshold:
                return label
        return "unknown"
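
    # Worked example: a 45-second clip containing 100 words gives
    # 100 / (45 / 60) ≈ 133 WPM, which is below the 150 cutoff for "normal",
    # so categorize_wpm returns "normal".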

    # The ASR pipeline is rebuilt on every call; acceptable for a low-traffic demo.
    transcriber = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-large-v3-turbo",
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": "english"},
    )

    if audio_path is None:
        return "No audio file provided. Please upload or record an audio file.", "unknown"

    print(f"Transcribing audio file: {audio_path}")

    result = transcriber(audio_path)
    transcription = result["text"]
    print(f"✅ Transcription successful: {transcription}")

    duration_s = get_audio_duration(audio_path)
    wpm = compute_wpm(transcription, duration_s)
    pace = categorize_wpm(wpm)
    print(f"✅ Pace detected: {pace.upper()} ({wpm:.1f} WPM)")

    return transcription, pace
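
# Example (hypothetical path, commented out): transcribe_audio("speech.wav") would return a
# tuple such as ("Hello there ...", "normal"): the transcript plus the detected pace label.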


def lang_select(target_lang):
    """Map a display language name to the locale code expected by MyMemory."""
    LANGUAGE_NAME_TO_CODE = {
        "Bengali": "bn-IN", "English": "en-IN", "Gujarati": "gu-IN",
        "Hindi": "hi-IN", "Kannada": "kn-IN", "Malayalam": "ml-IN",
        "Marathi": "mr-IN", "Odia": "or-IN", "Punjabi": "pa-IN",
        "Tamil": "ta-IN", "Telugu": "te-IN"
    }
    return LANGUAGE_NAME_TO_CODE[target_lang]


def translate_local(text_to_translate, target_lang='Tamil', device=None):
    """
    Translates text from English to a target language, handling texts longer
    than 500 characters by splitting them into sentence-based chunks.
    """
    # Spell out digits so the TTS stage does not have to read raw numbers.
    text_to_translate = re.sub(r'\d+', lambda match: num2words(int(match.group(0))), text_to_translate)
    target_lang = lang_select(target_lang.capitalize())

    sentences = nltk.sent_tokenize(text_to_translate)

    # Greedily pack whole sentences into chunks of fewer than 500 characters.
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 < 500:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
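
    # The 500-character cap above reflects MyMemory's free-tier request limit
    # (roughly 500 characters per query); treat the exact figure as an assumption.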

    translator = MyMemoryTranslator(source='en-GB', target=target_lang)
    translated_chunks = []
    for chunk in chunks:
        try:
            translated_chunks.append(translator.translate(chunk))
        except Exception as e:
            print(f"Could not translate chunk: {chunk}\nError: {e}")
            translated_chunks.append("")

    translated_text = " ".join(translated_chunks)

    print(f"✅ Translated text to {target_lang} successfully")

    return translated_text


def synthesize_speech(synth_text, target_lang, pace="normal", output_path="temp_audio_synthesized.wav", device="cpu"):
    """
    Synthesizes speech for the given text with IndicF5, conditioned on a pace-matched
    reference prompt.
    """
    # NOTE: the reference prompts are currently taken from the Tamil male speaker set,
    # regardless of target_lang.
    ref_audio_path = 'reference/TAMIL/MALE_' + pace.upper() + '.wav'
    ref_text_path = 'reference/TAMIL/MALE_' + pace.upper() + '.txt'

    with open(ref_text_path, encoding='utf-8') as f:
        ref_text = f.read()

    print("> Loading IndicF5 TTS model (ai4bharat/IndicF5)...")
    indicf5_repo_id = "ai4bharat/IndicF5"
    token = os.environ.get("HF_TOKEN")
    tts_model = AutoModel.from_pretrained(indicf5_repo_id, trust_remote_code=True, token=token).to(device)

    audio = tts_model(synth_text, ref_audio_path=ref_audio_path, ref_text=ref_text)

    # Normalise int16 output to float32 in [-1, 1] before writing.
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0

    sf.write(output_path, np.array(audio, dtype=np.float32), samplerate=24000)
    print("✅ Speech synthesis complete.")
    print(f"> Final audio saved to: {output_path}")

    return output_path
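
# IndicF5 is a prompt-based TTS model: it conditions on a reference clip and its transcript,
# so the pace-matched reference selected above also steers the speaking rate of the output.
# Example usage (hypothetical text, commented out to avoid downloading the model on import):
# synthesize_speech("<translated Tamil text>", "Tamil", pace="normal")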


def match_audio_duration(original_path, translated_path, output_path="temp_audio_synced.wav"):
    """
    Matches the synthesized audio's duration to the original audio's duration.
    """
    print("\n[STEP 7/9] Syncing audio durations")

    original_audio, original_sr = librosa.load(original_path, sr=None)
    original_duration = librosa.get_duration(y=original_audio, sr=original_sr)
    print(f"Original audio duration: {original_duration:.2f} seconds")

    translated_audio, translated_sr = librosa.load(translated_path, sr=None)
    translated_duration = librosa.get_duration(y=translated_audio, sr=translated_sr)
    print(f"Translated audio duration: {translated_duration:.2f} seconds")

    # rate > 1 speeds the translated audio up (shortens it); rate < 1 slows it down.
    rate = translated_duration / original_duration
    print(f"Stretch rate: {rate:.4f}")

    adjusted_audio = rb.time_stretch(translated_audio, translated_sr, rate=rate)

    sf.write(output_path, adjusted_audio, translated_sr)
    print(f"✅ Duration-adjusted audio saved as: {output_path}")
    return output_path
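
# Worked example: a 12 s translated clip against a 10 s original gives rate = 1.2, and
# time-stretching by 1.2 compresses the translated audio to roughly 10 s.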


def clone_voice(translated_audio_path, original_audio_path, output_path="temp_audio_cloned.wav", device="cpu"):
    """
    Transfers the original speaker's voice characteristics onto the translated audio
    using OpenVoice's tone-colour conversion (tune_one).
    """
    print("Cloning Voice")

    tune_one(input_file=translated_audio_path, ref_file=original_audio_path, output_file=output_path, device=device)
    print(f"✅ Voice-cloned audio saved to {output_path}")
    return output_path
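
# As used here, tune_one is treated purely as a file-to-file operation: the function above
# relies only on the converted audio that tune_one writes to output_file and returns that path.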


def merge_audio_video(video_path, audio_path, output_path="temp_merged.mp4"):
    """
    Merges an audio file with a video file into a single output video.
    """
    print("\n[STEP] Merging audio and video...")
    video_input = ffmpeg.input(video_path)
    audio_input = ffmpeg.input(audio_path)
    (
        # Copy the video stream as-is, re-encode the audio to AAC, and stop at the shorter stream.
        ffmpeg
        .output(video_input.video, audio_input.audio, output_path, vcodec='copy', acodec='aac', shortest=None)
        .run(overwrite_output=True, quiet=True)
    )
    print(f"✅ Merged video saved to {output_path}")
    return output_path
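
# Roughly equivalent to the command line
#   ffmpeg -y -i <video> -i <audio> -map 0:v -map 1:a -c:v copy -c:a aac -shortest temp_merged.mp4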


def main_run(video_path, target_lang, user_transcript=None, user_translation=None):
    original_audio_file = extract_audio_from_video(video_path)
    if user_transcript:
        # Still transcribe to detect the speaking pace, then use the provided transcript text.
        original_text, pace = transcribe_audio(original_audio_file)
        original_text = user_transcript
        print(f"Using provided transcript: {original_text}")
    else:
        original_text, pace = transcribe_audio(original_audio_file)
    if user_translation:
        translated_text = user_translation
        print(f"Using provided translation: {translated_text}")
    else:
        translated_text = translate_local(original_text, target_lang)
        print(f"Translated Text: {translated_text}")
    translated_audio = synthesize_speech(translated_text, target_lang, pace)
    synced_translated_audio = match_audio_duration(original_audio_file, translated_audio)
    cloned_synced_translated_audio = clone_voice(synced_translated_audio, original_audio_file)
    final_video_nobgm = merge_audio_video(video_path, cloned_synced_translated_audio)
    print("✅ Pipeline finished")
    return final_video_nobgm
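
# End-to-end example (hypothetical path, commented out so importing this module does not
# trigger the full pipeline):
# main_run("sample_input.mp4", "Tamil")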


def audio_pipeline_run(audio_path, target_lang, user_transcript=None, user_translation=None):
    if user_transcript:
        # Still transcribe to detect the speaking pace, then use the provided transcript text.
        original_text, pace = transcribe_audio(audio_path)
        original_text = user_transcript
        print(f"Using provided transcript: {original_text}")
    else:
        original_text, pace = transcribe_audio(audio_path)
    if user_translation:
        translated_text = user_translation
        print(f"Using provided translation: {translated_text}")
    else:
        translated_text = translate_local(original_text, target_lang)
        print(f"Translated Text: {translated_text}")
    translated_audio = synthesize_speech(translated_text, target_lang, pace)
    synced_translated_audio = match_audio_duration(audio_path, translated_audio)
    cloned_synced_translated_audio = clone_voice(synced_translated_audio, audio_path)
    print("✅ Pipeline finished")
    return cloned_synced_translated_audio
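
# audio_pipeline_run mirrors main_run but skips the audio-extraction and video-muxing steps,
# so it can be applied directly to an uploaded audio file.
# Example (hypothetical path, commented out): audio_pipeline_run("speech.wav", "Hindi")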


title = "Custom Whisper Transcription App"
description = """
This is a custom Gradio app that uses the <b>openai/whisper-large-v3-turbo</b> model
from the Hugging Face Hub for transcription. Upload an audio file or record
directly from your microphone to get the transcript.
"""
article = "<p style='text-align: center'><a href='https://huggingface.co/openai/whisper-large-v3-turbo' target='_blank'>Model Card</a></p>"


with gr.Blocks(title="Audio/Video Translation Toolkit") as app_interface:
    gr.Markdown("# 🌐 Audio/Video Translation Toolkit")
    gr.Markdown("This may take a while to generate, as it runs on the free tier.")
    gr.Markdown("Please provide only English audio/video under 30 seconds.")
    gr.Markdown("Processing 10 seconds of audio/video takes roughly 5-10 minutes.")
    with gr.Tabs():
        with gr.Tab("🎬 Translate Video"):
            with gr.Column():
                with gr.Row():
                    video_in = gr.Video(label="Input Video", height=500)
                    video_out = gr.Video(label="Output Video", interactive=False, height=500)
                with gr.Row():
                    lang_radio_vid = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    option_select_vid = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input")
                user_transcript_vid = gr.Textbox(label="Your English Transcript", lines=3, visible=False)
                user_translation_vid = gr.Textbox(label="Your Translated Text", lines=3, visible=False)
                submit_btn_vid = gr.Button("Translate Video", variant="primary")

            option_select_vid.change(
                fn=lambda choice: (
                    gr.update(visible=(choice == "Use my Transcript")),
                    gr.update(visible=(choice == "Use my Translation")),
                ),
                inputs=option_select_vid,
                outputs=[user_transcript_vid, user_translation_vid],
            )

            submit_btn_vid.click(
                fn=main_run,
                inputs=[video_in, lang_radio_vid, user_transcript_vid, user_translation_vid],
                outputs=[video_out],
            )
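
        # Both translation tabs use the same pattern: option_select_*.change() returns one
        # gr.update(visible=...) per optional textbox, so only the field matching the selected
        # mode ("Use my Transcript" / "Use my Translation") is shown before submission.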

        with gr.Tab("🎵 Translate Audio"):
            with gr.Column():
                with gr.Row():
                    audio_in = gr.Audio(type="filepath", label="Input Audio")
                    audio_out = gr.Audio(label="Output Audio", interactive=False)
                with gr.Row():
                    lang_radio_aud = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    option_select_aud = gr.Radio(choices=["Use my Transcript", "Use my Translation"], label="Optional Input")
                user_transcript_aud = gr.Textbox(label="Your English Transcript", lines=3, visible=False)
                user_translation_aud = gr.Textbox(label="Your Translated Text", lines=3, visible=False)
                submit_btn_aud = gr.Button("Translate Audio", variant="primary")

            option_select_aud.change(
                fn=lambda choice: (
                    gr.update(visible=(choice == "Use my Transcript")),
                    gr.update(visible=(choice == "Use my Translation")),
                ),
                inputs=option_select_aud,
                outputs=[user_transcript_aud, user_translation_aud],
            )
            submit_btn_aud.click(
                fn=audio_pipeline_run,
                inputs=[audio_in, lang_radio_aud, user_transcript_aud, user_translation_aud],
                outputs=[audio_out],
            )

        with gr.Tab("✂️ Extract Audio"):
            with gr.Row():
                video_in_ext = gr.Video(label="Input Video", height=500)
                audio_out_ext = gr.Audio(label="Extracted Audio")
            btn_ext = gr.Button("Extract", variant="secondary")
            btn_ext.click(fn=extract_audio_from_video, inputs=video_in_ext, outputs=audio_out_ext)

        with gr.Tab("✍️ Transcribe"):
            with gr.Row():
                audio_in_trans = gr.Audio(type="filepath", label="Input Audio")
                with gr.Column():
                    text_out_trans = gr.Textbox(label="Transcription")
                    text_out_pace = gr.Textbox(label="Detected Pace")
            btn_trans = gr.Button("Transcribe", variant="secondary")
            btn_trans.click(fn=transcribe_audio, inputs=audio_in_trans, outputs=[text_out_trans, text_out_pace])

        with gr.Tab("🌐 Translate Text"):
            with gr.Row():
                with gr.Column():
                    text_in_tran = gr.Textbox(label="Text to Translate", lines=5)
                    lang_radio_tran = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    btn_tran = gr.Button("Translate", variant="secondary")
                text_out_tran = gr.Textbox(label="Translated Text", lines=5, interactive=False)
            btn_tran.click(fn=translate_local, inputs=[text_in_tran, lang_radio_tran], outputs=text_out_tran)

        with gr.Tab("🔊 Synthesize Speech"):
            with gr.Column():
                with gr.Row():
                    text_in_synth = gr.Textbox(label="Text to Synthesize", lines=5)
                    audio_out_synth = gr.Audio(label="Synthesized Speech")
                with gr.Row():
                    lang_radio_synth = gr.Radio(choices=["Tamil", "Telugu", "Hindi"], label="Target Language", value="Tamil")
                    # Gender is collected but not yet wired into synthesize_speech, which
                    # currently uses the male Tamil reference prompts.
                    gender_radio_synth = gr.Radio(choices=["Male", "Female"], label="Speaker Gender", value="Male")
                    pace_radio_synth = gr.Radio(choices=["Very_Slow", "Slow", "Normal", "Fast", "Very_Fast"], label="Pace", value="Normal")
                btn_synth = gr.Button("Synthesize", variant="secondary")
            btn_synth.click(fn=synthesize_speech, inputs=[text_in_synth, lang_radio_synth, pace_radio_synth], outputs=audio_out_synth)

        with gr.Tab("⏱️ Sync Duration"):
            with gr.Row():
                audio_in_sync1 = gr.Audio(type="filepath", label="Original Audio (for duration reference)")
                audio_in_sync2 = gr.Audio(type="filepath", label="Translated Audio (to be resized)")
                audio_out_sync = gr.Audio(label="Duration-Synced Audio")
            btn_sync = gr.Button("Sync Duration", variant="secondary")
            btn_sync.click(fn=match_audio_duration, inputs=[audio_in_sync1, audio_in_sync2], outputs=audio_out_sync)

        with gr.Tab("🧬 Clone Voice"):
            with gr.Row():
                audio_in_clone1 = gr.Audio(type="filepath", label="Target Audio (e.g., Synthesized Speech)")
                audio_in_clone2 = gr.Audio(type="filepath", label="Reference Audio (Original Speaker's Voice)")
                audio_out_clone = gr.Audio(label="Cloned Voice Audio")
            btn_clone = gr.Button("Clone Voice", variant="secondary")
            btn_clone.click(fn=clone_voice, inputs=[audio_in_clone1, audio_in_clone2], outputs=audio_out_clone)

        with gr.Tab("🎞️ Replace Audio"):
            with gr.Row():
                video_in_rep = gr.Video(label="Input Video", height=500)
                audio_in_rep = gr.Audio(type="filepath", label="New Audio")
                video_out_rep = gr.Video(label="Video with Replaced Audio", height=500)
            btn_rep = gr.Button("Replace Audio", variant="secondary")
            btn_rep.click(fn=merge_audio_video, inputs=[video_in_rep, audio_in_rep], outputs=video_out_rep)


if __name__ == "__main__":
    app_interface.launch()