import gradio as gr
import uuid
import os
from typing import Optional
import tempfile
from pydub import AudioSegment
import re
import subprocess
import numpy as np
import soundfile as sf
import sounddevice as sd
import time
import sox
from io import BytesIO
import asyncio
import aiohttp
from moviepy.editor import VideoFileClip
import threading
import socketio
import base64
ASR_API = "http://astarwiz.com:9998/asr"
TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'

# bSegByPunct = True
bSegByPunct = False

LANGUAGE_MAP = {
    "en": "English",
    "ma": "Malay",
    "ta": "Tamil",
    "zh": "Chinese"
}

DEVELOPER_PASSWORD = os.getenv("DEV_PWD")
RAPID_API_KEY = os.getenv("RAPID_API_KEY")

AVAILABLE_SPEAKERS = {
    "en": ["MS"],
    "ma": ["msFemale"],
    "ta": ["ta_female1"],
    "zh": ["childChinese2"]
}

audio_update_event = asyncio.Event()
acc_cosy_audio = None

# CosyVoice TTS related
# TTS_SOCKET_SERVER = "http://localhost:9244"
TTS_SOCKET_SERVER = "http://astarwiz.com:9244"
sio = socketio.AsyncClient()
def on_connect():
    print('Connected to server')

def on_disconnect():
    print('Disconnected from server')

async def on_audio_chunk(data):
    global translation_update, audio_update, acc_cosy_audio
    translated_seg_txt = data['trans_text']
    with translation_lock:
        translation_update["content"] = translation_update["content"] + " " + translated_seg_txt
        translation_update["new"] = True
    audio_base64 = data['audio']
    audio_bytes = base64.b64decode(audio_base64)
    audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
    if acc_cosy_audio is None:
        acc_cosy_audio = audio_np
    else:
        acc_cosy_audio = np.concatenate((acc_cosy_audio, audio_np))
    with audio_lock:
        audio_update["content"] = (22050, audio_np)
        audio_update["new"] = True
    # audio_float = audio_np.astype(np.float32) / 32767.0
    # audio_queue.append(audio_float)
    # accumulated_audio.extend(audio_float)

async def on_tts_complete():
    await sio.disconnect()
    print("Disconnected from server after TTS completion")
    audio_update_event.set()
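
# The handlers above are never attached to the client, so the socket events
# would be silently dropped. A minimal registration sketch follows; the event
# names 'audio_chunk' and 'tts_complete' are assumptions about the TTS socket
# server's protocol and must match whatever it actually emits.
sio.on('connect', on_connect)
sio.on('disconnect', on_disconnect)
sio.on('audio_chunk', on_audio_chunk)
sio.on('tts_complete', on_tts_complete)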
# Global variables for storing update information
transcription_update = {"content": "", "new": False}
translation_update = {"content": "", "new": False}
audio_update = {"content": None, "new": False}
is_playing = False  # module-level init; referenced via `global is_playing` below

# Locks for thread-safe operations
transcription_lock = threading.Lock()
translation_lock = threading.Lock()
audio_lock = threading.Lock()
def replace_audio_in_video(video_path, audio_path, output_path):
    # Copy the video stream untouched and take the audio from the second
    # input, trimming to the shorter of the two streams.
    command = [
        'ffmpeg',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',   # no re-encode of the video stream
        '-map', '0:v:0',  # video from input 0
        '-map', '1:a:0',  # audio from input 1
        '-shortest',
        output_path
    ]
    subprocess.run(command, check=True)
    return output_path
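
# Usage sketch (hypothetical paths):
#   replace_audio_in_video("talk.mp4", "dub.wav", "talk_dubbed.mp4")
# Requires the ffmpeg binary on PATH; subprocess.run(check=True) raises
# CalledProcessError on a non-zero exit, which the caller below handles.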
async def replace_audio_and_generate_video(temp_video_path, gradio_audio):
    print("gradio_audio:", gradio_audio)
    if not temp_video_path or gradio_audio is None:
        return "Both video and audio are required to replace audio.", None
    if not os.path.exists(temp_video_path):
        return "Video file not found.", None
    # Unpack the Gradio audio output
    sample_rate, audio_data = gradio_audio
    # Ensure audio_data is a numpy array
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)
    # Create a temporary WAV file for the original audio
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
        original_audio_path = temp_audio_file.name
        sf.write(original_audio_path, audio_data, sample_rate)
    # Get video duration
    video_clip = VideoFileClip(temp_video_path)
    video_duration = video_clip.duration
    video_clip.close()
    # Get audio duration
    audio_duration = len(audio_data) / sample_rate
    # Calculate tempo factor
    tempo_factor = audio_duration / video_duration
    # Create a temporary WAV file for the tempo-adjusted audio
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_audio_file:
        adjusted_audio_path = temp_audio_file.name
    # Adjust audio tempo
    tfm = sox.Transformer()
    tfm.tempo(tempo_factor, 's')
    tfm.build(original_audio_path, adjusted_audio_path)
    # Generate output video path
    output_video_path = os.path.join(tempfile.gettempdir(), f"output_{uuid.uuid4()}.mp4")
    try:
        replace_audio_in_video(temp_video_path, adjusted_audio_path, output_video_path)
        return "Audio replaced successfully.", output_video_path
    except subprocess.CalledProcessError as e:
        return f"Error replacing audio: {str(e)}", None
    finally:
        os.unlink(original_audio_path)   # Clean up the original audio file
        os.unlink(adjusted_audio_path)   # Clean up the adjusted audio file
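
# Why tempo_factor = audio_duration / video_duration: sox's tempo effect speeds
# playback up by the given factor without changing pitch, so the output length
# is input_length / factor. For a 60 s video and a 75 s dub,
# factor = 75 / 60 = 1.25 and the adjusted dub lands at 75 / 1.25 = 60 s.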
async def fetch_youtube_id(youtube_url: str) -> str:
    if 'v=' in youtube_url:
        return youtube_url.split("v=")[1].split("&")[0]
    elif 'youtu.be/' in youtube_url:
        # strip any trailing query string, e.g. "?si=..."
        return youtube_url.split("youtu.be/")[1].split("?")[0]
    elif 'shorts' in youtube_url:
        return youtube_url.split("/")[-1].split("?")[0]
    else:
        raise Exception("Unsupported URL format")
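
# Accepted URL shapes (video IDs are illustrative):
#   https://www.youtube.com/watch?v=abc123&t=10 -> "abc123"
#   https://youtu.be/abc123?si=xyz              -> "abc123"
#   https://www.youtube.com/shorts/abc123       -> "abc123"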
async def download_youtube_audio(youtube_url: str, output_dir: Optional[str] = None) -> Optional[tuple[str, str]]:
    video_id = await fetch_youtube_id(youtube_url)
    if not video_id:
        return None
    if output_dir is None:
        output_dir = tempfile.gettempdir()
    output_filename = os.path.join(output_dir, f"{video_id}.mp3")
    temp_filename = os.path.join(output_dir, f"{video_id}.mp4")
    if os.path.exists(output_filename) and os.path.exists(temp_filename):
        return (output_filename, temp_filename)
    url = "https://youtube86.p.rapidapi.com/api/youtube/links"
    headers = {
        'Content-Type': 'application/json',
        'x-rapidapi-host': 'youtube86.p.rapidapi.com',
        'x-rapidapi-key': RAPID_API_KEY
    }
    data = {
        "url": youtube_url
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=data) as response:
            if response.status == 200:
                result = await response.json()
                for link in result[0]['urls']:  # renamed from `url` to avoid shadowing the request URL
                    if link.get('isBundle'):
                        audio_url = link['url']
                        extension = link['extension']
                        print("audio_url:", audio_url)
                        async with session.get(audio_url) as audio_response:
                            print("audio_response:", audio_response)
                            if audio_response.status == 200:
                                content = await audio_response.read()
                                temp_filename = os.path.join(output_dir, f"{video_id}.{extension}")
                                with open(temp_filename, 'wb') as audio_file:
                                    audio_file.write(content)
                                # Downsample to 16 kHz mp3 for the ASR service
                                audio = AudioSegment.from_file(temp_filename, format=extension)
                                audio = audio.set_frame_rate(16000)
                                audio.export(output_filename, format="mp3", parameters=["-ar", "16000"])
                                return (output_filename, temp_filename)
            else:
                print("Error:", response.status, await response.text())
                return None
    return None  # no bundled audio link found
punctuation_marks = r'([.!?！？。])'

def split_text_with_punctuation(text):
    # Split the text using the punctuation marks, keeping the punctuation marks
    split_text = re.split(punctuation_marks, text)
    # Combine each punctuation mark with the preceding segment
    combined_segments = []
    # Loop through the split text in steps of 2
    for i in range(0, len(split_text) - 1, 2):
        combined_segments.append(split_text[i] + split_text[i + 1])
    # Handle any remaining text that doesn't have a punctuation following it
    if len(split_text) % 2 != 0 and split_text[-1]:
        combined_segments.append(split_text[-1])
    # Split any segment that exceeds 50 words
    final_segments = []
    for segment in combined_segments:
        words = segment.split()  # Split each segment into words
        if len(words) > 50:
            # Split the segment into chunks of no more than 50 words
            for j in range(0, len(words), 50):
                final_segments.append(' '.join(words[j:j+50]))
        else:
            final_segments.append(segment)
    return [segment for segment in final_segments if segment]  # Filter out empty strings
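
# Example: "Hello there. How are you? 你好。" splits into
# ["Hello there.", " How are you?", " 你好。"], sentence-sized chunks that the
# translation loop below can feed one at a time to the TTS service.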
def extract_segments(text):
    pattern = r'\[(\d+\.\d+)s\s*->\s*(\d+\.\d+)s\]\s*(.*?)(?=\[\d+\.\d+s|\Z)'
    matches = re.findall(pattern, text, re.DOTALL)
    if not matches:
        return []
    segments = []
    for start, end, content in matches:
        segments.append({
            'start': float(start),
            'end': float(end),
            'text': content.strip()
        })
    return segments
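
# Parses the timestamped transcript format returned by the ASR service when
# with_timestamp is true, e.g.
#   "[0.00s -> 2.50s] hello world [2.50s -> 4.00s] second line"
# -> [{'start': 0.0, 'end': 2.5, 'text': 'hello world'},
#     {'start': 2.5, 'end': 4.0, 'text': 'second line'}]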
def adjust_tempo_pysox_array(gradio_audio, duration):
    # Unpack the Gradio audio output
    sample_rate, audio_data = gradio_audio
    # Ensure audio_data is a numpy array
    if not isinstance(audio_data, np.ndarray):
        audio_data = np.array(audio_data)
    # Calculate the current duration of the audio in seconds
    current_duration = len(audio_data) / sample_rate
    # Calculate the necessary tempo factor to match the desired duration
    tempo_factor = current_duration / duration
    # Create a pysox Transformer
    tfm = sox.Transformer()
    tfm.tempo(tempo_factor)
    # Use pysox to transform the audio directly in memory
    adjusted_audio = tfm.build_array(input_array=audio_data, sample_rate_in=sample_rate)
    # Trim or pad the audio to exactly match the desired duration
    target_length = int(sample_rate * duration)
    if len(adjusted_audio) > target_length:
        adjusted_audio = adjusted_audio[:target_length]  # Trim if too long
    else:
        # Pad with zeros if too short
        adjusted_audio = np.pad(adjusted_audio, (0, target_length - len(adjusted_audio)), mode='constant')
    # Return the processed audio in the Gradio format (sample_rate, adjusted_audio)
    return sample_rate, adjusted_audio
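
# The tempo change only lands approximately on `duration` (sox processes in
# windows), hence the trim/pad step to get a sample-exact length. This helper
# is currently only referenced from a commented-out line in the TTS loop below.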
async def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
    print(input_text)
    one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
    vllm_api = 'http://astarwiz.com:2333/' + "v1/completions"
    data = {
        "prompt": one_vllm_input,
        'model': "./Edu-4B-NewTok-V2-20240904/",
        'min_tokens': min_new_tokens,
        'max_tokens': max_new_tokens,
        'temperature': 0.1,
        'top_p': 0.75,
        'repetition_penalty': 1.1,
        "stop_token_ids": [151645, ],
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(vllm_api, headers={"Content-Type": "application/json"}, json=data) as response:
            if response.status == 200:
                result = await response.json()
                if "choices" in result:
                    return result["choices"][0]['text'].strip()
    return "The system encountered an error during vLLM generation. Please try again."
async def upload_file(file_path, upload_url):
    print(f"1. Client sends request: {time.time()}")
    async with aiohttp.ClientSession() as session:
        with open(file_path, 'rb') as f:
            form_data = aiohttp.FormData()
            form_data.add_field('file', f, filename=os.path.basename(file_path))
            async with session.post(upload_url, data=form_data) as response:
                print(f"5. Client receives headers: {time.time()}")
                print(f"Status: {response.status}")
                result = await response.json()
                print(f"7. Client fully received and parsed response: {time.time()}")
                if response.status == 200:
                    return result
                else:
                    return {"file_id": ""}  # was the set literal {"file_id", ""}; callers expect a dict
async def transcribe_and_speak(audio, source_lang, target_lang, youtube_url=None, target_speaker=None, progress_tracker=None):
    global transcription_update, translation_update, audio_update, acc_cosy_audio, audio_update_event
    transcription_update = {"content": "", "new": True}
    translation_update = {"content": "", "new": True}
    audio_update = {"content": None, "new": True}
    acc_cosy_audio = None
    video_path = None
    audio_update_event.clear()
    # progress = gr.Progress()
    # progress(0.1, "started:")
    if youtube_url:
        audio = await download_youtube_audio(youtube_url)
        if audio is None:
            # accumulated_audio does not exist yet on these early exits,
            # so return None for the audio buffer instead
            return "Failed to download YouTube audio.", None, None, video_path, None
        audio, video_path = audio
    if not audio:
        return "Please provide an audio input or a valid YouTube URL.", None, None, video_path, None
    # ASR
    # progress(0.2, "ASR started:")
    file_id = str(uuid.uuid4())
    data = aiohttp.FormData()
    data.add_field('file', open(audio, 'rb'))
    data.add_field('language', 'ms' if source_lang == 'ma' else source_lang)
    if bSegByPunct:
        data.add_field('model_name', 'whisper-large-v2-local-cs')
        data.add_field('with_timestamp', 'false')
    else:
        data.add_field('model_name', 'official-v3')
        data.add_field('with_timestamp', 'true')
    async with aiohttp.ClientSession() as session:
        async with session.post(ASR_API, data=data) as asr_response:
            if asr_response.status == 200:
                result = await asr_response.json()
                transcription = result['text']
                with transcription_lock:
                    transcription_update["content"] = transcription
                    transcription_update["new"] = True
            else:
                return "ASR failed", None, None, video_path, None
    # progress(0.4, "ASR done:")
    # Use CosyVoice when the target language is English or Chinese
    if target_lang == 'en' or target_lang == 'zh':
        try:
            # assign before the connected check so upload_url below always resolves
            server_url = TTS_SOCKET_SERVER
            if not sio.connected:
                await sio.connect(server_url)
                print(f"Connected to {server_url}")
            # Handle the audio file
            file_id = ""
            if audio and os.path.exists(audio):
                upload_url = f"{server_url}/upload"  # Adjust this URL as needed
                print("before call upload_file:", upload_url)
                upload_result = await upload_file(audio, upload_url)
                # print(type(upload_result))
                print("upload_result:", upload_result)
                file_id = upload_result['file_id']
            # Use default voice
            tts_request = {
                'text': transcription,
                'overwrite_prompt': False,
                'promptText': "",
                'promptAudio': file_id,
                'sourceLang': source_lang,
                'targetLang': target_lang
            }
            await sio.emit('tts_request', tts_request)
            # Wait until all CosyVoice TTS is done
            await audio_update_event.wait()
            print('cosy tts complete,', audio_update)
            return transcription, translation_update["content"], audio_update["content"], video_path, (22050, acc_cosy_audio)
        except Exception as e:
            print(f"Failed to process request: {str(e)}")
            print("Falling back to the VITS path")
    # VITS path: translate segment by segment, then synthesize each segment
    if bSegByPunct:
        split_result = split_text_with_punctuation(transcription)
    else:
        split_result = extract_segments(transcription)
    translate_segments = []
    accumulated_audio = None
    sample_rate = 22050
    global is_playing
    for i, segment in enumerate(split_result):
        if bSegByPunct:
            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment}"
        else:
            translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {segment['text']}"
        translated_seg_txt = await inference_via_llm_api(translation_prompt)
        translate_segments.append(translated_seg_txt)
        print(f"Translation: {translated_seg_txt}")
        with translation_lock:
            translation_update["content"] = " ".join(translate_segments)
            translation_update["new"] = True
        # Generate TTS for each translated segment
        # progress(0.4 + (0.5 * (i + 1) / len(split_result)), "translation and tts in progress:")
        tts_params = {
            'language': target_lang,
            'speed': 1.1,
            'speaker': target_speaker or AVAILABLE_SPEAKERS[target_lang][0],
            'text': translated_seg_txt
        }
        async with aiohttp.ClientSession() as session:
            async with session.get(TTS_SPEAK_SERVICE, params=tts_params) as tts_response:
                if tts_response.status == 200:
                    audio_file = await tts_response.text()
                    audio_file = audio_file.strip()
                    audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
                    async with session.get(audio_url) as response:
                        content = await response.read()
                        audio_chunk, sr = sf.read(BytesIO(content))
                        # print('audio_chunk:', type(audio_chunk), audio_chunk)
                        # print('audio_chunk:, src:', segment['end'] - segment['start'], ' tts:', len(audio_chunk)/sr)
                        # _, audio_chunk = adjust_tempo_pysox_array((sr, audio_chunk), segment['end'] - segment['start'])
                        if accumulated_audio is None:
                            accumulated_audio = audio_chunk
                            sample_rate = sr
                        else:
                            accumulated_audio = np.concatenate((accumulated_audio, audio_chunk))
                        with audio_lock:
                            audio_update["content"] = (sample_rate, audio_chunk)
                            audio_update["new"] = True
                else:
                    print(f"TTS failed for segment: {translated_seg_txt}")
    translated_text = " ".join(translate_segments)
    # progress(1, "all done.")
    print("Signal that playback can stop now; all TTS generated")
    is_playing = False
    if accumulated_audio is not None:
        return transcription, translated_text, audio_update["content"], video_path, (sample_rate, accumulated_audio)
    else:
        # No segment was synthesized: return None audio rather than a string,
        # which gr.Audio cannot render
        return transcription, translated_text, None, video_path, None
| """ | |
| async def run_speech_translation(audio, source_lang, target_lang, youtube_url, target_speaker): | |
| temp_video_path = None | |
| transcription, translated_text, audio_chunksr, temp_video_path = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker) | |
| return transcription, translated_text, audio_chunksr, temp_video_path | |
| """ | |
async def update_transcription():
    global transcription_update
    with transcription_lock:
        if transcription_update["new"]:
            content = transcription_update["content"]
            transcription_update["new"] = False
            return content
    return gr.update()

async def update_translation():
    global translation_update
    with translation_lock:
        if translation_update["new"]:
            content = translation_update["content"]
            translation_update["new"] = False
            return content
    return gr.update()

async def update_audio():
    global audio_update
    with audio_lock:
        if audio_update["new"]:
            content = audio_update["content"]
            audio_update["new"] = False
            return content
    return gr.update()

def disable_button():
    # Disable the button during processing
    return gr.update(interactive=False)
with gr.Blocks() as demo:
    gr.Markdown("# Speech Translation")
    gr.Markdown("Speak into the microphone, upload an audio file, or provide a YouTube URL. The app will translate and speak it back to you.")
    with gr.Row():
        user_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
        user_youtube_url = gr.Textbox(label="YouTube URL (optional)")
    with gr.Row():
        user_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
        user_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
        user_target_speaker = gr.Dropdown(choices=AVAILABLE_SPEAKERS['zh'], label="Target Speaker", value="childChinese2")
    with gr.Row():
        user_button = gr.Button("Translate and Speak", interactive=False)
    with gr.Row():
        user_transcription_output = gr.Textbox(label="Transcription")
        user_translation_output = gr.Textbox(label="Translation")
    user_audio_output = gr.Audio(label="Translated Speech", visible=False)
    user_audio_final = gr.Audio(label="Final Combined Speech")
    status_message = gr.Textbox(label="Status", interactive=False)
    user_video_output = gr.HTML(label="YouTube Video")
    replace_audio_button = gr.Button("Replace Audio", interactive=False, visible=False)
    final_video_output = gr.Video(label="Video with Replaced Audio", visible=False)
    temp_video_path = gr.State()
    translation_progress = gr.State(0.0)
    async def update_button_state(audio, youtube_url, progress):
        # Button is interactive if there's input and progress is 0 or 1 (not in progress)
        print("button state:", audio, youtube_url, bool(audio), bool(youtube_url), progress == 0 or progress == 1)
        return gr.Button(interactive=(bool(audio) or bool(youtube_url)) and (progress == 0 or progress == 1))
    user_audio_input.change(
        fn=update_button_state,
        inputs=[user_audio_input, user_youtube_url, translation_progress],
        outputs=user_button
    )
    user_youtube_url.change(
        fn=update_button_state,
        inputs=[user_audio_input, user_youtube_url, translation_progress],
        outputs=user_button
    )
    async def run_speech_translation_wrapper(audio, source_lang, target_lang, youtube_url, target_speaker, progress):
        # Note: `progress` arrives as a plain value, so these assignments are
        # local only and do not write back to the translation_progress State
        progress = 0.1
        temp_video_path = None
        transcription, translated_text, audio_chunksr, temp_video_path, accumulated_aud_buf = await transcribe_and_speak(audio, source_lang, target_lang, youtube_url, target_speaker)
        progress = 1
        return transcription, translated_text, audio_chunksr, temp_video_path, "Translation complete", accumulated_aud_buf, gr.update(interactive=True)
    user_button.click(
        fn=disable_button,
        inputs=[],
        outputs=[user_button]  # Disable the button during processing
    ).then(
        fn=run_speech_translation_wrapper,
        inputs=[user_audio_input, user_source_lang, user_target_lang, user_youtube_url, user_target_speaker, translation_progress],
        outputs=[user_transcription_output, user_translation_output, user_audio_output, temp_video_path, status_message, user_audio_final, user_button]
    )
    async def update_replace_audio_button(audio_url, video_path):
        print("update replace:", audio_url, video_path)
        return gr.Button(interactive=bool(audio_url) and bool(video_path))
    user_audio_output.change(
        fn=update_replace_audio_button,
        inputs=[user_audio_output, temp_video_path],
        outputs=[replace_audio_button]
    )
    replace_audio_button.click(
        fn=replace_audio_and_generate_video,
        inputs=[temp_video_path, user_audio_final],
        outputs=[status_message, final_video_output]
    )
    async def update_video_embed(youtube_url):
        if youtube_url:
            try:
                video_id = await fetch_youtube_id(youtube_url)
                return f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{video_id}" frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>'
            except Exception as e:
                print(f"Error embedding video: {e}")
        return ""
    user_youtube_url.change(
        fn=update_video_embed,
        inputs=[user_youtube_url],
        outputs=[user_video_output]
    )
    async def update_target_speakers(target_lang):
        return gr.Dropdown(choices=AVAILABLE_SPEAKERS[target_lang], value=AVAILABLE_SPEAKERS[target_lang][0])
    user_target_lang.change(
        fn=update_target_speakers,
        inputs=[user_target_lang],
        outputs=[user_target_speaker]
    )
    async def periodic_update():
        transcription = await update_transcription()
        translation = await update_translation()
        audio = await update_audio()
        return (
            transcription,
            translation,
            audio
        )
    demo.load(
        periodic_update,
        inputs=[],
        outputs=[
            user_transcription_output,
            user_translation_output,
            user_audio_output,
        ],
        every=0.1
    )
    # JavaScript for client-side queue and playback handling
    user_audio_output.change(
        None,  # No backend change needed, we only handle frontend actions
        inputs=user_audio_output,  # Capture audio changes from user_audio_output
        outputs=None,
        js="""
        async (audioFilePath) => {
            // Debug: Log received audio file path
            console.log("Received audio file path:", audioFilePath);
            if (!window.audioQueue) {
                window.audioQueue = [];
                window.isPlaying = false;
            }
            // Ensure the correct URL for the audio file is available
            if (audioFilePath && audioFilePath.url) {
                console.log("Processing audio file...");
                try {
                    // Fetch and decode the audio file
                    const response = await fetch(audioFilePath.url);
                    if (!response.ok) {
                        console.error("Failed to fetch audio file:", response.statusText);
                        return;
                    }
                    const audioData = await response.arrayBuffer();
                    const audioContext = new AudioContext();
                    const decodedData = await audioContext.decodeAudioData(audioData);
                    // Split the decoded audio buffer into two chunks
                    const totalDuration = decodedData.duration;
                    const midPoint = Math.floor(decodedData.length / 2); // Midpoint for splitting
                    const sampleRate = decodedData.sampleRate;
                    // Create two separate AudioBuffers for each chunk
                    const firstHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, midPoint, sampleRate);
                    const secondHalfBuffer = audioContext.createBuffer(decodedData.numberOfChannels, decodedData.length - midPoint, sampleRate);
                    // Copy data from original buffer to the two new buffers
                    for (let channel = 0; channel < decodedData.numberOfChannels; channel++) {
                        firstHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(0, midPoint), channel, 0);
                        secondHalfBuffer.copyToChannel(decodedData.getChannelData(channel).slice(midPoint), channel, 0);
                    }
                    // Add both chunks to the queue
                    window.audioQueue.push(firstHalfBuffer);
                    window.audioQueue.push(secondHalfBuffer);
                    console.log("Two audio chunks added to queue. Queue length:", window.audioQueue.length);
                    // Function to play the next audio chunk from the queue
                    const playNextChunk = async () => {
                        console.log("Attempting to play next chunk. isPlaying:", window.isPlaying);
                        if (!window.isPlaying && window.audioQueue.length > 0) {
                            console.log("Starting playback...");
                            window.isPlaying = true;
                            // Get the next audio buffer from the queue
                            const audioBuffer = window.audioQueue.shift();
                            console.log("Playing audio chunk from buffer.");
                            const source = audioContext.createBufferSource();
                            source.buffer = audioBuffer;
                            source.connect(audioContext.destination);
                            // When the audio finishes playing, play the next chunk
                            source.onended = () => {
                                console.log("Audio chunk finished playing.");
                                window.isPlaying = false;
                                playNextChunk(); // Play the next audio chunk in the queue
                            };
                            source.start(0); // Start playing the current chunk
                            console.log("Audio chunk started.");
                        } else {
                            console.log("Already playing or queue is empty.");
                        }
                    };
                    // Start playing the next chunk if not already playing
                    playNextChunk();
                } catch (error) {
                    console.error("Error during audio playback:", error);
                    window.isPlaying = false;
                }
            } else {
                console.log("No valid audio file path received.");
            }
        }
        """
    )
demo.queue()
# demo.launch() is synchronous, so wrapping it in asyncio.run() would raise
# "a coroutine was expected"; call it directly.
demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))