""" Qwen 3.5 Omni Engine — End-to-end speech-to-speech translation. Takes English audio in, returns translated audio + transcript out. No separate ASR/MT/TTS needed — Qwen handles everything in one call. """ import os import base64 import struct import subprocess import tempfile import time import shutil import logging logger = logging.getLogger(__name__) QWEN_MODEL = "qwen3.5-omni-plus" QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" def _get_client(): """Create OpenAI-compatible client for Qwen Dashscope API.""" from openai import OpenAI api_key = os.environ.get("DASHSCOPE_API_KEY", "") if not api_key: raise RuntimeError( "DASHSCOPE_API_KEY not set. Add it as a Space secret." ) return OpenAI(api_key=api_key, base_url=QWEN_BASE_URL) def _wav_to_base64(wav_path): """Read WAV file and return base64 string.""" with open(wav_path, "rb") as f: return base64.b64encode(f.read()).decode("utf-8") def _base64_to_wav(b64_data, output_path): """Convert raw PCM base64 audio to WAV file (24kHz, mono, 16-bit).""" audio_bytes = base64.b64decode(b64_data) sample_rate = 24000 num_channels = 1 bits_per_sample = 16 byte_rate = sample_rate * num_channels * bits_per_sample // 8 block_align = num_channels * bits_per_sample // 8 data_size = len(audio_bytes) with open(output_path, "wb") as f: f.write(b"RIFF") f.write(struct.pack(" 3600: return None, "Video longer than 1 hour — please use a shorter clip." # Split into chunks if progress_fn: progress_fn(0.1, desc="Extracting audio chunks...") num_chunks = max(1, int(total_duration // chunk_seconds) + (1 if total_duration % chunk_seconds > 0 else 0)) log.append(f"**Chunks:** {num_chunks} ({chunk_seconds}s each)") input_chunks = [] for i in range(num_chunks): start = i * chunk_seconds duration = min(chunk_seconds, total_duration - start) chunk_path = os.path.join(tmp_dir, f"chunk_{i:03d}.wav") _extract_audio_chunk(video_path, chunk_path, start, duration) input_chunks.append(chunk_path) # Translate each chunk output_chunks = [] all_transcripts = [] for i, chunk_path in enumerate(input_chunks): if progress_fn: frac = 0.15 + 0.7 * (i / num_chunks) progress_fn(frac, desc=f"Translating chunk {i+1}/{num_chunks}...") result_path, transcript = translate_chunk_qwen( chunk_path, voice, language_name, i ) if transcript: all_transcripts.append(f"**[{i+1}]** {transcript}") if result_path: output_chunks.append(result_path) else: # Silence fallback duration = _get_duration(chunk_path) silence_path = os.path.join(tmp_dir, f"silence_{i:03d}.wav") subprocess.run( ["ffmpeg", "-y", "-f", "lavfi", "-i", "anullsrc=r=24000:cl=mono", "-t", str(duration), "-acodec", "pcm_s16le", silence_path], capture_output=True, check=True, ) output_chunks.append(silence_path) # Concatenate if progress_fn: progress_fn(0.88, desc="Assembling audio...") full_audio = os.path.join(tmp_dir, "full_dubbed.wav") _concatenate_wavs(output_chunks, full_audio) # Mux onto video if progress_fn: progress_fn(0.93, desc="Combining audio and video...") output_video = os.path.join(tmp_dir, "dubbed_output.mp4") subprocess.run( ["ffmpeg", "-y", "-i", video_path, "-i", full_audio, "-c:v", "copy", "-map", "0:v:0", "-map", "1:a:0", "-shortest", output_video], capture_output=True, check=True, ) if progress_fn: progress_fn(1.0, desc="Done!") log.append(f"\n**Transcript:**") log.extend(all_transcripts) return output_video, "\n".join(log) except Exception as e: logger.exception("Qwen dubbing failed") shutil.rmtree(tmp_dir, ignore_errors=True) return None, f"Error: {str(e)}"