PlotweaverModel committed on
Commit
639147f
·
verified ·
1 Parent(s): f7b3ceb

Delete qwen_engine.py

Browse files
Files changed (1) hide show
  1. qwen_engine.py +0 -294
qwen_engine.py DELETED
@@ -1,294 +0,0 @@
1
- """
2
- Qwen 3.5 Omni Engine — End-to-end speech-to-speech translation.
3
- Takes English audio in, returns translated audio + transcript out.
4
- No separate ASR/MT/TTS needed — Qwen handles everything in one call.
5
- """
6
-
7
- import os
8
- import base64
9
- import struct
10
- import subprocess
11
- import tempfile
12
- import time
13
- import shutil
14
- import logging
15
-
16
- logger = logging.getLogger(__name__)
17
-
18
- QWEN_MODEL = "qwen3.5-omni-plus"
19
- QWEN_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
20
-
21
-
22
def _get_client():
    """Create an OpenAI-compatible client for the Qwen Dashscope API.

    The API key is read from the ``DASHSCOPE_API_KEY`` environment variable.
    Surrounding whitespace is stripped so that a secret pasted with a trailing
    newline does not end up inside the credential.

    Returns:
        An ``openai.OpenAI`` client pointed at ``QWEN_BASE_URL``.

    Raises:
        RuntimeError: If ``DASHSCOPE_API_KEY`` is unset, empty or blank.
    """
    # Validate configuration before touching the SDK so that a missing key
    # always produces the actionable message below.
    api_key = os.environ.get("DASHSCOPE_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError(
            "DASHSCOPE_API_KEY not set. Add it as a Space secret."
        )

    # Imported lazily: the module stays importable without the openai package.
    from openai import OpenAI

    return OpenAI(api_key=api_key, base_url=QWEN_BASE_URL)
31
-
32
-
33
def _wav_to_base64(wav_path):
    """Return the raw bytes of the file at ``wav_path`` as a base64 text string."""
    with open(wav_path, "rb") as wav_file:
        raw_bytes = wav_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
37
-
38
-
39
def _base64_to_wav(b64_data, output_path, sample_rate=24000, num_channels=1,
                   bits_per_sample=16):
    """Wrap base64-encoded raw PCM audio in a WAV container.

    Args:
        b64_data: Base64 text (or bytes) holding headerless PCM samples.
        output_path: Destination file; overwritten if it exists.
        sample_rate: Samples per second written to the header (default 24000).
        num_channels: Channel count written to the header (default 1, mono).
        bits_per_sample: Sample width in bits (default 16).

    The defaults reproduce the previous fixed behaviour (24 kHz, mono,
    16-bit), so existing two-argument callers are unaffected.
    """
    audio_bytes = base64.b64decode(b64_data)
    data_size = len(audio_bytes)
    byte_rate = sample_rate * num_channels * bits_per_sample // 8
    block_align = num_channels * bits_per_sample // 8

    # Canonical 44-byte PCM header, little-endian, packed in one call:
    # RIFF chunk, "fmt " sub-chunk (16 bytes, format tag 1 = PCM), "data" tag.
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + data_size, b"WAVE",
        b"fmt ", 16, 1, num_channels, sample_rate,
        byte_rate, block_align, bits_per_sample,
        b"data", data_size,
    )

    with open(output_path, "wb") as f:
        f.write(header)
        f.write(audio_bytes)
63
-
64
-
65
def _extract_audio_chunk(video_path, output_wav, start_sec, duration_sec):
    """Cut one audio segment out of ``video_path`` with ffmpeg.

    The segment starts at ``start_sec``, lasts ``duration_sec`` seconds and is
    written to ``output_wav`` as 16 kHz mono 16-bit PCM with the video stream
    dropped. ``subprocess.CalledProcessError`` propagates if ffmpeg exits
    with a non-zero status.
    """
    ffmpeg_cmd = ["ffmpeg", "-y"]
    ffmpeg_cmd += ["-ss", str(start_sec), "-t", str(duration_sec)]
    ffmpeg_cmd += ["-i", video_path, "-vn"]
    ffmpeg_cmd += ["-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    ffmpeg_cmd.append(output_wav)
    subprocess.run(ffmpeg_cmd, capture_output=True, check=True)
73
-
74
-
75
def _get_duration(filepath):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        filepath: Path to the media file to probe.

    Returns:
        The container duration as a float.

    Raises:
        ValueError: If ffprobe fails, prints nothing, or prints something that
            is not a number. The message includes ffprobe's own diagnostics
            instead of the bare "could not convert string to float: ''".
        FileNotFoundError: If the ``ffprobe`` executable is not on PATH
            (raised by ``subprocess.run``).
    """
    # "-v error" (rather than "quiet") keeps stdout clean but lets ffprobe
    # explain failures on stderr so they can be surfaced below.
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", filepath],
        capture_output=True, text=True,
    )
    raw = result.stdout.strip()
    if result.returncode != 0 or not raw:
        detail = result.stderr.strip() or (
            f"ffprobe produced no duration (exit code {result.returncode})"
        )
        raise ValueError(f"Could not read duration of {filepath!r}: {detail}")
    try:
        return float(raw)
    except ValueError as err:
        raise ValueError(
            f"Could not read duration of {filepath!r}: "
            f"unexpected ffprobe output {raw!r}"
        ) from err
83
-
84
-
85
def _concatenate_wavs(wav_files, output_path):
    """Join WAV files end-to-end into ``output_path``.

    A single input is simply copied. Several inputs are stream-copied through
    ffmpeg's concat demuxer, which reads the file names from a temporary list
    written next to ``output_path``.

    Args:
        wav_files: Iterable of input WAV paths, in playback order.
        output_path: Destination WAV path.

    Raises:
        ValueError: If ``wav_files`` is empty.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    wav_files = list(wav_files)
    if not wav_files:
        # Fail with a clear message instead of feeding ffmpeg an empty list.
        raise ValueError("No WAV files to concatenate.")
    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return

    list_file = output_path + ".txt"
    try:
        with open(list_file, "w", encoding="utf-8") as f:
            for wav in wav_files:
                # Absolute paths: the concat demuxer resolves relative entries
                # against the list file's directory, not the working directory.
                # A single quote inside a quoted entry is written as '\''.
                entry = os.path.abspath(wav).replace("'", "'\\''")
                f.write(f"file '{entry}'\n")
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", list_file, "-c", "copy", output_path],
            capture_output=True, check=True,
        )
    finally:
        # Remove the list file even when ffmpeg fails.
        if os.path.exists(list_file):
            os.remove(list_file)
100
-
101
-
102
def _build_system_prompt(language_name):
    """Return the dubbing-translator system prompt targeting ``language_name``.

    The prompt is a fixed list of numbered instructions, joined with
    newlines, in which the target language name is interpolated.
    """
    prompt_lines = [
        "You are a professional video dubbing translator. You will receive audio in English.",
        "Your task:",
        "1. Listen carefully to the English speech.",
        f"2. Translate it into natural, fluent {language_name}.",
        f"3. Respond ONLY with the {language_name} translation spoken aloud — no English, no commentary,",
        f" no meta-text, no transliteration. Speak entirely in {language_name}.",
        "4. Match the tone, emotion, and pacing of the original speaker as closely as possible.",
        "5. If there are pauses or silence in the original audio, maintain similar pacing.",
        f"6. Translate idioms and cultural references into their {language_name} equivalents.",
        "7. Use clear, professional pronunciation suitable for a broad audience.",
    ]
    return "\n".join(prompt_lines)
116
-
117
-
118
def _merge_audio_b64(audio_chunks):
    """Combine streamed base64 audio fragments into one base64 string."""
    # The lenient base64 decoder ignores everything after the first completed
    # "=" padding. If a fragment other than the last one is padded, the
    # fragments were encoded independently and must be decoded one by one,
    # otherwise the audio would be silently truncated.
    if any("=" in piece for piece in audio_chunks[:-1]):
        raw = b"".join(base64.b64decode(piece) for piece in audio_chunks)
        return base64.b64encode(raw).decode("ascii")
    return "".join(audio_chunks)


def translate_chunk_qwen(wav_path, voice, language_name, chunk_index=0):
    """
    Translate a single audio chunk using Qwen Omni.

    The WAV file is sent base64-encoded in one streaming chat-completion
    request; text deltas are collected into the transcript and audio deltas
    into a WAV file written next to the input.

    Args:
        wav_path: Path to input WAV file (English audio)
        voice: Qwen voice name (e.g. "Ethan", "Cherry")
        language_name: Full language name for the system prompt
        chunk_index: Used in the output file name and in the log line

    Returns:
        (output_wav_path, transcript) or (None, transcript) if no audio
    """
    client = _get_client()
    audio_b64 = _wav_to_base64(wav_path)

    # Derive the output name from the extension only. A plain
    # str.replace(".wav", ...) would also rewrite ".wav" inside directory
    # names and, for a path without ".wav", would return the input path
    # itself, so the result would overwrite the source chunk.
    stem, _ext = os.path.splitext(wav_path)
    output_wav = f"{stem}_qwen_{chunk_index}.wav"

    system_prompt = _build_system_prompt(language_name)
    user_prompt = f"Translate this English speech into {language_name}. Respond only with the spoken {language_name} translation."

    t0 = time.monotonic()  # monotonic clock: immune to wall-clock adjustments
    completion = client.chat.completions.create(
        model=QWEN_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {
                        "type": "input_audio",
                        "input_audio": {
                            "data": f"data:audio/wav;base64,{audio_b64}",
                            "format": "wav",
                        },
                    },
                    {"type": "text", "text": user_prompt},
                ],
            },
        ],
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        stream=True,
        stream_options={"include_usage": True},
    )

    audio_chunks = []
    transcript_parts = []

    for event in completion:
        if not event.choices:
            # Events without choices carry no delta to read.
            continue
        delta = event.choices[0].delta

        text = getattr(delta, "content", None)
        if text:
            transcript_parts.append(text)

        audio = getattr(delta, "audio", None)
        if not audio:
            continue
        # The audio payload may arrive as a plain dict or as an object.
        if isinstance(audio, dict):
            data = audio.get("data")
        else:
            data = getattr(audio, "data", None)
        if data:
            audio_chunks.append(data)

    transcript = "".join(transcript_parts)
    elapsed = time.monotonic() - t0
    logger.info(
        "Qwen chunk %s: %.1fs, transcript=%s",
        chunk_index, elapsed, transcript[:60],
    )

    if audio_chunks:
        _base64_to_wav(_merge_audio_b64(audio_chunks), output_wav)
        return output_wav, transcript

    return None, transcript
189
-
190
-
191
def _write_silence_wav(output_path, duration_sec):
    """Write ``duration_sec`` seconds of 24 kHz mono 16-bit silence with ffmpeg."""
    subprocess.run(
        ["ffmpeg", "-y", "-f", "lavfi",
         "-i", "anullsrc=r=24000:cl=mono",
         "-t", str(duration_sec), "-acodec", "pcm_s16le", output_path],
        capture_output=True, check=True,
    )


def dub_video_qwen(video_path, language_name, voice="Ethan", chunk_seconds=120, progress_fn=None):
    """
    Full video dubbing pipeline using Qwen Omni.
    Splits video into chunks, translates each chunk via Qwen API,
    concatenates results, and muxes back onto video.

    Args:
        video_path: Path to input video
        language_name: Full language name (e.g. "French", "Arabic")
        voice: Qwen voice name
        chunk_seconds: Audio chunk duration for API calls; must be positive
        progress_fn: Optional gradio progress callback, called as
            ``progress_fn(fraction, desc=message)``

    Returns:
        (output_video_path, log_text) on success. The output video lives in a
        freshly created temporary directory that is left in place for the
        caller. On any failure, or for videos longer than one hour, returns
        (None, message) and leaves no temporary directory behind.
    """
    def report(fraction, message):
        if progress_fn:
            progress_fn(fraction, desc=message)

    log = []
    # Created only after the input passes validation, so early exits cannot
    # leak an empty temporary directory.
    tmp_dir = None

    try:
        if chunk_seconds <= 0:
            # Zero would fail with a cryptic modulo error and a negative value
            # would produce nonsensical chunk boundaries.
            raise ValueError("chunk_seconds must be a positive number")

        # Duration
        report(0.05, "Analyzing video...")
        total_duration = _get_duration(video_path)
        log.append(f"**Video:** {total_duration:.1f}s")
        log.append("**Engine:** Qwen 3.5 Omni")
        log.append(f"**Voice:** {voice}")
        log.append(f"**Language:** {language_name}")

        if total_duration > 3600:
            return None, "Video longer than 1 hour — please use a shorter clip."

        tmp_dir = tempfile.mkdtemp(prefix="qwen_dub_")

        # Split into chunks
        report(0.1, "Extracting audio chunks...")
        whole, remainder = divmod(total_duration, chunk_seconds)
        num_chunks = max(1, int(whole) + (1 if remainder > 0 else 0))
        log.append(f"**Chunks:** {num_chunks} ({chunk_seconds}s each)")

        input_chunks = []
        for i in range(num_chunks):
            start = i * chunk_seconds
            duration = min(chunk_seconds, total_duration - start)
            chunk_path = os.path.join(tmp_dir, f"chunk_{i:03d}.wav")
            _extract_audio_chunk(video_path, chunk_path, start, duration)
            input_chunks.append(chunk_path)

        # Translate each chunk
        output_chunks = []
        all_transcripts = []

        for i, chunk_path in enumerate(input_chunks):
            report(0.15 + 0.7 * (i / num_chunks),
                   f"Translating chunk {i+1}/{num_chunks}...")

            result_path, transcript = translate_chunk_qwen(
                chunk_path, voice, language_name, i
            )
            if transcript:
                all_transcripts.append(f"**[{i+1}]** {transcript}")

            if result_path:
                output_chunks.append(result_path)
            else:
                # Silence fallback: keep the timeline aligned by inserting
                # silence as long as the untranslated input chunk.
                silence_path = os.path.join(tmp_dir, f"silence_{i:03d}.wav")
                _write_silence_wav(silence_path, _get_duration(chunk_path))
                output_chunks.append(silence_path)

        # Concatenate
        report(0.88, "Assembling audio...")
        full_audio = os.path.join(tmp_dir, "full_dubbed.wav")
        _concatenate_wavs(output_chunks, full_audio)

        # Mux onto video: copy the original video stream, take audio from the
        # dubbed track, stop at the shorter of the two.
        report(0.93, "Combining audio and video...")
        output_video = os.path.join(tmp_dir, "dubbed_output.mp4")
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-i", full_audio,
             "-c:v", "copy", "-map", "0:v:0", "-map", "1:a:0",
             "-shortest", output_video],
            capture_output=True, check=True,
        )

        report(1.0, "Done!")

        log.append("\n**Transcript:**")
        log.extend(all_transcripts)

        return output_video, "\n".join(log)

    except Exception as e:
        # Top-level boundary for the UI: log the traceback, clean up, and
        # report the failure as text instead of raising.
        logger.exception("Qwen dubbing failed")
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir, ignore_errors=True)
        return None, f"Error: {e}"