owaski committed on
Commit
556cf42
·
verified ·
1 Parent(s): 52f79e2

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +2 -10
  2. app.py +145 -85
  3. requirements.txt +5 -8
README.md CHANGED
@@ -1,14 +1,6 @@
1
  ---
2
- title: Open LiveTranslate
3
- emoji: 👀
4
- colorFrom: green
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: multilingual models of streaming speech translation
12
  ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Open-LiveTranslate
3
+ app_file: app.py
 
 
4
  sdk: gradio
5
  sdk_version: 5.49.1
 
 
 
 
6
  ---
 
 
app.py CHANGED
@@ -1,42 +1,12 @@
1
  import re
2
  import argparse
3
 
4
- import spaces
5
-
6
  import gradio as gr
7
  import numpy as np
8
 
9
  import torch
10
  import torchaudio.functional as F
11
 
12
- from transformers import (
13
- AutoProcessor,
14
- Qwen3OmniMoeThinkerForConditionalGeneration,
15
- Qwen3OmniMoeForConditionalGeneration,
16
- Qwen3OmniMoeProcessor,
17
- GenerationConfig,
18
- Qwen3OmniMoeConfig
19
- )
20
- from qwen_omni_utils import process_mm_info
21
-
22
- model_name = "owaski/Open-LiveTranslate-v0-En-Zh"
23
- model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
24
- model_name,
25
- dtype="auto",
26
- device_map="auto",
27
- attn_implementation="flash_attention_2",
28
- enable_audio_output=False,
29
- )
30
- processor = Qwen3OmniMoeProcessor.from_pretrained(model_name)
31
- generation_config = GenerationConfig(
32
- num_beams=1,
33
- do_sample=False,
34
- temperature=0.6,
35
- top_p=0.95,
36
- top_k=1,
37
- max_new_tokens=2048,
38
- )
39
-
40
  def prepare_speech(new_chunk):
41
  sr, y = new_chunk
42
  # Convert to mono if stereo
@@ -50,75 +20,150 @@ def prepare_speech(new_chunk):
50
 
51
  return resampled_y.numpy()
52
 
53
- def prepare_inputs(messages, y):
54
- if messages is None:
 
 
 
 
 
 
 
 
 
 
 
55
  messages = [
56
  {
57
- "role": "system",
58
- "content": [
59
- {"type": "text", "text": f"You are a professional simultaneous interpreter. You will be given chunks of English audio and you need to translate the audio into Chinese text."}
60
- ]
61
- }
62
  ]
63
  messages.append(
64
  {
65
  "role": "user",
66
- "content": [{"type": "audio", "audio": y}]
67
  }
68
  )
 
69
 
70
- print("len(messages)", len(messages))
71
-
72
- text = processor.apply_chat_template(
73
- messages,
74
- add_generation_prompt=True,
75
- tokenize=False
76
- )
77
- audios, images, videos = process_mm_info(messages, use_audio_in_video=False)
78
-
79
- inputs = processor(
80
- text=text,
81
- audio=audios,
82
- images=images,
83
- videos=videos,
84
- return_tensors="pt",
85
- padding=True,
86
- use_audio_in_video=False
87
- ).to('cuda')
88
- inputs['input_features'] = inputs['input_features'].to(model.dtype)
89
-
90
- return messages, inputs
91
-
92
- @spaces.GPU
93
- def translate(messages, new_chunk):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  y = prepare_speech(new_chunk)
95
- messages, inputs = prepare_inputs(messages, y)
96
- text_ids, _ = model.generate(
97
- **inputs,
98
- generation_config=generation_config,
99
- return_audio=False,
100
- thinker_return_dict_in_generate=True,
101
- use_audio_in_video=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  )
103
- translation = processor.batch_decode(
104
- text_ids.sequences[:, inputs["input_ids"].shape[1] :],
105
- skip_special_tokens=True,
106
- clean_up_tokenization_spaces=False
107
- )[0]
108
  messages.append(
109
  {
110
  "role": "assistant",
111
- "content": [{"type": "text", "text": translation}]
112
  }
113
  )
114
- full_translation = ''.join([message["content"][0]["text"] for message in messages if message["role"] == "assistant"])
115
- return messages, full_translation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
 
118
  with gr.Blocks(css="""
119
  .large-font textarea {
120
  font-size: 20px !important;
121
  font-weight: 500;
 
122
  }
123
  .large-font label {
124
  font-size: 20px !important;
@@ -126,28 +171,43 @@ with gr.Blocks(css="""
126
  }
127
  """) as demo:
128
  gr.Markdown("# Simultaneous Speech Translation Demo")
 
 
 
 
 
 
129
 
130
- state = gr.State()
131
-
132
  with gr.Row():
133
  with gr.Column():
 
 
 
 
 
 
 
134
  audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Audio Input")
135
 
136
  with gr.Row():
137
  with gr.Column():
138
  translation_output = gr.Textbox(
139
  label="Translation",
140
- lines=5,
 
141
  interactive=False,
142
- elem_classes=["large-font"]
 
 
143
  )
144
 
 
145
  audio_input.stream(
146
  translate,
147
- inputs=[state, audio_input],
148
- outputs=[state, translation_output],
149
  show_progress=False,
150
- stream_every=0.96
151
  )
152
 
153
- demo.launch()
 
1
  import re
2
  import argparse
3
 
 
 
4
  import gradio as gr
5
  import numpy as np
6
 
7
  import torch
8
  import torchaudio.functional as F
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def prepare_speech(new_chunk):
11
  sr, y = new_chunk
12
  # Convert to mono if stereo
 
20
 
21
  return resampled_y.numpy()
22
 
23
def wav_array_to_base64(wav_array, sample_rate):
    """Convert a numpy audio array to a base64-encoded 16-bit PCM WAV.

    Stdlib-only encoder (``wave`` + ``base64``) replacing the previous
    ``soundfile`` dependency; soundfile's default WAV subtype is also
    PCM_16, so the encoded audio content is equivalent.

    Args:
        wav_array: 1-D mono array, or 2-D ``[frames, channels]`` array.
            Floating-point data is assumed to lie in [-1.0, 1.0] (values
            outside are clipped); integer data is assumed to already be
            16-bit PCM samples.
        sample_rate: Sample rate in Hz.

    Returns:
        str: The WAV file bytes encoded as base64 ASCII.
    """
    import base64
    import io
    import wave

    data = np.asarray(wav_array)
    channels = 1 if data.ndim == 1 else data.shape[1]

    if np.issubdtype(data.dtype, np.floating):
        # Scale [-1, 1] floats to the int16 range, clipping any overshoot.
        pcm = (np.clip(data, -1.0, 1.0) * 32767.0).astype('<i2')
    else:
        pcm = data.astype('<i2')

    buffer = io.BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())
    buffer.seek(0)
    return base64.b64encode(buffer.read()).decode('utf-8')
33
+
34
def prepare_inputs(messages, audio_base64):
    """Append the new audio chunk to the conversation as a user turn.

    Seeds the conversation with the interpreter system prompt on the
    first call (when *messages* is None or empty), then appends a user
    message carrying the audio as a ``data:audio/wav`` base64 URL.

    Args:
        messages: Existing conversation history (list of message dicts),
            or None/empty on the first chunk.
        audio_base64: Base64-encoded WAV payload for this chunk.

    Returns:
        The updated message list (the input list is mutated in place
        when one was provided).
    """
    if not messages:  # None or empty → start a fresh conversation
        system_turn = {
            "role": "system",
            "content": "You are a professional simultaneous interpreter. You will be given chunks of English audio and you need to translate the audio into Chinese text."
        }
        messages = [system_turn]

    user_turn = {
        "role": "user",
        "content": [{"type": "audio_url", "audio_url": {"url": f"data:audio/wav;base64,{audio_base64}"}}]
    }
    messages.append(user_turn)
    return messages
49
 
50
def translate(messages, new_chunk, chunk_buffer, chunk_size_seconds, last_chunk_time):
    """
    Translate buffered audio chunks via an OpenAI-compatible endpoint.

    Gradio streams ~0.96 s microphone chunks; these are buffered until
    roughly *chunk_size_seconds* of audio has accumulated, then sent as a
    base64 WAV data URL to the local server for speech translation.

    Args:
        messages: Conversation history (list of role/content dicts), or None.
        new_chunk: ``(sample_rate, np.ndarray)`` chunk from the microphone,
            or None when the stream is idle.
        chunk_buffer: List of already-prepared audio arrays, or None.
        chunk_size_seconds: Target request size in seconds (a multiple
            of the 0.96 s streaming unit).
        last_chunk_time: ``time.time()`` of the previous chunk, used to
            detect pauses in the stream.

    Returns:
        Tuple ``(messages, display_translation, chunk_buffer, timestamp)``:
        updated history, the last few translated lines for display, the
        remaining buffered audio, and the timestamp of this chunk (the
        previous timestamp is passed through on idle ticks).
    """
    import os
    import time

    from openai import OpenAI

    current_time = time.time()

    # Idle tick: nothing to process, keep all state untouched.
    if new_chunk is None:
        current_translation = _assistant_text(messages) if messages else ""
        return messages, current_translation, chunk_buffer, last_chunk_time

    if messages is None:
        messages = []
    if chunk_buffer is None:
        chunk_buffer = []

    # A gap > 2 s between chunks means the mic was paused/resumed; drop the
    # partial buffer so audio from different time periods is not concatenated.
    if last_chunk_time is not None and (current_time - last_chunk_time) > 2.0:
        if chunk_buffer:
            print(f"⚠️ Detected pause (gap: {current_time - last_chunk_time:.1f}s). Clearing {len(chunk_buffer)} partial chunks.")
        chunk_buffer = []

    # Prepare and buffer the new chunk.
    y = prepare_speech(new_chunk)
    chunk_buffer.append(y)

    # Number of 0.96 s chunks per request. round() rather than int():
    # the float division can land just below the exact integer
    # (e.g. 2.88 / 0.96), and int() would silently truncate it.
    chunks_needed = max(1, round(chunk_size_seconds / 0.96))

    # Not enough audio yet: return state unchanged except the timestamp.
    if len(chunk_buffer) < chunks_needed:
        return messages, _assistant_text(messages), chunk_buffer, current_time

    # Consume exactly one request's worth of audio; keep any extra chunks
    # for the next iteration.
    concatenated_audio = np.concatenate(chunk_buffer[:chunks_needed])
    chunk_buffer = chunk_buffer[chunks_needed:]

    audio_base64 = wav_array_to_base64(concatenated_audio, 16000)
    messages = prepare_inputs(messages, audio_base64)

    # Larger chunks carry longer audio, so fewer turns fit in context.
    # Base: 30 messages for 1.92 s chunks, scaled proportionally.
    context_window = max(10, int(30 * (1.92 / chunk_size_seconds)))

    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="EMPTY",
    )

    # Served checkpoint; overridable via env var so the demo is not tied
    # to one machine's filesystem layout (default preserved).
    model_path = os.environ.get(
        "OLT_MODEL_PATH",
        "/data/user_data/siqiouya/ckpts/test_swift/Qwen3-Omni-30B-A3B-Instruct-lora/v1-20251104-033331-hf",
    )

    # System prompt plus the most recent turns. Slicing messages[1:] keeps
    # the system message from being sent twice when the history is still
    # shorter than the context window.
    completion = client.chat.completions.create(
        model=model_path,
        messages=[messages[0]] + messages[1:][-context_window:],
    )
    translation = completion.choices[0].message.content

    messages.append(
        {
            "role": "assistant",
            "content": translation
        }
    )

    full_translation = _assistant_text(messages)
    # Keep only the last 5 non-empty lines for display.
    display_translation = _last_nonempty_lines(full_translation, 5)

    return messages, display_translation, chunk_buffer, current_time


def _assistant_text(messages):
    """Concatenate the text of all assistant turns in *messages*."""
    return ''.join(m["content"] for m in messages if m["role"] == "assistant")


def _last_nonempty_lines(text, limit):
    """Return *text* truncated to its trailing *limit* non-empty lines.

    Blank lines between the kept lines are preserved; the full text is
    returned unchanged when it contains *limit* or fewer non-empty lines.
    """
    lines = text.split('\n') if text else ['']
    if sum(1 for line in lines if line.strip()) <= limit:
        return text
    # Walk backwards to the limit-th non-empty line and keep from there.
    count = 0
    for i in range(len(lines) - 1, -1, -1):
        if lines[i].strip():
            count += 1
            if count == limit:
                return '\n'.join(lines[i:])
    return text
160
 
161
 
162
  with gr.Blocks(css="""
163
  .large-font textarea {
164
  font-size: 20px !important;
165
  font-weight: 500;
166
+ overflow-y: auto !important;
167
  }
168
  .large-font label {
169
  font-size: 20px !important;
 
171
  }
172
  """) as demo:
173
  gr.Markdown("# Simultaneous Speech Translation Demo")
174
+ gr.Markdown("**Instructions:** Select chunk size, then click the microphone to start recording. Refresh page to reset the history.")
175
+
176
+ # State components
177
+ messages_state = gr.State(value=[])
178
+ chunk_buffer_state = gr.State(value=[])
179
+ last_chunk_time_state = gr.State(value=None)
180
 
 
 
181
  with gr.Row():
182
  with gr.Column():
183
+ # Chunk size selector (multiples of 0.96)
184
+ chunk_size_selector = gr.Dropdown(
185
+ choices=[0.96, 1.92, 2.88, 3.84, 4.80, 5.76, 6.72, 7.68, 8.64, 9.60],
186
+ value=1.92,
187
+ label="Chunk Size (seconds)",
188
+ info="Larger chunks = more context but slower response. Must be multiple of 0.96s."
189
+ )
190
  audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Audio Input")
191
 
192
  with gr.Row():
193
  with gr.Column():
194
  translation_output = gr.Textbox(
195
  label="Translation",
196
+ lines=3,
197
+ max_lines=5,
198
  interactive=False,
199
+ elem_classes=["large-font"],
200
+ autoscroll=True,
201
+ show_copy_button=True
202
  )
203
 
204
+ # Streaming translation
205
  audio_input.stream(
206
  translate,
207
+ inputs=[messages_state, audio_input, chunk_buffer_state, chunk_size_selector, last_chunk_time_state],
208
+ outputs=[messages_state, translation_output, chunk_buffer_state, last_chunk_time_state],
209
  show_progress=False,
210
+ stream_every=0.96 # Base unit - buffering happens inside translate()
211
  )
212
 
213
+ demo.launch(share=True)
requirements.txt CHANGED
@@ -1,8 +1,5 @@
1
- torch==2.8.0
2
- torchvision==0.23.0
3
- torchaudio==2.8.0
4
- transformers==4.57.1
5
- accelerate
6
- qwen-omni-utils
7
- jupyter
8
- flash-attn
 
1
+ openai
2
+ torch
3
+ torchaudio
4
+ numpy
5
+ soundfile