Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Upload folder using huggingface_hub
Browse files- README.md +2 -10
- app.py +145 -85
- requirements.txt +5 -8
README.md
CHANGED
|
@@ -1,14 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title: Open
|
| 3 |
-
|
| 4 |
-
colorFrom: green
|
| 5 |
-
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
sdk_version: 5.49.1
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: apache-2.0
|
| 11 |
-
short_description: multilingual models of streaming speech translation
|
| 12 |
---
|
| 13 |
-
|
| 14 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Open-LiveTranslate
|
| 3 |
+
app_file: app.py
|
|
|
|
|
|
|
| 4 |
sdk: gradio
|
| 5 |
sdk_version: 5.49.1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
---
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,42 +1,12 @@
|
|
| 1 |
import re
|
| 2 |
import argparse
|
| 3 |
|
| 4 |
-
import spaces
|
| 5 |
-
|
| 6 |
import gradio as gr
|
| 7 |
import numpy as np
|
| 8 |
|
| 9 |
import torch
|
| 10 |
import torchaudio.functional as F
|
| 11 |
|
| 12 |
-
from transformers import (
|
| 13 |
-
AutoProcessor,
|
| 14 |
-
Qwen3OmniMoeThinkerForConditionalGeneration,
|
| 15 |
-
Qwen3OmniMoeForConditionalGeneration,
|
| 16 |
-
Qwen3OmniMoeProcessor,
|
| 17 |
-
GenerationConfig,
|
| 18 |
-
Qwen3OmniMoeConfig
|
| 19 |
-
)
|
| 20 |
-
from qwen_omni_utils import process_mm_info
|
| 21 |
-
|
| 22 |
-
model_name = "owaski/Open-LiveTranslate-v0-En-Zh"
|
| 23 |
-
model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
|
| 24 |
-
model_name,
|
| 25 |
-
dtype="auto",
|
| 26 |
-
device_map="auto",
|
| 27 |
-
attn_implementation="flash_attention_2",
|
| 28 |
-
enable_audio_output=False,
|
| 29 |
-
)
|
| 30 |
-
processor = Qwen3OmniMoeProcessor.from_pretrained(model_name)
|
| 31 |
-
generation_config = GenerationConfig(
|
| 32 |
-
num_beams=1,
|
| 33 |
-
do_sample=False,
|
| 34 |
-
temperature=0.6,
|
| 35 |
-
top_p=0.95,
|
| 36 |
-
top_k=1,
|
| 37 |
-
max_new_tokens=2048,
|
| 38 |
-
)
|
| 39 |
-
|
| 40 |
def prepare_speech(new_chunk):
|
| 41 |
sr, y = new_chunk
|
| 42 |
# Convert to mono if stereo
|
|
@@ -50,75 +20,150 @@ def prepare_speech(new_chunk):
|
|
| 50 |
|
| 51 |
return resampled_y.numpy()
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
messages = [
|
| 56 |
{
|
| 57 |
-
"role": "system",
|
| 58 |
-
"content":
|
| 59 |
-
|
| 60 |
-
]
|
| 61 |
-
}
|
| 62 |
]
|
| 63 |
messages.append(
|
| 64 |
{
|
| 65 |
"role": "user",
|
| 66 |
-
"content": [{"type": "
|
| 67 |
}
|
| 68 |
)
|
|
|
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
y = prepare_speech(new_chunk)
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
)
|
| 103 |
-
translation =
|
| 104 |
-
text_ids.sequences[:, inputs["input_ids"].shape[1] :],
|
| 105 |
-
skip_special_tokens=True,
|
| 106 |
-
clean_up_tokenization_spaces=False
|
| 107 |
-
)[0]
|
| 108 |
messages.append(
|
| 109 |
{
|
| 110 |
"role": "assistant",
|
| 111 |
-
"content":
|
| 112 |
}
|
| 113 |
)
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
|
| 118 |
with gr.Blocks(css="""
|
| 119 |
.large-font textarea {
|
| 120 |
font-size: 20px !important;
|
| 121 |
font-weight: 500;
|
|
|
|
| 122 |
}
|
| 123 |
.large-font label {
|
| 124 |
font-size: 20px !important;
|
|
@@ -126,28 +171,43 @@ with gr.Blocks(css="""
|
|
| 126 |
}
|
| 127 |
""") as demo:
|
| 128 |
gr.Markdown("# Simultaneous Speech Translation Demo")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
state = gr.State()
|
| 131 |
-
|
| 132 |
with gr.Row():
|
| 133 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Audio Input")
|
| 135 |
|
| 136 |
with gr.Row():
|
| 137 |
with gr.Column():
|
| 138 |
translation_output = gr.Textbox(
|
| 139 |
label="Translation",
|
| 140 |
-
lines=
|
|
|
|
| 141 |
interactive=False,
|
| 142 |
-
elem_classes=["large-font"]
|
|
|
|
|
|
|
| 143 |
)
|
| 144 |
|
|
|
|
| 145 |
audio_input.stream(
|
| 146 |
translate,
|
| 147 |
-
inputs=[
|
| 148 |
-
outputs=[
|
| 149 |
show_progress=False,
|
| 150 |
-
stream_every=0.96
|
| 151 |
)
|
| 152 |
|
| 153 |
-
demo.launch()
|
|
|
|
| 1 |
import re
|
| 2 |
import argparse
|
| 3 |
|
|
|
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
import numpy as np
|
| 6 |
|
| 7 |
import torch
|
| 8 |
import torchaudio.functional as F
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
def prepare_speech(new_chunk):
|
| 11 |
sr, y = new_chunk
|
| 12 |
# Convert to mono if stereo
|
|
|
|
| 20 |
|
| 21 |
return resampled_y.numpy()
|
| 22 |
|
| 23 |
+
def wav_array_to_base64(wav_array, sample_rate):
    """Serialize an audio array as an in-memory WAV file and base64-encode it.

    Args:
        wav_array: Audio samples accepted by ``soundfile.write``.
        sample_rate: Sampling rate, in Hz, recorded in the WAV header.

    Returns:
        The WAV file bytes as a base64 string (UTF-8 decoded).
    """
    import base64
    import io
    import soundfile as sf

    # Write into memory rather than to disk; getvalue() yields the whole
    # buffer regardless of the current stream position.
    wav_file = io.BytesIO()
    sf.write(wav_file, wav_array, sample_rate, format='WAV')
    encoded = base64.b64encode(wav_file.getvalue())
    return encoded.decode('utf-8')
|
| 33 |
+
|
| 34 |
+
def prepare_inputs(messages, audio_base64):
    """Append one audio turn to the chat history and return the history.

    A falsy ``messages`` (``None`` or an empty list) starts a fresh history
    seeded with the interpreter system prompt; otherwise the given list is
    extended in place and returned.

    Args:
        messages: Existing chat history, or ``None``/empty to start one.
        audio_base64: Base64-encoded WAV data for the new user turn.

    Returns:
        The history list ending with the new user audio message.
    """
    audio_turn = {
        "role": "user",
        "content": [
            {
                "type": "audio_url",
                "audio_url": {"url": f"data:audio/wav;base64,{audio_base64}"},
            }
        ],
    }

    history = messages
    if not history:  # None or empty list: begin with the system prompt
        system_turn = {
            "role": "system",
            "content": "You are a professional simultaneous interpreter. You will be given chunks of English audio and you need to translate the audio into Chinese text.",
        }
        history = [system_turn]

    history.append(audio_turn)
    return history
|
| 49 |
|
| 50 |
+
def translate(messages, new_chunk, chunk_buffer, chunk_size_seconds, last_chunk_time):
    """Buffer streamed microphone audio and translate it once enough has arrived.

    Args:
        messages: Conversation history (list of chat message dicts) or None.
        new_chunk: New audio chunk from the microphone, or None.
        chunk_buffer: List of buffered, already prepared audio arrays, or None.
        chunk_size_seconds: Target amount of audio per translation request.
        last_chunk_time: ``time.time()`` stamp of the previous chunk, used to
            detect pauses; None when no chunk has been seen yet.

    Returns:
        Tuple ``(messages, display_translation, chunk_buffer, timestamp)``.
        ``display_translation`` is the tail of the accumulated translation
        (at most the last 5 non-empty lines). ``timestamp`` is the current
        time, or the unchanged ``last_chunk_time`` when ``new_chunk`` is None.

    Raises:
        Whatever the OpenAI client raises if the request fails; the pending
        user turn is removed from the history before re-raising.
    """
    import time

    current_time = time.time()

    # Nothing new to process: report the current state unchanged.
    if new_chunk is None:
        return messages, _display_tail(_assistant_text(messages)), chunk_buffer, last_chunk_time

    if messages is None:
        messages = []
    if chunk_buffer is None:
        chunk_buffer = []

    # A long gap between chunks indicates pause/resume. Drop the partial
    # buffer so audio from different time periods is never concatenated.
    if last_chunk_time is not None and (current_time - last_chunk_time) > _PAUSE_GAP_SECONDS:
        if chunk_buffer:
            print(f"⚠️ Detected pause (gap: {current_time - last_chunk_time:.1f}s). Clearing {len(chunk_buffer)} partial chunks.")
        chunk_buffer = []

    # Prepare and buffer the new chunk
    chunk_buffer.append(prepare_speech(new_chunk))

    # Number of base-length stream chunks making up one request. round()
    # guards against float error in the division (truncation could yield
    # N-1), and max(1, ...) avoids concatenating an empty list.
    chunks_needed = max(1, round(chunk_size_seconds / _BASE_CHUNK_SECONDS))

    # Not enough audio yet: keep buffering and show the same tail view the
    # translation path shows, so the textbox does not flip between views.
    if len(chunk_buffer) < chunks_needed:
        return messages, _display_tail(_assistant_text(messages)), chunk_buffer, current_time

    concatenated_audio = np.concatenate(chunk_buffer[:chunks_needed])
    chunk_buffer = chunk_buffer[chunks_needed:]  # keep extras for the next call

    audio_base64 = wav_array_to_base64(concatenated_audio, 16000)
    messages = prepare_inputs(messages, audio_base64)

    # Larger chunks mean longer audio per message, so fewer messages are kept
    # in context. Base: 30 messages for 1.92s chunks, scaled proportionally.
    context_window = max(10, int(30 * (1.92 / chunk_size_seconds)))

    # System prompt plus the most recent turns. Slicing the tail from
    # messages[1:] keeps the system prompt from being sent twice when the
    # history is shorter than the window.
    request_messages = messages[:1] + messages[1:][-context_window:]

    # Imported here so the early-return paths above do not need the package.
    from openai import OpenAI

    client = OpenAI(
        base_url="http://localhost:8000/v1",
        api_key="EMPTY",
    )
    model_path = "/data/user_data/siqiouya/ckpts/test_swift/Qwen3-Omni-30B-A3B-Instruct-lora/v1-20251104-033331-hf"

    try:
        completion = client.chat.completions.create(
            model=model_path,
            messages=request_messages,
        )
    except Exception:
        # Do not leave an unanswered user turn in the shared history.
        messages.pop()
        raise

    # The API may return no text content; treat that as an empty translation.
    translation = completion.choices[0].message.content or ""
    messages.append(
        {
            "role": "assistant",
            "content": translation,
        }
    )

    display_translation = _display_tail(_assistant_text(messages))
    return messages, display_translation, chunk_buffer, current_time


# Duration of one streamed microphone chunk (matches stream_every in the UI).
_BASE_CHUNK_SECONDS = 0.96
# Gap between chunks that is treated as a pause/resume.
_PAUSE_GAP_SECONDS = 2.0
# Number of non-empty lines kept for display.
_MAX_DISPLAY_LINES = 5


def _assistant_text(messages):
    """Concatenate the content of all assistant messages ('' if no history)."""
    if not messages:
        return ""
    return "".join(
        message["content"] or ""
        for message in messages
        if message["role"] == "assistant"
    )


def _display_tail(text, limit=_MAX_DISPLAY_LINES):
    """Return ``text`` from its ``limit``-th-from-last non-empty line onward.

    Text with ``limit`` or fewer non-empty lines is returned unchanged; blank
    lines inside the kept tail are preserved.
    """
    lines = text.split("\n")
    non_empty_positions = [index for index, line in enumerate(lines) if line.strip()]
    if len(non_empty_positions) <= limit:
        return text
    return "\n".join(lines[non_empty_positions[-limit]:])
|
| 160 |
|
| 161 |
|
| 162 |
with gr.Blocks(css="""
|
| 163 |
.large-font textarea {
|
| 164 |
font-size: 20px !important;
|
| 165 |
font-weight: 500;
|
| 166 |
+
overflow-y: auto !important;
|
| 167 |
}
|
| 168 |
.large-font label {
|
| 169 |
font-size: 20px !important;
|
|
|
|
| 171 |
}
|
| 172 |
""") as demo:
|
| 173 |
gr.Markdown("# Simultaneous Speech Translation Demo")
|
| 174 |
+
gr.Markdown("**Instructions:** Select chunk size, then click the microphone to start recording. Refresh page to reset the history.")
|
| 175 |
+
|
| 176 |
+
# State components
|
| 177 |
+
messages_state = gr.State(value=[])
|
| 178 |
+
chunk_buffer_state = gr.State(value=[])
|
| 179 |
+
last_chunk_time_state = gr.State(value=None)
|
| 180 |
|
|
|
|
|
|
|
| 181 |
with gr.Row():
|
| 182 |
with gr.Column():
|
| 183 |
+
# Chunk size selector (multiples of 0.96)
|
| 184 |
+
chunk_size_selector = gr.Dropdown(
|
| 185 |
+
choices=[0.96, 1.92, 2.88, 3.84, 4.80, 5.76, 6.72, 7.68, 8.64, 9.60],
|
| 186 |
+
value=1.92,
|
| 187 |
+
label="Chunk Size (seconds)",
|
| 188 |
+
info="Larger chunks = more context but slower response. Must be multiple of 0.96s."
|
| 189 |
+
)
|
| 190 |
audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Audio Input")
|
| 191 |
|
| 192 |
with gr.Row():
|
| 193 |
with gr.Column():
|
| 194 |
translation_output = gr.Textbox(
|
| 195 |
label="Translation",
|
| 196 |
+
lines=3,
|
| 197 |
+
max_lines=5,
|
| 198 |
interactive=False,
|
| 199 |
+
elem_classes=["large-font"],
|
| 200 |
+
autoscroll=True,
|
| 201 |
+
show_copy_button=True
|
| 202 |
)
|
| 203 |
|
| 204 |
+
# Streaming translation
|
| 205 |
audio_input.stream(
|
| 206 |
translate,
|
| 207 |
+
inputs=[messages_state, audio_input, chunk_buffer_state, chunk_size_selector, last_chunk_time_state],
|
| 208 |
+
outputs=[messages_state, translation_output, chunk_buffer_state, last_chunk_time_state],
|
| 209 |
show_progress=False,
|
| 210 |
+
stream_every=0.96 # Base unit - buffering happens inside translate()
|
| 211 |
)
|
| 212 |
|
| 213 |
+
demo.launch(share=True)
|
requirements.txt
CHANGED
|
@@ -1,8 +1,5 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
torchaudio
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
qwen-omni-utils
|
| 7 |
-
jupyter
|
| 8 |
-
flash-attn
|
|
|
|
| 1 |
+
openai
|
| 2 |
+
torch
|
| 3 |
+
torchaudio
|
| 4 |
+
numpy
|
| 5 |
+
soundfile
|
|
|
|
|
|
|
|
|