# Simultaneous speech translation demo (Gradio) — owaski, commit c1c27c7 ("add message")
import re
import argparse
import gradio as gr
import numpy as np
import torch
import torchaudio.functional as F
def prepare_speech(new_chunk):
    """Convert a raw Gradio audio chunk to mono float32 at 16 kHz.

    Args:
        new_chunk: Tuple ``(sample_rate, samples)`` as produced by
            ``gr.Audio(streaming=True)``; samples are int16 PCM with shape
            ``(n,)`` for mono or ``(n, channels)`` for multi-channel input.

    Returns:
        1-D ``np.float32`` array scaled to [-1, 1] and resampled to 16 kHz.
    """
    sr, y = new_chunk
    # Downmix to mono by averaging channels.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # int16 PCM -> float32 in [-1, 1].
    y = y.astype(np.float32)
    y /= 32768.0
    # Skip the torch round-trip when the source is already at 16 kHz.
    if sr == 16000:
        return y
    return F.resample(torch.from_numpy(y), sr, 16000).numpy()
def wav_array_to_base64(wav_array, sample_rate):
    """Encode a float audio array as a base64 string of 16-bit PCM WAV.

    Uses only the standard-library ``wave`` module instead of the previous
    ``soundfile`` dependency; float samples are clipped to [-1, 1] and
    quantized to int16 PCM, which every WAV decoder understands (soundfile
    emitted the less widely supported IEEE-float WAV subtype for float32
    input).

    Args:
        wav_array: Mono ``(n,)`` or multi-channel ``(n, channels)`` float
            array with samples in [-1, 1].
        sample_rate: Sampling rate in Hz written to the WAV header.

    Returns:
        Base64-encoded WAV file contents as an ASCII ``str``.
    """
    import base64
    import io
    import wave

    samples = np.asarray(wav_array)
    channels = 1 if samples.ndim == 1 else samples.shape[1]
    # Clip then scale to the int16 range; 32767 avoids overflow at +1.0.
    pcm = (np.clip(samples, -1.0, 1.0) * 32767.0).astype(np.int16)
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(2)  # 16-bit samples
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm.tobytes())
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
def prepare_inputs(messages, audio_base64):
    """Append one audio chunk to the chat history as a user turn.

    Seeds the history with the interpreter system prompt when it is empty
    (or None), then appends the base64 WAV chunk as an ``audio_url`` user
    message. The history list is mutated in place and also returned.
    """
    system_turn = {
        "role": "system",
        "content": "You are a professional simultaneous interpreter. You will be given chunks of English audio and you need to translate the audio into Chinese text."
    }
    if not messages:  # None or empty history: start a fresh conversation
        messages = [system_turn]

    user_turn = {
        "role": "user",
        "content": [
            {
                "type": "audio_url",
                "audio_url": {"url": f"data:audio/wav;base64,{audio_base64}"},
            }
        ],
    }
    messages.append(user_turn)
    return messages
def translate(messages, new_chunk, chunk_buffer, chunk_size_seconds, last_chunk_time):
    """
    Translate audio chunks with buffering.

    Args:
        messages: Conversation history (list of chat messages) or None.
        new_chunk: New audio chunk from the microphone as
            ``(sample_rate, samples)``, or None on an idle stream tick.
        chunk_buffer: List of buffered 16 kHz float32 audio arrays, or None.
        chunk_size_seconds: Target chunk size in seconds (multiple of 0.96).
        last_chunk_time: Timestamp of last received chunk (to detect pauses).

    Returns:
        (messages, display_translation, chunk_buffer, last_chunk_time)
    """
    from openai import OpenAI
    import time

    current_time = time.time()

    # Idle tick: nothing new arrived; re-emit the accumulated translation.
    if new_chunk is None:
        current_translation = ''.join(
            [message["content"] for message in messages if message["role"] == "assistant"]
        ) if messages else ""
        return messages, current_translation, chunk_buffer, last_chunk_time

    # Initialize state on the first real chunk.
    if messages is None:
        messages = []
    if chunk_buffer is None:
        chunk_buffer = []

    # A gap > 2 s indicates pause/resume; drop the partial buffer so audio
    # from different time periods is never concatenated together.
    if last_chunk_time is not None and (current_time - last_chunk_time) > 2.0:
        if chunk_buffer:
            print(f"⚠️ Detected pause (gap: {current_time - last_chunk_time:.1f}s). Clearing {len(chunk_buffer)} partial chunks.")
        chunk_buffer = []

    # Normalize (mono float32 @ 16 kHz) and buffer the incoming audio.
    y = prepare_speech(new_chunk)
    chunk_buffer.append(y)

    # The stream delivers ~0.96 s units; wait until enough are buffered to
    # reach the selected chunk size.
    chunks_needed = int(chunk_size_seconds / 0.96)
    if len(chunk_buffer) < chunks_needed:
        current_translation = ''.join(
            [message["content"] for message in messages if message["role"] == "assistant"]
        )
        return messages, current_translation, chunk_buffer, current_time

    # Enough audio: splice out the target window, keep any extra for next time.
    concatenated_audio = np.concatenate(chunk_buffer[:chunks_needed])
    chunk_buffer = chunk_buffer[chunks_needed:]

    audio_base64 = wav_array_to_base64(concatenated_audio, 16000)
    messages = prepare_inputs(messages, audio_base64)

    # Larger chunks carry more audio each, so fewer turns fit in context.
    # Base: 30 messages for 1.92 s chunks, scaled proportionally (min 10).
    context_window = max(10, int(30 * (1.92 / chunk_size_seconds)))

    # Model owaski/Open-LiveTranslate-v0-En-Zh served locally with vllm.
    client = OpenAI(
        base_url="https://jaida-avian-irmgard.ngrok-free.dev/v1",
        api_key="",
    )
    model_path = "/data/user_data/siqiouya/ckpts/test_swift/Qwen3-Omni-30B-A3B-Instruct-lora/v1-20251104-033331-hf"
    # Keep the system prompt plus the most recent turns. Slice the tail from
    # messages[1:] so the system message is never duplicated while the
    # history is still shorter than the context window (the previous
    # messages[-context_window:] included messages[0] in that case).
    completion = client.chat.completions.create(
        model=model_path,
        messages=[messages[0]] + messages[1:][-context_window:],
        top_p=0.95,
        temperature=0.6,
        extra_body={"top_k": 20}
    )
    print(f"completion: {completion}")
    translation = completion.choices[0].message.content
    messages.append(
        {
            "role": "assistant",
            "content": translation
        }
    )

    # Full translation so far: all assistant turns concatenated.
    full_translation = ''.join(
        [message["content"] for message in messages if message["role"] == "assistant"]
    )

    # Trim the display to the last 5 non-empty lines; blank lines between
    # them are preserved as-is.
    translation_lines = full_translation.split('\n') if full_translation else ['']
    non_empty_lines = [line for line in translation_lines if line.strip()]
    if len(non_empty_lines) > 5:
        # Walk backwards to locate the 5th-from-last non-empty line.
        count = 0
        for i in range(len(translation_lines) - 1, -1, -1):
            if translation_lines[i].strip():
                count += 1
                if count == 5:
                    display_translation = '\n'.join(translation_lines[i:])
                    break
        else:
            # Unreachable given the guard above, kept as a safe fallback.
            display_translation = full_translation
    else:
        display_translation = full_translation

    return messages, display_translation, chunk_buffer, current_time
# ---------------------------------------------------------------------------
# Gradio UI
# Wires the streaming microphone input through translate() and keeps the
# conversation history, audio buffer, and last-chunk timestamp in gr.State
# components across stream events.
# ---------------------------------------------------------------------------
with gr.Blocks(css="""
.large-font textarea {
font-size: 20px !important;
font-weight: 500;
overflow-y: auto !important;
}
.large-font label {
font-size: 20px !important;
font-weight: bold;
}
""") as demo:
    gr.Markdown("# Simultaneous Speech Translation Demo")
    gr.Markdown("**Instructions:** Select chunk size, then click the microphone to start recording. Refresh page to reset the history.")

    # Per-session state carried between stream callbacks (see translate()).
    messages_state = gr.State(value=[])
    chunk_buffer_state = gr.State(value=[])
    last_chunk_time_state = gr.State(value=None)

    with gr.Row():
        with gr.Column():
            # Chunk size selector — choices are multiples of the 0.96 s
            # streaming unit that translate() buffers into larger chunks.
            chunk_size_selector = gr.Dropdown(
                choices=[0.96, 1.92, 2.88, 3.84, 4.80, 5.76, 6.72, 7.68, 8.64, 9.60],
                value=1.92,
                label="Chunk Size (seconds)",
                info="Larger chunks = more context but slower response. Must be multiple of 0.96s."
            )
            audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Audio Input")

    with gr.Row():
        with gr.Column():
            # Read-only rolling transcript (last ~5 lines; see translate()).
            translation_output = gr.Textbox(
                label="Translation",
                lines=3,
                max_lines=5,
                interactive=False,
                elem_classes=["large-font"],
                autoscroll=True,
                show_copy_button=True
            )

    # Streaming translation: fires on each audio chunk; outputs write back
    # into the same State components that feed the next invocation.
    audio_input.stream(
        translate,
        inputs=[messages_state, audio_input, chunk_buffer_state, chunk_size_selector, last_chunk_time_state],
        outputs=[messages_state, translation_output, chunk_buffer_state, last_chunk_time_state],
        show_progress=False,
        stream_every=0.96  # Base unit - buffering happens inside translate()
    )

demo.launch(share=True)