owaski committed on
Commit
7de60ad
·
1 Parent(s): dbac1b4

add app and dependency

Browse files
Files changed (2) hide show
  1. app.py +145 -4
  2. requirements.txt +8 -0
app.py CHANGED
@@ -1,7 +1,148 @@
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
# NOTE(review): `re` and `argparse` are imported but never used in this file —
# confirm and remove once verified unused.
import re
import argparse

import gradio as gr
import numpy as np

import torch
import torchaudio.functional as F

from transformers import (
    AutoProcessor,
    Qwen3OmniMoeThinkerForConditionalGeneration,
    Qwen3OmniMoeForConditionalGeneration,
    Qwen3OmniMoeProcessor,
    GenerationConfig,
    Qwen3OmniMoeConfig
)
from qwen_omni_utils import process_mm_info

# Fine-tuned Qwen3-Omni checkpoint for English speech -> Chinese text translation.
model_name = "owaski/Open-LiveTranslate-v0-En-Zh"
model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    model_name,
    dtype="auto",                             # keep the checkpoint's precision
    device_map="auto",                        # place weights on available devices
    attn_implementation="flash_attention_2",  # requires flash-attn + CUDA at runtime
    enable_audio_output=False,                # text-only output; talker head disabled
)
processor = Qwen3OmniMoeProcessor.from_pretrained(model_name)

# Decoding is effectively greedy: do_sample=False and top_k=1, so the
# temperature/top_p values below have no effect on the output.
generation_config = GenerationConfig(
    num_beams=1,
    do_sample=False,
    temperature=0.6,
    top_p=0.95,
    top_k=1,
    max_new_tokens=2048,
)
38
def prepare_speech(new_chunk):
    """Convert a streaming Gradio audio chunk to mono float32 at 16 kHz.

    Args:
        new_chunk: ``(sample_rate, samples)`` tuple as delivered by a
            streaming ``gr.Audio`` component; ``samples`` is a numpy array
            (int16 PCM for microphone capture, possibly multi-channel).

    Returns:
        1-D ``np.float32`` waveform, nominally in [-1, 1], at 16 kHz.
    """
    sr, y = new_chunk

    # Remember whether the raw chunk was integer PCM *before* any conversion,
    # since averaging channels already promotes to float.
    is_int_pcm = np.issubdtype(y.dtype, np.integer)

    # Downmix to mono if the capture is multi-channel.
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    # Only rescale integer PCM; a float chunk is assumed already in [-1, 1]
    # (the original divided unconditionally, corrupting float input).
    if is_int_pcm:
        y /= 32768.0

    # The model expects 16 kHz input; skip the resampler when already there.
    if sr == 16000:
        return y
    return F.resample(torch.from_numpy(y), sr, 16000).numpy()
50
+
51
def prepare_inputs(messages, y):
    """Append an audio chunk to the chat history and tensorize the prompt.

    ``messages`` is the running chat transcript (``None`` on the first chunk,
    in which case the interpreter system prompt is inserted first); ``y`` is a
    mono 16 kHz float32 waveform. Returns the updated transcript together with
    the processor's model inputs, with audio features cast to the model dtype.
    """
    if messages is None:
        # First chunk: seed the conversation with the interpreter role.
        messages = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": f"You are a professional simultaneous interpreter. You will be given chunks of English audio and you need to translate the audio into Chinese text."}
                ],
            }
        ]

    user_turn = {"role": "user", "content": [{"type": "audio", "audio": y}]}
    messages.append(user_turn)

    prompt_text = processor.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    audios, images, videos = process_mm_info(messages, use_audio_in_video=False)

    inputs = processor(
        text=prompt_text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=False,
    )
    # The extracted audio features come back as float32; match the model dtype.
    inputs["input_features"] = inputs["input_features"].to(model.dtype)

    return messages, inputs
87
+
88
def transcribe(messages, new_chunk):
    """Translate one streamed audio chunk and return the updated state.

    Takes the chat history (Gradio state) and the latest microphone chunk,
    runs generation, appends the assistant turn, and returns the history plus
    the concatenation of every assistant reply produced so far.
    """
    waveform = prepare_speech(new_chunk)
    messages, inputs = prepare_inputs(messages, waveform)

    text_ids, _ = model.generate(
        **inputs,
        generation_config=generation_config,
        return_audio=False,
        thinker_return_dict_in_generate=True,
        use_audio_in_video=False,
    )

    # Decode only the newly generated continuation, not the echoed prompt.
    prompt_len = inputs["input_ids"].shape[1]
    translation = processor.batch_decode(
        text_ids.sequences[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    messages.append(
        {"role": "assistant", "content": [{"type": "text", "text": translation}]}
    )

    assistant_texts = [
        turn["content"][0]["text"]
        for turn in messages
        if turn["role"] == "assistant"
    ]
    return messages, ''.join(assistant_texts)
111
+
112
 
113
# Streaming demo UI: microphone audio in, incrementally growing Chinese
# translation out. `transcribe` threads the chat history through Gradio state.
with gr.Blocks(css="""
    .large-font textarea {
        font-size: 20px !important;
        font-weight: 500;
    }
    .large-font label {
        font-size: 20px !important;
        font-weight: bold;
    }
""") as demo:
    gr.Markdown("# Simultaneous Speech Translation Demo")

    with gr.Row():
        with gr.Column():
            # Live microphone capture; streaming=True emits periodic chunks.
            audio_input = gr.Audio(sources=["microphone"], streaming=True, label="Audio Input")
            # Chat-history state fed to (and now also updated by) `transcribe`.
            state_input = gr.State()

    with gr.Row():
        with gr.Column():
            translation_output = gr.Textbox(
                label="Translation",
                lines=5,
                interactive=False,
                elem_classes=["large-font"]
            )
            # Kept for layout/back-compat; no longer used for the history loop.
            state_output = gr.State()

    # BUG FIX: the stream previously wrote the updated history to a *separate*
    # state (`state_output`) while reading from `state_input`, so `transcribe`
    # received messages=None on every chunk and the accumulated translation was
    # always just the latest chunk. Writing back to `state_input` closes the
    # loop so the conversation (and the joined translation) actually grows.
    audio_input.stream(
        transcribe,
        inputs=[state_input, audio_input],
        outputs=[state_input, translation_output],
        show_progress=False,
        stream_every=0.96
    )

demo.launch()
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch==2.8.0
2
+ torchvision==0.23.0
3
+ torchaudio==2.8.0
4
+ transformers==4.57.1
5
+ accelerate
6
+ qwen-omni-utils
7
+ jupyter
8
+ gradio