Spaces: gpt-omni · Running

Commit 41c7b36: "no streaming"
Committed by gpt-omni · Parent(s): aab4898

app.py CHANGED
@@ -39,6 +39,7 @@ from litgpt.generate.base import sample
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 ckpt_dir = "./checkpoint"
+streaming_output = False
 
 
 OUT_CHUNK = 4096
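This new module-level flag matches the commit message: with streaming_output = False, the Space stops streaming audio chunks and instead synthesizes one complete clip per request. Every change below is gated on it.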
@@ -236,6 +237,7 @@ def run_AT_batch_stream(
     nums_generate = stream_stride
     begin_generate = False
     current_index = 0
+    total_num = 0
     for _ in tqdm(range(2, max_returned_tokens - T + 1)):
         tokens_A, token_T = next_token_batch(
             model,
@@ -278,7 +280,7 @@ def run_AT_batch_stream(
         if index == 7:
             begin_generate = True
 
-        if begin_generate:
+        if begin_generate and streaming_output:
             current_index += 1
             if current_index == nums_generate:
                 current_index = 0
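The gate above now only emits chunks mid-decode when streaming_output is set; otherwise tokens simply accumulate until the loop ends. A runnable toy of that pattern, reusing the diff's counter names (nums_generate, current_index, begin_generate, streaming_output) but with everything else invented:

    def decode(streaming_output, steps=20, stream_stride=4):
        # Toy stand-in for run_AT_batch_stream's emit logic.
        nums_generate = stream_stride
        begin_generate = False
        current_index = 0
        tokens = []
        for index in range(steps):
            tokens.append(index)              # stand-in for one decoded token
            if index == 7:
                begin_generate = True         # warm-up finished
            if begin_generate and streaming_output:
                current_index += 1
                if current_index == nums_generate:
                    current_index = 0
                    yield tokens[-nums_generate:]   # emit a chunk mid-decode
        if not streaming_output:
            yield tokens                      # emit everything at once

    print(list(decode(streaming_output=True)))   # several small chunks
    print(list(decode(streaming_output=False)))  # one final blob

Note the toy yields the final blob instead of returning it; why that distinction matters for the real patch is shown after the next hunk.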
@@ -288,10 +290,17 @@ def run_AT_batch_stream(
 
         input_pos = input_pos.add_(1)
         index += 1
+        total_num += 1
+
     text = text_tokenizer.decode(torch.tensor(list_output[-1]))
     print(f"text output: {text}")
     model.clear_kv_cache()
-
+    if not streaming_output:
+        snac = get_snac(list_output, 7, total_num-7)
+        audio_stream = generate_audio_data(snac, snacmodel, device)
+        return audio_stream
+
+    # return list_output
 
 
 for chunk in run_AT_batch_stream('./data/samples/output1.wav'):
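One Python subtlety is worth flagging here. The unchanged lines between these hunks almost certainly still contain a yield (the result is consumed with for chunk in run_AT_batch_stream(...)), and a single yield anywhere in a function body makes it a generator. In a generator, return audio_stream does not hand the value to a for loop; it becomes the StopIteration payload and the loop sees zero items. A minimal demonstration:

    def gen(streaming=False):
        if streaming:
            yield b"chunk"       # this yield never runs when streaming=False,
        return b"whole clip"     # yet it still makes gen() a generator function

    for chunk in gen(False):
        print(chunk)             # loop body never executes

    it = gen(False)
    try:
        next(it)
    except StopIteration as stop:
        print(stop.value)        # b'whole clip': the return value hides here

So if the non-streaming path is meant to feed the for loops below, returning from run_AT_batch_stream is likely not enough; yielding the final clip (as in the toy above) would keep both call sites working.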
@@ -313,13 +322,16 @@ def process_audio(audio):
         cnt += 1
         audio_data = np.frombuffer(chunk, dtype=np.int16)
         audio_data = audio_data.reshape(-1, OUT_CHANNELS)
-        yield OUT_RATE, audio_data.astype(np.int16)
+        if streaming_output:
+            yield OUT_RATE, audio_data.astype(np.int16)
+        else:
+            return OUT_RATE, audio_data.astype(np.int16)
 
 
 demo = gr.Interface(
     process_audio,
     inputs=gr.Audio(type="filepath", label="Microphone"),
-    outputs=[gr.Audio(label="Response", streaming=True, autoplay=True)],
+    outputs=[gr.Audio(label="Response", streaming=streaming_output, autoplay=True)],
     title="Chat Mini-Omni Demo",
     # live=True,
 )
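On the Gradio side, the output component and the handler have to agree: gr.Audio(streaming=True) as an output expects the function to be a generator yielding (sample_rate, int16 array) chunks, while streaming=False expects a single (sample_rate, array) return. Because of the generator rule above, one function mixing yield and return cannot switch modes at runtime; picking one of two handlers at build time does. A self-contained sketch (the 440 Hz tone and helper names are invented; only the gr.Interface shape mirrors the diff):

    import numpy as np
    import gradio as gr

    STREAM = False  # stand-in for the app's streaming_output flag

    def _tone(sr=24000, secs=1.0):
        t = np.arange(int(sr * secs)) / sr
        return sr, (0.2 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)

    def respond_full(_path):
        return _tone()                        # one complete clip

    def respond_stream(_path):
        sr, wave = _tone()
        for i in range(0, len(wave), 4096):   # hand chunks to the player as made
            yield sr, wave[i:i + 4096]

    demo = gr.Interface(
        respond_stream if STREAM else respond_full,   # one mode per handler
        inputs=gr.Audio(type="filepath", label="Microphone"),
        outputs=[gr.Audio(label="Response", streaming=STREAM, autoplay=True)],
        title="Chat Mini-Omni Demo",
    )

    if __name__ == "__main__":
        demo.launch()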