Spaces: gpt-omni · Running

Commit 41c7b36: "no streaming"
Committed by gpt-omni · Parent(s): aab4898

app.py CHANGED
@@ -39,6 +39,7 @@ from litgpt.generate.base import sample
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 ckpt_dir = "./checkpoint"
+streaming_output = False
 
 
 OUT_CHUNK = 4096
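This new module-level flag matches the commit message: with streaming_output = False, the Space stops streaming audio chunks and instead synthesizes one complete clip per request. Every change below is gated on it.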
@@ -236,6 +237,7 @@ def run_AT_batch_stream(
     nums_generate = stream_stride
     begin_generate = False
     current_index = 0
+    total_num = 0
     for _ in tqdm(range(2, max_returned_tokens - T + 1)):
         tokens_A, token_T = next_token_batch(
             model,
@@ -278,7 +280,7 @@ def run_AT_batch_stream(
         if index == 7:
             begin_generate = True
 
-        if begin_generate:
+        if begin_generate and streaming_output:
             current_index += 1
             if current_index == nums_generate:
                 current_index = 0
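The gate above now only emits chunks mid-decode when streaming_output is set; otherwise tokens simply accumulate until the loop ends. A runnable toy of that pattern, reusing the diff's counter names (nums_generate, current_index, begin_generate, streaming_output) but with everything else invented:

    def decode(streaming_output, steps=20, stream_stride=4):
        # Toy stand-in for run_AT_batch_stream's emit logic.
        nums_generate = stream_stride
        begin_generate = False
        current_index = 0
        tokens = []
        for index in range(steps):
            tokens.append(index)              # stand-in for one decoded token
            if index == 7:
                begin_generate = True         # warm-up finished
            if begin_generate and streaming_output:
                current_index += 1
                if current_index == nums_generate:
                    current_index = 0
                    yield tokens[-nums_generate:]   # emit a chunk mid-decode
        if not streaming_output:
            yield tokens                      # emit everything at once

    print(list(decode(streaming_output=True)))   # several small chunks
    print(list(decode(streaming_output=False)))  # one final blob

Note the toy yields the final blob instead of returning it; why that distinction matters for the real patch is shown after the next hunk.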
@@ -288,10 +290,17 @@ def run_AT_batch_stream(
 
         input_pos = input_pos.add_(1)
         index += 1
+        total_num += 1
+
     text = text_tokenizer.decode(torch.tensor(list_output[-1]))
     print(f"text output: {text}")
     model.clear_kv_cache()
-
+    if not streaming_output:
+        snac = get_snac(list_output, 7, total_num-7)
+        audio_stream = generate_audio_data(snac, snacmodel, device)
+        return audio_stream
+
+    # return list_output
 
 
 for chunk in run_AT_batch_stream('./data/samples/output1.wav'):
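One Python subtlety is worth flagging here. The unchanged lines between these hunks almost certainly still contain a yield (the result is consumed with for chunk in run_AT_batch_stream(...)), and a single yield anywhere in a function body makes it a generator. In a generator, return audio_stream does not hand the value to a for loop; it becomes the StopIteration payload and the loop sees zero items. A minimal demonstration:

    def gen(streaming=False):
        if streaming:
            yield b"chunk"       # this yield never runs when streaming=False,
        return b"whole clip"     # yet it still makes gen() a generator function

    for chunk in gen(False):
        print(chunk)             # loop body never executes

    it = gen(False)
    try:
        next(it)
    except StopIteration as stop:
        print(stop.value)        # b'whole clip': the return value hides here

So if the non-streaming path is meant to feed the for loops below, returning from run_AT_batch_stream is likely not enough; yielding the final clip (as in the toy above) would keep both call sites working.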
@@ -313,13 +322,16 @@ def process_audio(audio):
         cnt += 1
         audio_data = np.frombuffer(chunk, dtype=np.int16)
         audio_data = audio_data.reshape(-1, OUT_CHANNELS)
-        yield OUT_RATE, audio_data.astype(np.int16)
+        if streaming_output:
+            yield OUT_RATE, audio_data.astype(np.int16)
+        else:
+            return OUT_RATE, audio_data.astype(np.int16)
 
 
 demo = gr.Interface(
     process_audio,
     inputs=gr.Audio(type="filepath", label="Microphone"),
-    outputs=[gr.Audio(label="Response", streaming=True, autoplay=True)],
+    outputs=[gr.Audio(label="Response", streaming=streaming_output, autoplay=True)],
     title="Chat Mini-Omni Demo",
     # live=True,
 )
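On the Gradio side, the output component and the handler have to agree: gr.Audio(streaming=True) as an output expects the function to be a generator yielding (sample_rate, int16 array) chunks, while streaming=False expects a single (sample_rate, array) return. Because of the generator rule above, one function mixing yield and return cannot switch modes at runtime; picking one of two handlers at build time does. A self-contained sketch (the 440 Hz tone and helper names are invented; only the gr.Interface shape mirrors the diff):

    import numpy as np
    import gradio as gr

    STREAM = False  # stand-in for the app's streaming_output flag

    def _tone(sr=24000, secs=1.0):
        t = np.arange(int(sr * secs)) / sr
        return sr, (0.2 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)

    def respond_full(_path):
        return _tone()                        # one complete clip

    def respond_stream(_path):
        sr, wave = _tone()
        for i in range(0, len(wave), 4096):   # hand chunks to the player as made
            yield sr, wave[i:i + 4096]

    demo = gr.Interface(
        respond_stream if STREAM else respond_full,   # one mode per handler
        inputs=gr.Audio(type="filepath", label="Microphone"),
        outputs=[gr.Audio(label="Response", streaming=STREAM, autoplay=True)],
        title="Chat Mini-Omni Demo",
    )

    if __name__ == "__main__":
        demo.launch()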