airlsyn committed on
Commit
eeb25c9
·
verified ·
1 Parent(s): 5b565c5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +5 -1
README.md CHANGED
@@ -1362,6 +1362,7 @@ user_audio, _ = librosa.load("user_audio.wav", sr=16000, mono=True)
1362
  IN_SAMPLE_RATE = 16000 # input audio sample rate, fixed value
1363
  CHUNK_SAMPLES = IN_SAMPLE_RATE # sample
1364
  OUT_SAMPLE_RATE = 24000 # output audio sample rate, fixed value
 
1365
 
1366
  total_samples = len(user_audio)
1367
  num_chunks = (total_samples + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES
@@ -1372,7 +1373,9 @@ for chunk_idx in range(num_chunks):
1372
  chunk_audio = user_audio[start:end]
1373
 
1374
  is_last_chunk = (chunk_idx == num_chunks - 1)
1375
-
 
 
1376
  user_msg = {"role": "user", "content": [chunk_audio]}
1377
 
1378
  # For each 1s audio chunk, perform streaming_prefill once to reduce first-token latency
@@ -1398,6 +1401,7 @@ iter_gen = model.streaming_generate(
1398
  audios = []
1399
  text = ""
1400
 
 
1401
  if generate_audio:
1402
  for wav_chunk, text_chunk in iter_gen:
1403
  audios.append(wav_chunk)
 
1362
  IN_SAMPLE_RATE = 16000 # input audio sample rate, fixed value
1363
  CHUNK_SAMPLES = IN_SAMPLE_RATE # sample
1364
  OUT_SAMPLE_RATE = 24000 # output audio sample rate, fixed value
1365
+ MIN_AUDIO_SAMPLES = 1600
1366
 
1367
  total_samples = len(user_audio)
1368
  num_chunks = (total_samples + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES
 
1373
  chunk_audio = user_audio[start:end]
1374
 
1375
  is_last_chunk = (chunk_idx == num_chunks - 1)
1376
+ if is_last_chunk and len(chunk_audio) < MIN_AUDIO_SAMPLES:
1377
+ chunk_audio = np.concatenate([chunk_audio, np.zeros(MIN_AUDIO_SAMPLES - len(chunk_audio), dtype=chunk_audio.dtype)])
1378
+
1379
  user_msg = {"role": "user", "content": [chunk_audio]}
1380
 
1381
  # For each 1s audio chunk, perform streaming_prefill once to reduce first-token latency
 
1401
  audios = []
1402
  text = ""
1403
 
1404
+ output_audio_path = ...
1405
  if generate_audio:
1406
  for wav_chunk, text_chunk in iter_gen:
1407
  audios.append(wav_chunk)