openbmb
/

MiniCPM-o-4_5

feature-extraction

Model card Files Files and versions

airlsyn committed 29 days ago

Commit

eeb25c9

·

verified ·

1 Parent(s): 5b565c5

Update README.md

Files changed (1) hide show

README.md +5 -1

README.md CHANGED Viewed

@@ -1362,6 +1362,7 @@ user_audio, _ = librosa.load("user_audio.wav", sr=16000, mono=True)
 IN_SAMPLE_RATE = 16000 # input audio sample rate, fixed value
 CHUNK_SAMPLES = IN_SAMPLE_RATE # sample
 OUT_SAMPLE_RATE = 24000 # output audio sample rate, fixed value
 total_samples = len(user_audio)
 num_chunks = (total_samples + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES
@@ -1372,7 +1373,9 @@ for chunk_idx in range(num_chunks):
     chunk_audio = user_audio[start:end]
     is_last_chunk = (chunk_idx == num_chunks - 1)
     user_msg = {"role": "user", "content": [chunk_audio]}
     # For each 1s audio chunk, perform streaming_prefill once to reduce first-token latency
@@ -1398,6 +1401,7 @@ iter_gen = model.streaming_generate(
 audios = []
 text = ""
 if generate_audio:
     for wav_chunk, text_chunk in iter_gen:
         audios.append(wav_chunk)

 IN_SAMPLE_RATE = 16000 # input audio sample rate, fixed value
 CHUNK_SAMPLES = IN_SAMPLE_RATE # sample
 OUT_SAMPLE_RATE = 24000 # output audio sample rate, fixed value
+MIN_AUDIO_SAMPLES = 1600
 total_samples = len(user_audio)
 num_chunks = (total_samples + CHUNK_SAMPLES - 1) // CHUNK_SAMPLES
     chunk_audio = user_audio[start:end]
     is_last_chunk = (chunk_idx == num_chunks - 1)
+    if is_last_chunk and len(chunk_audio) < MIN_AUDIO_SAMPLES:
+        chunk_audio = np.concatenate([chunk_audio, np.zeros(MIN_AUDIO_SAMPLES - len(chunk_audio), dtype=chunk_audio.dtype)])
     user_msg = {"role": "user", "content": [chunk_audio]}
     # For each 1s audio chunk, perform streaming_prefill once to reduce first-token latency
 audios = []
 text = ""
+output_audio_path = ...
 if generate_audio:
     for wav_chunk, text_chunk in iter_gen:
         audios.append(wav_chunk)