app.py
CHANGED
@@ -133,9 +133,18 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
         out_path = out_file.name
 
         ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
+
         subprocess.run([
             ffmpeg_exe, '-y', '-i', in_path,
-            '-ac', '1',
+            '-ac', '1',
+            '-ar', str(target_sr),
+            '-af',
+            'highpass=f=80,' +
+            'lowpass=f=8000,' +
+            'dynaudnorm=p=0.95:m=10.0,' +
+            'volume=1.0,' +
+            'aresample=resampler=soxr',
+            out_path
         ], check=True, capture_output=True)
 
         with open(out_path, 'rb') as f:
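
Note: the added arguments chain five filters into one -af filtergraph: an 80 Hz high-pass and an 8 kHz low-pass bracket the speech band, dynaudnorm evens out loudness, and aresample=resampler=soxr selects the soxr resampler (this requires an ffmpeg build with libsoxr; if it is missing, ffmpeg exits non-zero and the HTTPException path below fires). A minimal standalone sketch of the equivalent command; the paths and the system-ffmpeg lookup are illustrative only, not taken from the app:

    import shutil
    import subprocess

    ffmpeg = shutil.which("ffmpeg")  # assumption: a system ffmpeg instead of imageio_ffmpeg
    filters = "highpass=f=80,lowpass=f=8000,dynaudnorm=p=0.95:m=10.0,volume=1.0,aresample=resampler=soxr"
    subprocess.run(
        [ffmpeg, "-y", "-i", "input.webm",   # "input.webm"/"output.wav" are placeholders
         "-ac", "1", "-ar", "16000", "-af", filters,
         "output.wav"],
        check=True, capture_output=True,
    )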
@@ -145,15 +154,105 @@ def preprocess_audio_ffmpeg(audio_data: bytes, target_sr: int = 16000) -> np.nda
         os.unlink(out_path)
 
         audio_array, sr = sf.read(io.BytesIO(wav_data))
+
         if len(audio_array.shape) > 1:
             audio_array = np.mean(audio_array, axis=1)
+
+        if sr != target_sr:
+            logger.warning(f"Audio sampling rate {sr} != target {target_sr}, applying additional resampling...")
+            try:
+                from scipy import signal
+                ratio = target_sr / sr
+                audio_array = signal.resample(audio_array, int(len(audio_array) * ratio))
+                logger.info(f"Successfully resampled using scipy to {target_sr}Hz")
+            except ImportError:
+                logger.warning("scipy not available, using numpy interpolation")
+                ratio = target_sr / sr
+                new_length = int(len(audio_array) * ratio)
+                audio_array = np.interp(
+                    np.linspace(0, len(audio_array), new_length),
+                    np.arange(len(audio_array)),
+                    audio_array
+                )
+
+        audio_array = _validate_and_normalize_audio(audio_array)
+
+        logger.info(f"Audio preprocessing complete: {len(audio_array)} samples at {target_sr}Hz")
         return audio_array.astype(np.float32)
+
     except Exception as e:
         logger.error(f"FFmpeg preprocessing failed: {e}")
         raise HTTPException(status_code=400, detail="Audio preprocessing failed. Ensure ffmpeg is installed.")
 
+def _validate_and_normalize_audio(audio_array: np.ndarray) -> np.ndarray:
+    # Check for silence or very low amplitude
+    rms = np.sqrt(np.mean(audio_array**2))
+    if rms < 0.001:
+        logger.warning("Audio appears to be very quiet or silent")
+
+    max_val = np.max(np.abs(audio_array))
+    if max_val > 0.95:
+        logger.warning(f"Audio may be clipped (max: {max_val:.3f})")
+
+    target_rms = 0.1
+    current_rms = np.sqrt(np.mean(audio_array**2))
+
+    if current_rms > 0:
+        normalization_factor = min(target_rms / current_rms, 2.0)
+        audio_array = audio_array * normalization_factor
+        logger.info(f"Normalized audio RMS from {current_rms:.4f} to {np.sqrt(np.mean(audio_array**2)):.4f}")
+
+    audio_array = np.clip(audio_array, -0.99, 0.99)
+
+    audio_array = audio_array - np.mean(audio_array)
+
+    return audio_array
+
+def chunk_audio(audio_array: np.ndarray, chunk_length: float = 10.0, overlap: float = 1.0, sample_rate: int = 16000) -> list:
+    chunk_samples = int(chunk_length * sample_rate)
+    overlap_samples = int(overlap * sample_rate)
+    step_samples = chunk_samples - overlap_samples
+
+    chunks = []
+    start = 0
+
+    while start < len(audio_array):
+        end = min(start + chunk_samples, len(audio_array))
+        chunk = audio_array[start:end]
+
+        fade_samples = int(0.05 * sample_rate)
+        if len(chunk) > 2 * fade_samples:
+            chunk[:fade_samples] *= np.linspace(0, 1, fade_samples)
+            chunk[-fade_samples:] *= np.linspace(1, 0, fade_samples)
+
+        if len(chunk) < chunk_samples:
+            chunk = np.pad(chunk, (0, chunk_samples - len(chunk)), mode='constant')
+
+        chunk_rms = np.sqrt(np.mean(chunk**2))
+        if chunk_rms < 0.001:
+            logger.warning(f"Chunk {len(chunks)+1} appears to be very quiet (RMS: {chunk_rms:.6f})")
+
+        chunks.append(chunk)
+        start += step_samples
+
+        if end >= len(audio_array):
+            break
+
+    logger.info(f"Split audio into {len(chunks)} chunks of {chunk_length}s each with quality preservation")
+    return chunks
+
 def speech_to_text(audio_data: bytes) -> str:
     audio_array = preprocess_audio_ffmpeg(audio_data)
+
+    audio_duration = len(audio_array) / 16000
+    logger.info(f"Audio duration: {audio_duration:.2f} seconds")
+
+    if audio_duration <= 15:
+        return _process_single_chunk(audio_array)
+    else:
+        return _process_chunked_audio(audio_array)
+
+def _process_single_chunk(audio_array: np.ndarray) -> str:
     candidates = []
     for code in ["yo", "ha", "ig", "en"]:
         model, proc = _get_asr(code)
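
Note: with the defaults (10 s chunks, 1 s overlap, 16 kHz) the step between chunk starts is 9 s, so a 25 s recording yields chunks starting at 0 s, 9 s, and 18 s, the last zero-padded to a full 10 s. A quick sketch, assuming the app's chunk_audio is in scope (the silent input is purely illustrative and will trip the quiet-chunk warnings):

    import numpy as np

    audio = np.zeros(25 * 16000, dtype=np.float32)  # hypothetical 25 s at 16 kHz
    chunks = chunk_audio(audio, chunk_length=10.0, overlap=1.0, sample_rate=16000)
    assert len(chunks) == 3                         # starts at 0 s, 9 s, 18 s
    assert all(len(c) == 160000 for c in chunks)    # each padded to 10 s

One caveat: chunk = audio_array[start:end] is a NumPy view, so the in-place fades also scale the source samples that the next overlapping chunk reads; audio_array[start:end].copy() would keep the chunks independent.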
@@ -172,6 +271,50 @@ def speech_to_text(audio_data: bytes) -> str:
         return max((t for _, t in candidates), key=lambda s: len(s or ""))
     return ""
 
+def _process_chunked_audio(audio_array: np.ndarray) -> str:
+    chunks = chunk_audio(audio_array, chunk_length=10.0, overlap=1.0)
+
+    language_results = {}
+
+    for code in ["yo", "ha", "ig", "en"]:
+        model, proc = _get_asr(code)
+        if model is None or proc is None:
+            continue
+
+        chunk_texts = []
+        for i, chunk in enumerate(chunks):
+            try:
+                text = _run_whisper(model, proc, chunk)
+                if text and text.strip():
+                    chunk_texts.append(text.strip())
+                    logger.info(f"Chunk {i+1}/{len(chunks)} ({code}): {text[:50]}...")
+            except Exception as e:
+                logger.warning(f"Failed to process chunk {i+1} with {code}: {e}")
+                continue
+
+        if chunk_texts:
+            combined_text = " ".join(chunk_texts)
+            language_results[code] = combined_text
+            logger.info(f"Combined {code} result: {combined_text[:100]}...")
+
+    best_result = ""
+    best_confidence = 0
+
+    for lang_code, text in language_results.items():
+        detected_lang = detect_language(text)
+        confidence = len(text.split())
+
+        logger.info(f"Language {lang_code}: detected as {detected_lang}, confidence: {confidence}")
+
+        if detected_lang == lang_code:
+            confidence *= 2
+
+        if confidence > best_confidence:
+            best_confidence = confidence
+            best_result = text
+
+    return best_result if best_result else ""
+
 
 def get_ai_response(text: str) -> str:
     try:
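
Note: _process_chunked_audio keeps the transcript whose word count, doubled when the keyword detector agrees with the ASR model's language code, is highest. The selection rule restated compactly (a paraphrase of the loop above, not a drop-in replacement):

    def _score(lang_code: str, text: str) -> int:
        # Word count as a rough confidence proxy, doubled when keyword
        # detection agrees with the ASR model's language.
        confidence = len(text.split())
        return confidence * 2 if detect_language(text) == lang_code else confidence

    # best_result = max(language_results.items(), key=lambda kv: _score(*kv), default=("", ""))[1]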
@@ -183,12 +326,9 @@ def get_ai_response(text: str) -> str:
         logger.error(f"AI request error: {e}")
         return f"I'm sorry, I couldn't connect to the AI service. You said: '{text}'."
 
-# Enhanced keyword lists for language detection
 HAUSA_WORDS = [
-    # Agricultural terms
     "aikin", "manoma", "gona", "amfanin", "yanayi", "tsaba", "fasaha", "bisa", "noman", "shuka",
     "daji", "rani", "damina", "amfani", "bidi'a", "noma", "bashi", "manure", "tsiro", "gishiri",
-    # Common Hausa words
     "da", "shi", "ta", "su", "mu", "ku", "ni", "kai", "ita", "shi", "ita", "su", "mu", "ku",
     "ina", "yana", "tana", "suna", "muna", "kuna", "na", "ka", "ta", "sa", "mu", "ku",
     "wani", "wata", "wasu", "wadansu", "wadannan", "wannan", "wancan", "wannan",
@@ -197,9 +337,7 @@ HAUSA_WORDS = [
 ]
 
 YORUBA_WORDS = [
-    # Agricultural terms
     "ilé", "ọmọ", "òun", "awọn", "agbẹ", "oko", "ọgbà", "irugbin", "àkọsílẹ", "omi", "ojo", "àgbàlá", "irọlẹ",
-    # Common Yoruba words
     "ni", "ti", "si", "fun", "lati", "ninu", "lori", "labe", "pelu", "ati", "tabi", "sugbon",
     "o", "a", "e", "won", "mi", "re", "wa", "yin", "won", "mi", "re", "wa", "yin",
     "kan", "kankan", "die", "pupo", "gbogbo", "kookan", "kookan", "gbogbo",
@@ -208,9 +346,7 @@ YORUBA_WORDS = [
 ]
 
 IGBO_WORDS = [
-    # Agricultural terms
     "ugbo", "akụkọ", "mmiri", "ala", "ọrụ", "ncheta", "ọhụrụ", "ugwu", "nri", "ahụhụ",
-    # Common Igbo words
     "na", "n'", "maka", "n'ihi", "n'ime", "n'elu", "n'okpuru", "na", "na", "na",
     "m", "i", "o", "ya", "anyị", "unu", "ha", "m", "i", "o", "ya", "anyị", "unu", "ha",
     "otu", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ", "ọtụtụ",
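
Note: several entries repeat within these lists ("shi", "ita", "su", "mu", "ku" in HAUSA_WORDS; "na", "m", "i", "o" above), and the scoring below sums over list entries, so a duplicated keyword counts each time it appears. If that weighting is unintended, an order-preserving dedup is a one-liner (a sketch, not part of the patch):

    HAUSA_WORDS, YORUBA_WORDS, IGBO_WORDS = (
        list(dict.fromkeys(ws)) for ws in (HAUSA_WORDS, YORUBA_WORDS, IGBO_WORDS)
    )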
@@ -219,23 +355,17 @@ IGBO_WORDS = [
 ]
 
 def detect_language_keywords(text: str) -> str:
-    """
-    Lightweight keyword-based language detection.
-    Returns language code: 'ha' (Hausa), 'yo' (Yoruba), 'ig' (Igbo), 'en' (English)
-    """
     text_lower = text.lower().strip()
 
     if not text_lower:
-        return "en"
+        return "en"
 
-    # Count matches for each language
     hausa_count = sum(1 for word in HAUSA_WORDS if word in text_lower)
     yoruba_count = sum(1 for word in YORUBA_WORDS if word in text_lower)
     igbo_count = sum(1 for word in IGBO_WORDS if word in text_lower)
 
     logger.info(f"Language detection scores - Hausa: {hausa_count}, Yoruba: {yoruba_count}, Igbo: {igbo_count}")
 
-    # Return language with highest count, default to English if no matches
     if hausa_count > yoruba_count and hausa_count > igbo_count:
         logger.info("Keyword detection: Hausa")
         return "ha"
@@ -250,9 +380,6 @@ def detect_language_keywords(text: str) -> str:
     return "en"
 
 def detect_language(text: str) -> str:
-    """
-    Main language detection function using lightweight keyword-based approach.
-    """
     logger.info(f"Detecting language for text: '{text[:50]}...'")
     return detect_language_keywords(text)
 
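
Note: matching is substring-based (word in text_lower), so one- and two-letter entries such as "o", "a", "e", or "na" fire inside ordinary English words ("are" contains "a", "e", and "re"), which skews scores on English input. A stricter token-level counter would look like this (a sketch under that assumption, not the app's behavior):

    def keyword_hits(words: list, text: str) -> int:
        # Count each distinct keyword once, and only on whole tokens.
        tokens = set(text.lower().split())
        return sum(1 for w in set(words) if w in tokens)

    keyword_hits(YORUBA_WORDS, "how are you")  # 0, where the substring version scores > 0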
@@ -282,7 +409,6 @@ def text_to_speech_file(text: str) -> str:
     audio_raw = speech_output["audio"]
     sampling_rate = int(speech_output["sampling_rate"])
 
-
     if isinstance(audio_raw, torch.Tensor):
         audio_np = audio_raw.detach().cpu().numpy()
     else:
@@ -292,15 +418,24 @@
     audio_np = audio_np.reshape(-1)
     audio_np = audio_np.astype(np.float32, copy=False)
 
+    target_sr = 16000
+    if sampling_rate != target_sr:
+        logger.info(f"Resampling TTS audio from {sampling_rate}Hz to {target_sr}Hz")
+        ratio = target_sr / sampling_rate
+        new_length = int(len(audio_np) * ratio)
+        audio_np = np.interp(
+            np.linspace(0, len(audio_np), new_length),
+            np.arange(len(audio_np)),
+            audio_np
+        )
+        sampling_rate = target_sr
 
     audio_clipped = np.clip(audio_np, -1.0, 1.0)
     audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
 
-
     fd, path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
 
-
     sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
     return path
 
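
Note: np.interp is linear interpolation with edge clamping (the query point at len(audio_np) simply repeats the last sample), and it applies no anti-aliasing low-pass; that is usually tolerable for downsampling synthesized speech to 16 kHz, but it is the quality trade-off versus scipy.signal.resample or soxr. A self-contained sketch of the same step:

    import numpy as np

    def linear_resample(x: np.ndarray, sr_in: int, sr_out: int) -> np.ndarray:
        # Same approach as the patch: linear interpolation, no anti-alias filter.
        n_out = int(len(x) * sr_out / sr_in)
        return np.interp(np.linspace(0, len(x), n_out), np.arange(len(x)), x)

    tone = np.sin(2 * np.pi * 440 * np.arange(24000) / 24000)  # 1 s of 440 Hz at 24 kHz
    y = linear_resample(tone, 24000, 16000)
    assert len(y) == 16000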