Spaces:

Pandaisop
/

voice-detection-api

Sleeping

vineetshukla.work@gmail.com

final commit

c5c9261 3 months ago

2.47 kB


	import io
	import base64
	import librosa
	import numpy as np
	import soundfile as sf
	from fastapi import HTTPException
	from app.config import settings
	import logging

	logger = logging.getLogger(__name__)

	def decode_base64_audio(base64_string: str) -> io.BytesIO:
	"""
	Decodes a Base64 string into a BytesIO object.
	"""
	try:
	if "base64," in base64_string:
	base64_string = base64_string.split("base64,")[1]

	audio_data = base64.b64decode(base64_string)
	return io.BytesIO(audio_data)
	except Exception as e:
	raise HTTPException(status_code=400, detail=f"Invalid Base64 audio: {str(e)}")

	def preprocess_audio(audio_file: io.BytesIO):
	"""
	Clean and standardized preprocessing for AI detection.
	Focuses on natural signal preservation to avoid false AI classifications.
	"""
	import tempfile
	import os

	try:
	# Save to temporary file for librosa
	with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as tmp_file:
	tmp_file.write(audio_file.read())
	tmp_path = tmp_file.name

	try:
	# Load audio at 16kHz (Standard for Wav2Vec2)
	y, sr = librosa.load(tmp_path, sr=settings.SAMPLE_RATE)

	# Ensure mono
	if len(y.shape) > 1:
	y = librosa.to_mono(y)

	# 1. Basic Silence Trimming (Safer threshold)
	y_trimmed, _ = librosa.effects.trim(y, top_db=40)
	if len(y_trimmed) > sr * 0.1: # Only use if not too much was cut
	y = y_trimmed

	# 2. Gentle Normalization
	# Instead of target RMS, we use standard peak normalization
	# This preserves the natural dynamics which models use for detection
	if np.max(np.abs(y)) > 0:
	y = y / np.max(np.abs(y))

	# 3. Time Clamping
	max_duration = 30
	if len(y) > sr * max_duration:
	y = y[:sr * max_duration]

	logger.info(f"Natural preprocessing complete: {len(y)/sr:.2f}s")
	return y

	finally:
	if os.path.exists(tmp_path):
	os.unlink(tmp_path)

	except Exception as e:
	raise HTTPException(status_code=400, detail=f"Error processing audio file: {str(e)}")