Spaces:

NLP-Debater-Project
/

FastAPI-Backend-Models

Sleeping

FastAPI-Backend-Models / services /stt_service.py

malek-messaoudii

Enhance configuration validation and audio processing limits

918acab about 2 months ago

1.93 kB

	import asyncio
	import base64
	import logging
	import mimetypes

	from services.gemini_client import get_gemini_client

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	async def speech_to_text(
	    audio_bytes: bytes,
	    filename: str,
	    *,
	    model: str = "gemini-2.0-flash-exp",
	) -> str:
	    """Transcribe an audio clip to text using the Gemini API.

	    The audio is base64-encoded and sent inline, together with a short
	    transcription prompt, through ``client.models.generate_content``. That
	    call is synchronous, so it is run in the default executor to keep the
	    event loop free while the request is in flight.

	    Args:
	        audio_bytes: Raw audio file bytes. Must not be empty.
	        filename: Name of the audio file, used only to guess the MIME type.
	            When the type cannot be guessed, ``audio/wav`` is assumed.
	        model: Gemini model identifier passed to ``generate_content``.

	    Returns:
	        The transcribed text with surrounding whitespace stripped.

	    Raises:
	        ValueError: If ``audio_bytes`` is empty.
	        Exception: If the client call fails or the response carries no
	            text. The underlying error is chained as ``__cause__``.
	    """
	    # Fail fast on empty input instead of spending an API round trip on it.
	    if not audio_bytes:
	        raise ValueError("Speech-to-text conversion failed: audio data is empty")

	    try:
	        client = get_gemini_client()

	        # Detect MIME type from filename; tolerate a missing filename.
	        mime_type, _ = mimetypes.guess_type(filename or "")
	        if mime_type is None:
	            mime_type = "audio/wav"  # fallback

	        logger.info(f"Converting audio to text (format: {mime_type})")

	        # Inline audio payload is sent base64-encoded as text.
	        audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

	        # One content entry with two parts: the audio and the instruction.
	        contents = [
	            {
	                "parts": [
	                    {
	                        "inline_data": {
	                            "mime_type": mime_type,
	                            "data": audio_b64,
	                        }
	                    },
	                    {
	                        "text": "Transcribe this audio to text."
	                    },
	                ]
	            }
	        ]

	        def _generate():
	            # Blocking network call; executed off the event loop below.
	            return client.models.generate_content(
	                model=model,
	                contents=contents,
	            )

	        loop = asyncio.get_running_loop()
	        response = await loop.run_in_executor(None, _generate)

	        # Guard against a response without text so the caller gets a clear
	        # error instead of an AttributeError on None.
	        raw_text = response.text
	        if raw_text is None:
	            raise ValueError("model returned no text")

	        transcribed_text = raw_text.strip()
	        # NOTE(review): this logs the full transcript at INFO level --
	        # confirm that is acceptable for user audio.
	        logger.info(f"✓ STT successful: '{transcribed_text}'")

	        return transcribed_text

	    except Exception as exc:
	        # Log with traceback, then re-raise with the cause preserved.
	        logger.exception("✗ STT failed: %s", exc)
	        raise Exception(f"Speech-to-text conversion failed: {exc}") from exc