Spaces:

abhinav0231
/

SparrowTale

Sleeping

App Files Files Community

SparrowTale / audio_transcription.py

abhinav0231

Upload 13 files

db06c0a verified 8 months ago

raw

history blame contribute delete

4.33 kB

	import os
	import logging
	from typing import Optional, Dict, Any
	from sarvamai import SarvamAI
	import streamlit as st

	class LanguageAgnosticSarvamSTT:
	def __init__(self, api_subscription_key: str):
	if not api_subscription_key:
	raise ValueError("Sarvam API key is missing.")
	self.client = SarvamAI(api_subscription_key=api_subscription_key)

	# Common Indian languages supported by Sarvam API
	self.supported_languages = [
	"en-IN", "hi-IN", "gu-IN", "mr-IN", "bn-IN",
	"ta-IN", "te-IN", "kn-IN", "ml-IN", "pa-IN"
	]

	def _extract_confidence(self, response) -> float:
	"""
	Estimates a confidence score from the API response.
	NOTE: This is an estimation because the Sarvam API might not provide a direct
	overall confidence score. A longer, non-empty transcript is considered more confident.
	"""
	try:
	if hasattr(response, 'confidence') and response.confidence:
	return response.confidence

	# Use response.text as it contains the transcribed string
	if hasattr(response, 'text') and response.text:
	transcript_length = len(response.text.strip())
	# Normalize length to a score between 0.1 and 0.9
	return min(0.9, 0.1 + (transcript_length / 150.0))
	else:
	return 0.0 # No text means zero confidence
	except:
	return 0.0

	def transcribe_with_auto_detection(
	self,
	audio_file_path: str,
	model: str = "saarika:v2.5",
	confidence_threshold: float = 0.7
	) -> Dict[str, Any]:
	"""
	Transcribes audio by trying multiple languages and picking the best result.
	"""
	best_result_text = ""
	best_confidence = 0.0
	best_language = None

	print(f"--- Starting auto-detection transcription for {os.path.basename(audio_file_path)} ---")

	for lang_code in self.supported_languages:
	try:
	with open(audio_file_path, "rb") as audio_file:
	response = self.client.speech_to_text.transcribe(
	file=audio_file,
	model=model,
	language_code=lang_code
	)

	confidence = self._extract_confidence(response)
	print(f"Attempted {lang_code}: Confidence (estimated) = {confidence:.2f}")

	if confidence > best_confidence:
	best_confidence = confidence
	best_result_text = response.text.strip()
	best_language = lang_code

	if confidence >= confidence_threshold:
	print(f"High confidence result found with {lang_code}. Stopping search.")
	break

	except Exception as e:
	logging.warning(f"Failed to transcribe with {lang_code}: {e}")
	continue

	return {
	"text": best_result_text,
	"detected_language": best_language,
	"confidence": best_confidence,
	}

	transcriber_agent = None
	try:
	api_key = st.secrets.get("SARVAM_API_KEY") or os.getenv("SARVAM_API_KEY")
	if api_key:
	transcriber_agent = LanguageAgnosticSarvamSTT(api_key)
	print("✅ Sarvam AI language-agnostic transcriber initialized.")
	else:
	print("⚠️ Warning: SARVAM_API_KEY not found. Transcription will not work.")
	except Exception as e:
	print(f"❌ Error initializing Sarvam transcriber: {e}")

	def transcribe_audio_with_auto_detect(file_path: str) -> Dict[str, Any]:
	"""
	Public function to access the transcription agent.
	"""
	if not transcriber_agent:
	return {"text": "Error: Transcriber agent not initialized.", "detected_language": None}

	if not os.path.exists(file_path):
	return {"text": f"Error: Audio file not found at '{file_path}'", "detected_language": None}

	return transcriber_agent.transcribe_with_auto_detection(audio_file_path=file_path)