|
|
import json |
|
|
import json5 |
|
|
from typing import List, Dict, Union |
|
|
from google.cloud import speech_v1 as speech |
|
|
from src.logger_config import logger |
|
|
import os |
|
|
from .gcs_utils import get_gcs_credentials |
|
|
from src.config import get_config_value |
|
|
|
|
|
class GoogleSTT:
    """Wrapper around Google Cloud Speech-to-Text that produces word-level
    timed transcripts, with an optional LLM-based verification pass."""

    def __init__(self, credentials=None):
        # Fall back to the project's GCS credentials when none are supplied.
        # NOTE(review): `if not credentials` also triggers the fallback for
        # falsy-but-non-None credential objects — presumably only None is ever
        # passed; confirm if custom credential types are introduced.
        if not credentials:
            credentials = get_gcs_credentials("final_data")
        self.client = speech.SpeechClient(credentials=credentials)

    def generate_timed_transcript(self, audio_input: Union[str, bytes], verify_with_text: Union[str, None] = None) -> List[Dict]:
        """
        Generate timed transcript using Google Cloud Speech-to-Text.

        Args:
            audio_input: File path (str) or audio content (bytes).
            verify_with_text: Optional reference text. If given and it does not
                already match the STT output (compared alphanumeric-only,
                case-insensitive), the transcript is sent through the
                "prompt/stt_verification.md" prompt via ai_studio_sdk for
                correction. On any verification failure the original
                (unverified) transcript is returned.

        Returns:
            List of dictionaries containing 'word', 'start_time', 'end_time', 'confidence'.

        Raises:
            FileNotFoundError: If audio_input is a path that does not exist.
            ValueError: If audio_input is neither str nor bytes.
            Exception: Any other STT failure is logged and re-raised.
        """
        try:
            content = None
            # Accept either a filesystem path or raw audio bytes.
            if isinstance(audio_input, str):
                if os.path.exists(audio_input):
                    with open(audio_input, "rb") as audio_file:
                        content = audio_file.read()
                else:
                    raise FileNotFoundError(f"Audio file not found: {audio_input}")
            elif isinstance(audio_input, bytes):
                content = audio_input
            else:
                raise ValueError("audio_input must be a file path string or bytes.")

            logger.debug("🎤 Generating timed transcript...")

            # Test-automation short-circuit: skip the (billed, slow) STT call
            # and return a canned transcript with realistic word timings.
            # Note this also bypasses the verify_with_text path below.
            if get_config_value("TEST_AUTOMATION"):
                logger.info("🧪 TEST_MODE: Skipping Google STT, returning dummy transcript")
                return [
                    { "word": "If", "start_time": 0.2, "end_time": 0.4, "confidence": 0.8659737706184387},
                    { "word": "you're", "start_time": 0.4, "end_time": 0.5, "confidence": 0.8659737706184387},
                    { "word": "creating", "start_time": 0.5, "end_time": 0.9, "confidence": 0.8659737706184387},
                    { "word": "content", "start_time": 0.9, "end_time": 1.3, "confidence": 0.8659737706184387},
                    { "word": "for", "start_time": 1.3, "end_time": 1.4, "confidence": 0.8659737706184387},
                    { "word": "social", "start_time": 1.4, "end_time": 1.8, "confidence": 0.8659737706184387},
                    { "word": "media,", "start_time": 1.8, "end_time": 2.3, "confidence": 0.8659737706184387},
                    { "word": "you", "start_time": 2.3, "end_time": 2.4, "confidence": 0.8659737706184387},
                    { "word": "need", "start_time": 2.4, "end_time": 2.7, "confidence": 0.8659737706184387},
                    { "word": "b-roll", "start_time": 2.7, "end_time": 3.3, "confidence": 0.8659737706184387},
                    { "word": "but", "start_time": 3.4, "end_time": 3.6, "confidence": 0.8659737706184387},
                    { "word": "filming,", "start_time": 3.6, "end_time": 3.9, "confidence": 0.8659737706184387},
                    { "word": "it", "start_time": 3.9, "end_time": 4.0, "confidence": 0.8659737706184387},
                    { "word": "yourself", "start_time": 4.0, "end_time": 4.4, "confidence": 0.8659737706184387},
                    { "word": "takes", "start_time": 4.4, "end_time": 4.7, "confidence": 0.8659737706184387},
                    { "word": "forever", "start_time": 4.7, "end_time": 5.3, "confidence": 0.8659737706184387},
                    { "word": "and", "start_time": 5.5, "end_time": 5.6, "confidence": 0.8659737706184387},
                    { "word": "stock", "start_time": 5.6, "end_time": 6.0, "confidence": 0.8659737706184387},
                    { "word": "sites", "start_time": 6.0, "end_time": 6.2, "confidence": 0.8659737706184387},
                    { "word": "charge", "start_time": 6.2, "end_time": 6.6, "confidence": 0.8659737706184387},
                    { "word": "$60", "start_time": 6.6, "end_time": 7.3, "confidence": 0.8659737706184387},
                    { "word": "per", "start_time": 7.3, "end_time": 7.4, "confidence": 0.8659737706184387},
                    { "word": "clip.", "start_time": 7.4, "end_time": 7.9, "confidence": 0.8659737706184387},
                    { "word": "I", "start_time": 8.1, "end_time": 8.3, "confidence": 0.8659737706184387},
                    { "word": "use", "start_time": 8.3, "end_time": 8.4, "confidence": 0.8659737706184387},
                    { "word": "this", "start_time": 8.4, "end_time": 8.6, "confidence": 0.8659737706184387},
                    { "word": "Library", "start_time": 8.6, "end_time": 9.0, "confidence": 0.8659737706184387},
                    { "word": "instead", "start_time": 9.0, "end_time": 9.6, "confidence": 0.8659737706184387},
                    { "word": "1,000", "start_time": 9.9, "end_time": 10.5, "confidence": 0.8659737706184387},
                    { "word": "luxury", "start_time": 10.5, "end_time": 10.9, "confidence": 0.8659737706184387},
                    { "word": "clips", "start_time": 10.9, "end_time": 11.4, "confidence": 0.8659737706184387},
                    { "word": "for", "start_time": 11.5, "end_time": 11.6, "confidence": 0.8659737706184387},
                    { "word": "$50", "start_time": 11.6, "end_time": 12.4, "confidence": 0.8659737706184387},
                    { "word": "Link", "start_time": 12.7, "end_time": 13.1, "confidence": 0.8659737706184387},
                    { "word": "Bio.", "start_time": 13.1, "end_time": 13.6, "confidence": 0.8659737706184387}
                ]

            audio = speech.RecognitionAudio(content=content)
            # NOTE(review): encoding/sample rate are hard-coded (MP3 @ 24 kHz) —
            # assumes upstream audio is always produced in this format; confirm
            # against the audio-generation pipeline.
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.MP3,
                sample_rate_hertz=24000,
                language_code="en-US",
                enable_automatic_punctuation=True,
                enable_word_time_offsets=True,  # required for per-word start/end times
                model="video",
            )

            logger.debug("🔄 Transcribing audio with word-level timing...")
            operation = self.client.long_running_recognize(config=config, audio=audio)
            # Blocks up to 90 s waiting for the long-running operation;
            # raises if it does not finish in time.
            response = operation.result(timeout=90)

            # Flatten results into one list of timed word dicts. Iterates every
            # alternative; with the default max_alternatives there is one per
            # result, so no duplication in practice.
            words = []
            for result in response.results:
                for alternative in result.alternatives:
                    for word_info in alternative.words:
                        words.append(
                            {
                                "word": word_info.word,
                                "start_time": word_info.start_time.total_seconds(),
                                "end_time": word_info.end_time.total_seconds(),
                                # Confidence is per-alternative, not per-word, so
                                # the same value repeats for each word.
                                "confidence": alternative.confidence,
                            }
                        )

            logger.debug(f"Generated timed transcript: {len(words)} words")
            logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")

            # Optional verification pass against the expected script text.
            if verify_with_text:
                import re

                def normalize_text(text):
                    # Compare alphanumeric content only, case-insensitively, so
                    # punctuation/whitespace differences don't force an LLM call.
                    return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

                stt_text = "".join([w["word"] for w in words])
                normalized_stt = normalize_text(stt_text)
                normalized_verify = normalize_text(verify_with_text)

                if normalized_stt == normalized_verify:
                    logger.debug("STT transcript matches text (alphanumeric check), skipping verification.")
                else:
                    logger.debug("Verifying transcript with text...")
                    # Best-effort verification: any failure below is logged and
                    # the unverified transcript is returned instead of raising.
                    try:
                        # Prompt template lives alongside this package.
                        prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
                        if os.path.exists(prompt_path):
                            with open(prompt_path, "r") as f:
                                prompt_template = f.read()

                            prompt = prompt_template.format(
                                verify_with_text=verify_with_text,
                                timed_words_json=json.dumps(words)
                            )
                        else:
                            logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
                            return words

                        # Imported lazily to avoid a hard dependency when
                        # verification is not requested.
                        from . import ai_studio_sdk
                        response_text = ai_studio_sdk.generate(prompt)

                        if response_text:
                            # Strip markdown code fences; json5 tolerates minor
                            # formatting deviations in the LLM output.
                            clean_response = response_text.replace("```json", "").replace("```", "").strip()
                            corrected_words = json5.loads(clean_response)

                            # Only adopt the corrected transcript if it is a
                            # non-empty list; otherwise keep the original.
                            if isinstance(corrected_words, list) and len(corrected_words) > 0:
                                logger.debug(f"Verified transcript: {len(corrected_words)} words")
                                logger.debug(f"Verified Transcript:\n{json.dumps(corrected_words, indent=2)}")
                                words = corrected_words
                            else:
                                logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")
                        else:
                            logger.warning("⚠️ Verification failed (no response), keeping original transcript.")

                    except Exception as e:
                        logger.error(f"⚠️ Transcript verification failed: {e}")

            return words

        except Exception as e:
            # Log and re-raise so callers can handle/abort; nothing is swallowed.
            logger.error(f"❌ Speech-to-Text failed: {e}")
            raise
|
|
|