import json
import os
import re
from typing import Dict, List, Optional, Union

import json5
from google.cloud import speech_v1 as speech

from src.config import get_config_value
from src.logger_config import logger

from .gcs_utils import get_gcs_credentials

# Google STT returned the same confidence for every word of the canned sample,
# so a single shared constant replaces 35 copies of the literal.
_TEST_CONFIDENCE = 0.8659737706184387

# (word, start_time, end_time) triples for the dummy transcript returned when
# the TEST_AUTOMATION config flag is set — avoids a billable STT call in tests.
_TEST_WORDS = [
    ("If", 0.2, 0.4), ("you're", 0.4, 0.5), ("creating", 0.5, 0.9),
    ("content", 0.9, 1.3), ("for", 1.3, 1.4), ("social", 1.4, 1.8),
    ("media,", 1.8, 2.3), ("you", 2.3, 2.4), ("need", 2.4, 2.7),
    ("b-roll", 2.7, 3.3), ("but", 3.4, 3.6), ("filming,", 3.6, 3.9),
    ("it", 3.9, 4.0), ("yourself", 4.0, 4.4), ("takes", 4.4, 4.7),
    ("forever", 4.7, 5.3), ("and", 5.5, 5.6), ("stock", 5.6, 6.0),
    ("sites", 6.0, 6.2), ("charge", 6.2, 6.6), ("$60", 6.6, 7.3),
    ("per", 7.3, 7.4), ("clip.", 7.4, 7.9), ("I", 8.1, 8.3),
    ("use", 8.3, 8.4), ("this", 8.4, 8.6), ("Library", 8.6, 9.0),
    ("instead", 9.0, 9.6), ("1,000", 9.9, 10.5), ("luxury", 10.5, 10.9),
    ("clips", 10.9, 11.4), ("for", 11.5, 11.6), ("$50", 11.6, 12.4),
    ("Link", 12.7, 13.1), ("Bio.", 13.1, 13.6),
]


class GoogleSTT:
    """Wrapper around Google Cloud Speech-to-Text producing word-level timed
    transcripts, with an optional LLM-based verification pass against a known
    script text."""

    def __init__(self, credentials=None):
        """Create a SpeechClient.

        Args:
            credentials: Optional explicit credentials; when falsy, GCS
                credentials are fetched via ``get_gcs_credentials("final_data")``.
        """
        if not credentials:
            credentials = get_gcs_credentials("final_data")
        self.client = speech.SpeechClient(credentials=credentials)

    def generate_timed_transcript(
        self,
        audio_input: Union[str, bytes],
        verify_with_text: Optional[str] = None,
        timeout: float = 90,
    ) -> List[Dict]:
        """Generate a timed transcript using Google Cloud Speech-to-Text.

        Args:
            audio_input: File path (str) or raw audio content (bytes).
            verify_with_text: Optional known script text; when given and it
                does not match the STT output (alphanumeric comparison), an
                LLM verification pass may correct the transcript.
            timeout: Seconds to wait for the long-running STT operation
                (default 90, matching the previous hard-coded value).

        Returns:
            List of dicts with 'word', 'start_time', 'end_time', 'confidence'.

        Raises:
            FileNotFoundError: If ``audio_input`` is a path that does not exist.
            ValueError: If ``audio_input`` is neither str nor bytes.
        """
        try:
            content = self._load_audio(audio_input)

            logger.debug("🎤 Generating timed transcript...")

            if get_config_value("TEST_AUTOMATION"):
                logger.info("🧪 TEST_MODE: Skipping Google STT, returning dummy transcript")
                # Build fresh dicts on every call so callers may mutate the result.
                return [
                    {"word": w, "start_time": s, "end_time": e, "confidence": _TEST_CONFIDENCE}
                    for w, s, e in _TEST_WORDS
                ]

            words = self._transcribe(content, timeout)

            logger.debug(f"Generated timed transcript: {len(words)} words")
            logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")

            if verify_with_text:
                words = self._verify(words, verify_with_text)

            return words

        except Exception as e:
            logger.error(f"❌ Speech-to-Text failed: {e}")
            raise

    @staticmethod
    def _load_audio(audio_input: Union[str, bytes]) -> bytes:
        """Return raw audio bytes from a file path, or pass bytes through."""
        if isinstance(audio_input, str):
            if not os.path.exists(audio_input):
                raise FileNotFoundError(f"Audio file not found: {audio_input}")
            with open(audio_input, "rb") as audio_file:
                return audio_file.read()
        if isinstance(audio_input, bytes):
            return audio_input
        raise ValueError("audio_input must be a file path string or bytes.")

    def _transcribe(self, content: bytes, timeout: float) -> List[Dict]:
        """Run long-running recognition and flatten word timings into dicts.

        NOTE(review): iterates every alternative of every result; typically
        only alternatives[0] is wanted — preserved as-is from the original.
        """
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=24000,
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            model="video",
        )

        logger.debug("🔄 Transcribing audio with word-level timing...")
        operation = self.client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=timeout)

        words = []
        for result in response.results:
            for alternative in result.alternatives:
                for word_info in alternative.words:
                    words.append(
                        {
                            "word": word_info.word,
                            "start_time": word_info.start_time.total_seconds(),
                            "end_time": word_info.end_time.total_seconds(),
                            "confidence": alternative.confidence,
                        }
                    )
        return words

    @staticmethod
    def _verify(words: List[Dict], verify_with_text: str) -> List[Dict]:
        """Cross-check STT words against the expected script text.

        If the alphanumeric-normalized texts already match, verification is
        skipped. Otherwise an LLM is asked to correct the transcript; on any
        failure the original ``words`` are returned unchanged (best-effort).
        """

        def normalize_text(text):
            # Strip everything but letters/digits so punctuation and spacing
            # differences don't trigger a needless LLM call.
            return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

        stt_text = "".join([w["word"] for w in words])
        if normalize_text(stt_text) == normalize_text(verify_with_text):
            logger.debug("STT transcript matches text (alphanumeric check), skipping verification.")
            return words

        logger.debug("Verifying transcript with text...")
        try:
            # Construct prompt for verification
            prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
            if not os.path.exists(prompt_path):
                logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
                return words

            with open(prompt_path, "r") as f:
                prompt_template = f.read()
            prompt = prompt_template.format(
                verify_with_text=verify_with_text,
                timed_words_json=json.dumps(words),
            )

            # Local import kept deliberately: defers loading the SDK until needed.
            from . import ai_studio_sdk
            response_text = ai_studio_sdk.generate(prompt)
            if not response_text:
                logger.warning("⚠️ Verification failed (no response), keeping original transcript.")
                return words

            # The model sometimes wraps its JSON in markdown code fences; strip them.
            clean_response = response_text.replace("```json", "").replace("```", "").strip()
            corrected_words = json5.loads(clean_response)

            # Basic validation
            if isinstance(corrected_words, list) and len(corrected_words) > 0:
                logger.debug(f"Verified transcript: {len(corrected_words)} words")
                logger.debug(f"Verified Transcript:\n{json.dumps(corrected_words, indent=2)}")
                return corrected_words

            logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")
        except Exception as e:
            logger.error(f"⚠️ Transcript verification failed: {e}")
        return words