# Spaces: Elvoro / Tools (Running) - File size: 9,975 Bytes

import json
import json5
from typing import List, Dict, Union
from google.cloud import speech_v1 as speech
from src.logger_config import logger
import os
from .gcs_utils import get_gcs_credentials
from src.config import get_config_value

class GoogleSTT:
    """Word-level speech-to-text backed by Google Cloud Speech-to-Text.

    Wraps a ``speech.SpeechClient`` and exposes
    :meth:`generate_timed_transcript`, which returns one dictionary per
    recognized word with its start/end offsets and confidence.
    """

    # Keys an entry must carry for an AI-corrected transcript to be accepted.
    _REQUIRED_WORD_KEYS = ("word", "start_time", "end_time")

    # Confidence stamped on every word of the canned test transcript.
    _DUMMY_CONFIDENCE = 0.8659737706184387

    # (word, start_time, end_time) rows of the canned transcript returned
    # when the TEST_AUTOMATION config value is truthy.
    _DUMMY_WORDS = (
        ("If", 0.2, 0.4),
        ("you're", 0.4, 0.5),
        ("creating", 0.5, 0.9),
        ("content", 0.9, 1.3),
        ("for", 1.3, 1.4),
        ("social", 1.4, 1.8),
        ("media,", 1.8, 2.3),
        ("you", 2.3, 2.4),
        ("need", 2.4, 2.7),
        ("b-roll", 2.7, 3.3),
        ("but", 3.4, 3.6),
        ("filming,", 3.6, 3.9),
        ("it", 3.9, 4.0),
        ("yourself", 4.0, 4.4),
        ("takes", 4.4, 4.7),
        ("forever", 4.7, 5.3),
        ("and", 5.5, 5.6),
        ("stock", 5.6, 6.0),
        ("sites", 6.0, 6.2),
        ("charge", 6.2, 6.6),
        ("$60", 6.6, 7.3),
        ("per", 7.3, 7.4),
        ("clip.", 7.4, 7.9),
        ("I", 8.1, 8.3),
        ("use", 8.3, 8.4),
        ("this", 8.4, 8.6),
        ("Library", 8.6, 9.0),
        ("instead", 9.0, 9.6),
        ("1,000", 9.9, 10.5),
        ("luxury", 10.5, 10.9),
        ("clips", 10.9, 11.4),
        ("for", 11.5, 11.6),
        ("$50", 11.6, 12.4),
        ("Link", 12.7, 13.1),
        ("Bio.", 13.1, 13.6),
    )

    def __init__(self, credentials=None):
        """Create the underlying Speech client.

        Args:
            credentials: Credentials object handed to ``speech.SpeechClient``.
                When falsy, credentials are fetched with
                ``get_gcs_credentials("final_data")``.
        """
        if not credentials:
            credentials = get_gcs_credentials("final_data")
        self.client = speech.SpeechClient(credentials=credentials)

    def generate_timed_transcript(self, audio_input: Union[str, bytes], verify_with_text: Union[str, None] = None) -> List[Dict]:
        """
        Generate timed transcript using Google Cloud Speech-to-Text.

        The audio is loaded (and validated) first; if the ``TEST_AUTOMATION``
        config value is truthy a canned transcript is returned without
        calling the Speech API. Otherwise the audio is transcribed and, when
        ``verify_with_text`` is given and differs from the transcript, the
        transcript is passed through an AI correction step.

        Args:
            audio_input: File path (str) or audio content (bytes).
            verify_with_text: Optional reference text. When provided and its
                alphanumeric content differs from the recognized words, the
                transcript is sent for correction against this text. Any
                failure in that step is logged and the uncorrected transcript
                is returned.

        Returns:
            List of dictionaries containing 'word', 'start_time', 'end_time', 'confidence'.

        Raises:
            FileNotFoundError: ``audio_input`` is a path that does not exist.
            ValueError: ``audio_input`` is neither ``str`` nor ``bytes``.
            Exception: Any error raised while transcribing is logged and
                re-raised unchanged.
        """
        try:
            content = self._load_audio(audio_input)

            logger.debug("🎤 Generating timed transcript...")

            if get_config_value("TEST_AUTOMATION"):
                logger.info("🧪 TEST_MODE: Skipping Google STT, returning dummy transcript")
                return self._dummy_transcript()

            words = self._recognize_words(content)

            logger.debug(f"Generated timed transcript: {len(words)} words")
            logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")

            if verify_with_text:
                words = self._verify_transcript(words, verify_with_text)

            return words

        except Exception as e:
            logger.error(f"❌ Speech-to-Text failed: {e}")
            raise

    @staticmethod
    def _load_audio(audio_input: Union[str, bytes]) -> bytes:
        """Return raw audio bytes from a file path or pass bytes through."""
        if isinstance(audio_input, str):
            if not os.path.exists(audio_input):
                raise FileNotFoundError(f"Audio file not found: {audio_input}")
            with open(audio_input, "rb") as audio_file:
                return audio_file.read()
        if isinstance(audio_input, bytes):
            return audio_input
        raise ValueError("audio_input must be a file path string or bytes.")

    @classmethod
    def _dummy_transcript(cls) -> List[Dict]:
        """Build a fresh copy of the canned transcript used in test mode."""
        return [
            {
                "word": word,
                "start_time": start_time,
                "end_time": end_time,
                "confidence": cls._DUMMY_CONFIDENCE,
            }
            for word, start_time, end_time in cls._DUMMY_WORDS
        ]

    def _recognize_words(self, content: bytes) -> List[Dict]:
        """Run long-running recognition and flatten the response into word dicts."""
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=24000,
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            model="video",
        )

        logger.debug("🔄 Transcribing audio with word-level timing...")
        operation = self.client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=90)

        words = []
        for result in response.results:
            for alternative in result.alternatives:
                # Confidence is reported per alternative, so every word of
                # the same alternative shares one value.
                for word_info in alternative.words:
                    words.append(
                        {
                            "word": word_info.word,
                            "start_time": word_info.start_time.total_seconds(),
                            "end_time": word_info.end_time.total_seconds(),
                            "confidence": alternative.confidence,
                        }
                    )
        return words

    def _verify_transcript(self, words: List[Dict], verify_with_text: str) -> List[Dict]:
        """Correct ``words`` against a reference text; fall back to ``words`` on any failure."""
        stt_text = "".join(w["word"] for w in words)
        if self._normalize_text(stt_text) == self._normalize_text(verify_with_text):
            logger.debug("STT transcript matches text (alphanumeric check), skipping verification.")
            return words

        logger.debug("Verifying transcript with text...")
        try:
            # Construct prompt for verification
            prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
            if not os.path.exists(prompt_path):
                logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
                return words

            # Explicit encoding: the locale default is platform dependent and
            # can fail to decode non-ASCII characters in the template.
            with open(prompt_path, "r", encoding="utf-8") as f:
                prompt_template = f.read()

            prompt = prompt_template.format(
                verify_with_text=verify_with_text,
                timed_words_json=json.dumps(words),
            )

            # Imported lazily, as in the original code path, so the module is
            # only loaded when verification actually runs.
            from . import ai_studio_sdk
            response_text = ai_studio_sdk.generate(prompt)

            if not response_text:
                logger.warning("⚠️ Verification failed (no response), keeping original transcript.")
                return words

            # Clean up response if it contains markdown code blocks
            clean_response = response_text.replace("```json", "").replace("```", "").strip()
            corrected_words = json5.loads(clean_response)

            if self._is_valid_transcript(corrected_words):
                logger.debug(f"Verified transcript: {len(corrected_words)} words")
                logger.debug(f"Verified Transcript:\n{json.dumps(corrected_words, indent=2)}")
                return corrected_words

            logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")

        except Exception as e:
            # Verification is best-effort: never let it break transcription.
            logger.error(f"⚠️ Transcript verification failed: {e}")

        return words

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Strip everything except ASCII letters and digits, then lowercase."""
        import re

        return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

    @classmethod
    def _is_valid_transcript(cls, candidate) -> bool:
        """Return True when ``candidate`` is a non-empty list of word dicts with timing keys."""
        if not isinstance(candidate, list) or not candidate:
            return False
        return all(
            isinstance(entry, dict)
            and all(key in entry for key in cls._REQUIRED_WORD_KEYS)
            for entry in candidate
        )