File size: 9,975 Bytes
15ffb7b
7bfc215
1a31454
 
f20025d
1a31454
 
5f00d5a
1a31454
 
 
 
 
 
 
7bfc215
1a31454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175adcc
1a31454
5f00d5a
 
 
175adcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20ff2b7
40c905b
5f00d5a
 
1a31454
 
 
 
 
 
 
 
 
 
175adcc
1a31454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503d4ac
175adcc
7bfc215
 
62e3d70
7bfc215
62e3d70
 
 
 
 
 
 
 
 
503d4ac
62e3d70
503d4ac
62e3d70
 
 
 
 
 
 
 
 
 
 
 
 
 
7bfc215
62e3d70
 
7bfc215
62e3d70
 
 
 
 
 
 
503d4ac
62e3d70
 
 
 
7bfc215
62e3d70
 
 
 
7bfc215
1a31454
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import json
import json5
from typing import List, Dict, Union
from google.cloud import speech_v1 as speech
from src.logger_config import logger
import os
from .gcs_utils import get_gcs_credentials
from src.config import get_config_value

class GoogleSTT:
    """Word-level speech-to-text on top of the Google Cloud Speech v1 client.

    The public entry point is :meth:`generate_timed_transcript`; everything
    prefixed with an underscore is an internal helper.
    """

    # Seconds to wait for the long-running recognize operation to complete.
    _RECOGNITION_TIMEOUT_SECONDS = 90

    # Confidence value attached to every word of the canned test transcript.
    _DUMMY_CONFIDENCE = 0.8659737706184387

    # Canned transcript returned when TEST_AUTOMATION is enabled, stored as
    # (word, start_time, end_time) and expanded to dicts on demand.
    _DUMMY_WORDS = (
        ("If", 0.2, 0.4),
        ("you're", 0.4, 0.5),
        ("creating", 0.5, 0.9),
        ("content", 0.9, 1.3),
        ("for", 1.3, 1.4),
        ("social", 1.4, 1.8),
        ("media,", 1.8, 2.3),
        ("you", 2.3, 2.4),
        ("need", 2.4, 2.7),
        ("b-roll", 2.7, 3.3),
        ("but", 3.4, 3.6),
        ("filming,", 3.6, 3.9),
        ("it", 3.9, 4.0),
        ("yourself", 4.0, 4.4),
        ("takes", 4.4, 4.7),
        ("forever", 4.7, 5.3),
        ("and", 5.5, 5.6),
        ("stock", 5.6, 6.0),
        ("sites", 6.0, 6.2),
        ("charge", 6.2, 6.6),
        ("$60", 6.6, 7.3),
        ("per", 7.3, 7.4),
        ("clip.", 7.4, 7.9),
        ("I", 8.1, 8.3),
        ("use", 8.3, 8.4),
        ("this", 8.4, 8.6),
        ("Library", 8.6, 9.0),
        ("instead", 9.0, 9.6),
        ("1,000", 9.9, 10.5),
        ("luxury", 10.5, 10.9),
        ("clips", 10.9, 11.4),
        ("for", 11.5, 11.6),
        ("$50", 11.6, 12.4),
        ("Link", 12.7, 13.1),
        ("Bio.", 13.1, 13.6),
    )

    def __init__(self, credentials=None):
        """Create the Speech client.

        Args:
            credentials: Credentials object handed to ``speech.SpeechClient``.
                When falsy, credentials are fetched with
                ``get_gcs_credentials("final_data")``.
        """
        if not credentials:
            credentials = get_gcs_credentials("final_data")
        self.client = speech.SpeechClient(credentials=credentials)

    def generate_timed_transcript(
        self,
        audio_input: Union[str, bytes],
        verify_with_text: Union[str, None] = None,
    ) -> List[Dict]:
        """
        Generate timed transcript using Google Cloud Speech-to-Text.

        Args:
            audio_input: File path (str) or audio content (bytes).
            verify_with_text: Optional reference text. When given and the
                recognized words differ from it (compared case-insensitively
                on alphanumeric characters only), the transcript is sent to
                the AI verification step, which may return a corrected word
                list. Verification is best-effort: on any failure the
                recognized transcript is returned unchanged.

        Returns:
            List of dictionaries containing 'word', 'start_time', 'end_time', 'confidence'.
            (A verified transcript is guaranteed to carry 'word',
            'start_time' and 'end_time'; other keys are whatever the
            verification step returned.)

        Raises:
            FileNotFoundError: ``audio_input`` is a string that is not an
                existing path.
            ValueError: ``audio_input`` is neither ``str`` nor ``bytes``.
            Exception: Any error raised by the recognition request is logged
                and re-raised unchanged.
        """
        try:
            content = self._load_audio(audio_input)

            logger.debug("🎤 Generating timed transcript...")

            # The input is validated above even in test mode; only the remote
            # call is skipped.
            if get_config_value("TEST_AUTOMATION"):
                logger.info("🧪 TEST_MODE: Skipping Google STT, returning dummy transcript")
                return self._dummy_transcript()

            words = self._recognize(content)

            logger.debug(f"Generated timed transcript: {len(words)} words")
            logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")

            if verify_with_text:
                words = self._verify_transcript(words, verify_with_text)

            return words

        except Exception as e:
            logger.error(f"❌ Speech-to-Text failed: {e}")
            raise

    @staticmethod
    def _load_audio(audio_input: Union[str, bytes]) -> bytes:
        """Return raw audio bytes from a file path or pass bytes through."""
        if isinstance(audio_input, str):
            if not os.path.exists(audio_input):
                raise FileNotFoundError(f"Audio file not found: {audio_input}")
            with open(audio_input, "rb") as audio_file:
                return audio_file.read()
        if isinstance(audio_input, bytes):
            return audio_input
        raise ValueError("audio_input must be a file path string or bytes.")

    @classmethod
    def _dummy_transcript(cls) -> List[Dict]:
        """Build a fresh copy of the canned transcript used in test mode."""
        # New dicts on every call so callers can mutate the result freely.
        return [
            {
                "word": word,
                "start_time": start_time,
                "end_time": end_time,
                "confidence": cls._DUMMY_CONFIDENCE,
            }
            for word, start_time, end_time in cls._DUMMY_WORDS
        ]

    def _recognize(self, content: bytes) -> List[Dict]:
        """Run long-running recognition and flatten the result into word dicts."""
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=24000,
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            model="video",
        )

        logger.debug("🔄 Transcribing audio with word-level timing...")
        operation = self.client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=self._RECOGNITION_TIMEOUT_SECONDS)

        # Confidence is reported per alternative, so every word of an
        # alternative shares the same value.
        return [
            {
                "word": word_info.word,
                "start_time": word_info.start_time.total_seconds(),
                "end_time": word_info.end_time.total_seconds(),
                "confidence": alternative.confidence,
            }
            for result in response.results
            for alternative in result.alternatives
            for word_info in alternative.words
        ]

    @staticmethod
    def _normalize_text(text: str) -> str:
        """Lowercase ``text`` and drop everything except ASCII letters/digits."""
        import re

        return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

    @staticmethod
    def _is_valid_transcript(candidate) -> bool:
        """Check that ``candidate`` is a non-empty list of timed word dicts."""
        if not isinstance(candidate, list) or not candidate:
            return False
        for entry in candidate:
            if not isinstance(entry, dict):
                return False
            if not isinstance(entry.get("word"), str):
                return False
            for key in ("start_time", "end_time"):
                value = entry.get(key)
                # bool is a subclass of int; it is never a valid timestamp.
                if isinstance(value, bool) or not isinstance(value, (int, float)):
                    return False
        return True

    def _verify_transcript(self, words: List[Dict], verify_with_text: str) -> List[Dict]:
        """Return ``words``, or an AI-corrected version checked against the text.

        Any failure inside the verification step is logged and the input
        ``words`` are returned unchanged.
        """
        stt_text = "".join([w["word"] for w in words])
        normalized_stt = self._normalize_text(stt_text)
        normalized_verify = self._normalize_text(verify_with_text)

        if normalized_stt == normalized_verify:
            logger.debug("STT transcript matches text (alphanumeric check), skipping verification.")
            return words

        logger.debug("Verifying transcript with text...")
        try:
            # Construct prompt for verification
            prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
            if not os.path.exists(prompt_path):
                logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
                return words

            # Explicit encoding: without it the template would be decoded
            # with the platform's locale default.
            with open(prompt_path, "r", encoding="utf-8") as f:
                prompt_template = f.read()

            prompt = prompt_template.format(
                verify_with_text=verify_with_text,
                timed_words_json=json.dumps(words)
            )

            from . import ai_studio_sdk
            response_text = ai_studio_sdk.generate(prompt)

            if not response_text:
                logger.warning("⚠️ Verification failed (no response), keeping original transcript.")
                return words

            # Clean up response if it contains markdown code blocks
            clean_response = response_text.replace("```json", "").replace("```", "").strip()
            corrected_words = json5.loads(clean_response)

            # Only accept output that keeps the timed-word structure;
            # anything else would corrupt the return value for callers.
            if self._is_valid_transcript(corrected_words):
                logger.debug(f"Verified transcript: {len(corrected_words)} words")
                logger.debug(f"Verified Transcript:\n{json.dumps(corrected_words, indent=2)}")
                return corrected_words

            logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")

        except Exception as e:
            # Verification is best-effort: never fail the whole transcription.
            logger.error(f"⚠️ Transcript verification failed: {e}")

        return words