File size: 9,975 Bytes
15ffb7b 7bfc215 1a31454 f20025d 1a31454 5f00d5a 1a31454 7bfc215 1a31454 175adcc 1a31454 5f00d5a 175adcc 20ff2b7 40c905b 5f00d5a 1a31454 175adcc 1a31454 503d4ac 175adcc 7bfc215 62e3d70 7bfc215 62e3d70 503d4ac 62e3d70 503d4ac 62e3d70 7bfc215 62e3d70 7bfc215 62e3d70 503d4ac 62e3d70 7bfc215 62e3d70 7bfc215 1a31454 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import json
import json5
from typing import List, Dict, Union
from google.cloud import speech_v1 as speech
from src.logger_config import logger
import os
from .gcs_utils import get_gcs_credentials
from src.config import get_config_value
class GoogleSTT:
    """Word-level timed transcription via Google Cloud Speech-to-Text.

    Wraps a ``speech.SpeechClient`` and produces a list of per-word timing
    dicts with keys ``word``, ``start_time``, ``end_time``, ``confidence``.
    """

    def __init__(self, credentials=None):
        """Create the STT client.

        Args:
            credentials: Google credentials object. When falsy, falls back to
                the project's GCS service-account credentials ("final_data").
        """
        if not credentials:
            credentials = get_gcs_credentials("final_data")
        self.client = speech.SpeechClient(credentials=credentials)

    def generate_timed_transcript(self, audio_input: Union[str, bytes], verify_with_text: Union[str, None] = None) -> List[Dict]:
        """
        Generate timed transcript using Google Cloud Speech-to-Text.

        Args:
            audio_input: File path (str) or audio content (bytes).
            verify_with_text: Optional reference text. When the STT output
                differs from it (alphanumeric-only comparison), an LLM-based
                verification pass may correct the transcript; verification is
                best-effort and never raises.

        Returns:
            List of dictionaries containing 'word', 'start_time', 'end_time',
            'confidence'.

        Raises:
            FileNotFoundError: If ``audio_input`` is a path that does not exist.
            ValueError: If ``audio_input`` is neither str nor bytes.
        """
        try:
            content = self._read_audio(audio_input)
            logger.debug("🎤 Generating timed transcript...")

            # Test mode: skip the (slow, billed) STT call entirely.
            if get_config_value("TEST_AUTOMATION"):
                logger.info("🧪 TEST_MODE: Skipping Google STT, returning dummy transcript")
                return self._dummy_transcript()

            words = self._transcribe(content)
            logger.debug(f"Generated timed transcript: {len(words)} words")
            logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")

            if verify_with_text:
                words = self._verify_transcript(words, verify_with_text)
            return words
        except Exception as e:
            logger.error(f"❌ Speech-to-Text failed: {e}")
            raise

    @staticmethod
    def _read_audio(audio_input: Union[str, bytes]) -> bytes:
        """Return raw audio bytes from a file path, or pass bytes through.

        Raises:
            FileNotFoundError: If a path is given but does not exist.
            ValueError: If ``audio_input`` is neither str nor bytes.
        """
        if isinstance(audio_input, str):
            if not os.path.exists(audio_input):
                raise FileNotFoundError(f"Audio file not found: {audio_input}")
            with open(audio_input, "rb") as audio_file:
                return audio_file.read()
        if isinstance(audio_input, bytes):
            return audio_input
        raise ValueError("audio_input must be a file path string or bytes.")

    def _transcribe(self, content: bytes) -> List[Dict]:
        """Run long-running recognition and flatten word-level timings.

        Assumes MP3 input at 24 kHz — TODO confirm against the audio producer.
        """
        audio = speech.RecognitionAudio(content=content)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.MP3,
            sample_rate_hertz=24000,
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            model="video",
        )
        logger.debug("🔄 Transcribing audio with word-level timing...")
        operation = self.client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=90)

        words: List[Dict] = []
        for result in response.results:
            for alternative in result.alternatives:
                for word_info in alternative.words:
                    words.append(
                        {
                            "word": word_info.word,
                            "start_time": word_info.start_time.total_seconds(),
                            "end_time": word_info.end_time.total_seconds(),
                            # Confidence is per-alternative, not per-word.
                            "confidence": alternative.confidence,
                        }
                    )
        return words

    def _verify_transcript(self, words: List[Dict], verify_with_text: str) -> List[Dict]:
        """Correct ``words`` against reference text via an LLM, best-effort.

        Returns the corrected word list, or the original ``words`` on any
        failure (missing prompt file, empty LLM response, bad JSON, etc.).
        """
        import re

        def normalize_text(text: str) -> str:
            # Compare alphanumeric content only: punctuation and whitespace
            # legitimately differ between STT output and the reference text.
            return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

        stt_text = "".join(w["word"] for w in words)
        if normalize_text(stt_text) == normalize_text(verify_with_text):
            logger.debug("STT transcript matches text (alphanumeric check), skipping verification.")
            return words

        logger.debug("Verifying transcript with text...")
        try:
            # Construct prompt for verification
            prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
            if not os.path.exists(prompt_path):
                logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
                return words
            with open(prompt_path, "r") as f:
                prompt_template = f.read()
            prompt = prompt_template.format(
                verify_with_text=verify_with_text,
                timed_words_json=json.dumps(words)
            )

            from . import ai_studio_sdk
            response_text = ai_studio_sdk.generate(prompt)
            if not response_text:
                logger.warning("⚠️ Verification failed (no response), keeping original transcript.")
                return words

            # LLMs often wrap JSON in markdown code fences; strip before parsing.
            clean_response = response_text.replace("```json", "").replace("```", "").strip()
            corrected_words = json5.loads(clean_response)
            # Basic validation: must be a non-empty list.
            if isinstance(corrected_words, list) and len(corrected_words) > 0:
                logger.debug(f"Verified transcript: {len(corrected_words)} words")
                logger.debug(f"Verified Transcript:\n{json.dumps(corrected_words, indent=2)}")
                return corrected_words
            logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")
            return words
        except Exception as e:
            logger.error(f"⚠️ Transcript verification failed: {e}")
            return words

    @staticmethod
    def _dummy_transcript() -> List[Dict]:
        """Return a fresh canned transcript used when TEST_AUTOMATION is set."""
        return [
            {"word": "If", "start_time": 0.2, "end_time": 0.4, "confidence": 0.8659737706184387},
            {"word": "you're", "start_time": 0.4, "end_time": 0.5, "confidence": 0.8659737706184387},
            {"word": "creating", "start_time": 0.5, "end_time": 0.9, "confidence": 0.8659737706184387},
            {"word": "content", "start_time": 0.9, "end_time": 1.3, "confidence": 0.8659737706184387},
            {"word": "for", "start_time": 1.3, "end_time": 1.4, "confidence": 0.8659737706184387},
            {"word": "social", "start_time": 1.4, "end_time": 1.8, "confidence": 0.8659737706184387},
            {"word": "media,", "start_time": 1.8, "end_time": 2.3, "confidence": 0.8659737706184387},
            {"word": "you", "start_time": 2.3, "end_time": 2.4, "confidence": 0.8659737706184387},
            {"word": "need", "start_time": 2.4, "end_time": 2.7, "confidence": 0.8659737706184387},
            {"word": "b-roll", "start_time": 2.7, "end_time": 3.3, "confidence": 0.8659737706184387},
            {"word": "but", "start_time": 3.4, "end_time": 3.6, "confidence": 0.8659737706184387},
            {"word": "filming,", "start_time": 3.6, "end_time": 3.9, "confidence": 0.8659737706184387},
            {"word": "it", "start_time": 3.9, "end_time": 4.0, "confidence": 0.8659737706184387},
            {"word": "yourself", "start_time": 4.0, "end_time": 4.4, "confidence": 0.8659737706184387},
            {"word": "takes", "start_time": 4.4, "end_time": 4.7, "confidence": 0.8659737706184387},
            {"word": "forever", "start_time": 4.7, "end_time": 5.3, "confidence": 0.8659737706184387},
            {"word": "and", "start_time": 5.5, "end_time": 5.6, "confidence": 0.8659737706184387},
            {"word": "stock", "start_time": 5.6, "end_time": 6.0, "confidence": 0.8659737706184387},
            {"word": "sites", "start_time": 6.0, "end_time": 6.2, "confidence": 0.8659737706184387},
            {"word": "charge", "start_time": 6.2, "end_time": 6.6, "confidence": 0.8659737706184387},
            {"word": "$60", "start_time": 6.6, "end_time": 7.3, "confidence": 0.8659737706184387},
            {"word": "per", "start_time": 7.3, "end_time": 7.4, "confidence": 0.8659737706184387},
            {"word": "clip.", "start_time": 7.4, "end_time": 7.9, "confidence": 0.8659737706184387},
            {"word": "I", "start_time": 8.1, "end_time": 8.3, "confidence": 0.8659737706184387},
            {"word": "use", "start_time": 8.3, "end_time": 8.4, "confidence": 0.8659737706184387},
            {"word": "this", "start_time": 8.4, "end_time": 8.6, "confidence": 0.8659737706184387},
            {"word": "Library", "start_time": 8.6, "end_time": 9.0, "confidence": 0.8659737706184387},
            {"word": "instead", "start_time": 9.0, "end_time": 9.6, "confidence": 0.8659737706184387},
            {"word": "1,000", "start_time": 9.9, "end_time": 10.5, "confidence": 0.8659737706184387},
            {"word": "luxury", "start_time": 10.5, "end_time": 10.9, "confidence": 0.8659737706184387},
            {"word": "clips", "start_time": 10.9, "end_time": 11.4, "confidence": 0.8659737706184387},
            {"word": "for", "start_time": 11.5, "end_time": 11.6, "confidence": 0.8659737706184387},
            {"word": "$50", "start_time": 11.6, "end_time": 12.4, "confidence": 0.8659737706184387},
            {"word": "Link", "start_time": 12.7, "end_time": 13.1, "confidence": 0.8659737706184387},
            {"word": "Bio.", "start_time": 13.1, "end_time": 13.6, "confidence": 0.8659737706184387},
        ]
|