jebin2's picture
new ch
20ff2b7
import json
import json5
from typing import List, Dict, Union
from google.cloud import speech_v1 as speech
from src.logger_config import logger
import os
from .gcs_utils import get_gcs_credentials
from src.config import get_config_value
class GoogleSTT:
def __init__(self, credentials=None):
if not credentials:
credentials = get_gcs_credentials("final_data")
self.client = speech.SpeechClient(credentials=credentials)
def generate_timed_transcript(self, audio_input: Union[str, bytes], verify_with_text: str = None) -> List[Dict]:
"""
Generate timed transcript using Google Cloud Speech-to-Text.
Args:
audio_input: File path (str) or audio content (bytes).
Returns:
List of dictionaries containing 'word', 'start_time', 'end_time', 'confidence'.
"""
try:
content = None
if isinstance(audio_input, str):
if os.path.exists(audio_input):
with open(audio_input, "rb") as audio_file:
content = audio_file.read()
else:
raise FileNotFoundError(f"Audio file not found: {audio_input}")
elif isinstance(audio_input, bytes):
content = audio_input
else:
raise ValueError("audio_input must be a file path string or bytes.")
logger.debug("🎤 Generating timed transcript...")
if get_config_value("TEST_AUTOMATION"):
logger.info("🧪 TEST_MODE: Skipping Google STT, returning dummy transcript")
return [
{ "word": "If", "start_time": 0.2, "end_time": 0.4, "confidence": 0.8659737706184387},
{ "word": "you're", "start_time": 0.4, "end_time": 0.5, "confidence": 0.8659737706184387},
{ "word": "creating", "start_time": 0.5, "end_time": 0.9, "confidence": 0.8659737706184387},
{ "word": "content", "start_time": 0.9, "end_time": 1.3, "confidence": 0.8659737706184387},
{ "word": "for", "start_time": 1.3, "end_time": 1.4, "confidence": 0.8659737706184387},
{ "word": "social", "start_time": 1.4, "end_time": 1.8, "confidence": 0.8659737706184387},
{ "word": "media,", "start_time": 1.8, "end_time": 2.3, "confidence": 0.8659737706184387},
{ "word": "you", "start_time": 2.3, "end_time": 2.4, "confidence": 0.8659737706184387},
{ "word": "need", "start_time": 2.4, "end_time": 2.7, "confidence": 0.8659737706184387},
{ "word": "b-roll", "start_time": 2.7, "end_time": 3.3, "confidence": 0.8659737706184387},
{ "word": "but", "start_time": 3.4, "end_time": 3.6, "confidence": 0.8659737706184387},
{ "word": "filming,", "start_time": 3.6, "end_time": 3.9, "confidence": 0.8659737706184387},
{ "word": "it", "start_time": 3.9, "end_time": 4.0, "confidence": 0.8659737706184387},
{ "word": "yourself", "start_time": 4.0, "end_time": 4.4, "confidence": 0.8659737706184387},
{ "word": "takes", "start_time": 4.4, "end_time": 4.7, "confidence": 0.8659737706184387},
{ "word": "forever", "start_time": 4.7, "end_time": 5.3, "confidence": 0.8659737706184387},
{ "word": "and", "start_time": 5.5, "end_time": 5.6, "confidence": 0.8659737706184387},
{ "word": "stock", "start_time": 5.6, "end_time": 6.0, "confidence": 0.8659737706184387},
{ "word": "sites", "start_time": 6.0, "end_time": 6.2, "confidence": 0.8659737706184387},
{ "word": "charge", "start_time": 6.2, "end_time": 6.6, "confidence": 0.8659737706184387},
{ "word": "$60", "start_time": 6.6, "end_time": 7.3, "confidence": 0.8659737706184387},
{ "word": "per", "start_time": 7.3, "end_time": 7.4, "confidence": 0.8659737706184387},
{ "word": "clip.", "start_time": 7.4, "end_time": 7.9, "confidence": 0.8659737706184387},
{ "word": "I", "start_time": 8.1, "end_time": 8.3, "confidence": 0.8659737706184387},
{ "word": "use", "start_time": 8.3, "end_time": 8.4, "confidence": 0.8659737706184387},
{ "word": "this", "start_time": 8.4, "end_time": 8.6, "confidence": 0.8659737706184387},
{ "word": "Library", "start_time": 8.6, "end_time": 9.0, "confidence": 0.8659737706184387},
{ "word": "instead", "start_time": 9.0, "end_time": 9.6, "confidence": 0.8659737706184387},
{ "word": "1,000", "start_time": 9.9, "end_time": 10.5, "confidence": 0.8659737706184387},
{ "word": "luxury", "start_time": 10.5, "end_time": 10.9, "confidence": 0.8659737706184387},
{ "word": "clips", "start_time": 10.9, "end_time": 11.4, "confidence": 0.8659737706184387},
{ "word": "for", "start_time": 11.5, "end_time": 11.6, "confidence": 0.8659737706184387},
{ "word": "$50", "start_time": 11.6, "end_time": 12.4, "confidence": 0.8659737706184387},
{ "word": "Link", "start_time": 12.7, "end_time": 13.1, "confidence": 0.8659737706184387},
{ "word": "Bio.", "start_time": 13.1, "end_time": 13.6, "confidence": 0.8659737706184387}
]
audio = speech.RecognitionAudio(content=content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.MP3,
sample_rate_hertz=24000,
language_code="en-US",
enable_automatic_punctuation=True,
enable_word_time_offsets=True,
model="video",
)
logger.debug("🔄 Transcribing audio with word-level timing...")
operation = self.client.long_running_recognize(config=config, audio=audio)
response = operation.result(timeout=90)
words = []
for result in response.results:
for alternative in result.alternatives:
for word_info in alternative.words:
words.append(
{
"word": word_info.word,
"start_time": word_info.start_time.total_seconds(),
"end_time": word_info.end_time.total_seconds(),
"confidence": alternative.confidence,
}
)
logger.debug(f"Generated timed transcript: {len(words)} words")
logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")
if verify_with_text:
import re
# Normalize both texts: remove non-alphanumeric, lowercase
def normalize_text(text):
return re.sub(r'[^a-zA-Z0-9]', '', text).lower()
stt_text = "".join([w["word"] for w in words])
normalized_stt = normalize_text(stt_text)
normalized_verify = normalize_text(verify_with_text)
if normalized_stt == normalized_verify:
logger.debug("STT transcript matches text (alphanumeric check), skipping verification.")
else:
logger.debug("Verifying transcript with text...")
try:
# Construct prompt for verification
prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
if os.path.exists(prompt_path):
with open(prompt_path, "r") as f:
prompt_template = f.read()
prompt = prompt_template.format(
verify_with_text=verify_with_text,
timed_words_json=json.dumps(words)
)
else:
logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
return words
from . import ai_studio_sdk
response_text = ai_studio_sdk.generate(prompt)
if response_text:
# Clean up response if it contains markdown code blocks
clean_response = response_text.replace("```json", "").replace("```", "").strip()
corrected_words = json5.loads(clean_response)
# Basic validation
if isinstance(corrected_words, list) and len(corrected_words) > 0:
logger.debug(f"Verified transcript: {len(corrected_words)} words")
logger.debug(f"Verified Transcript:\n{json.dumps(corrected_words, indent=2)}")
words = corrected_words
else:
logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")
else:
logger.warning("⚠️ Verification failed (no response), keeping original transcript.")
except Exception as e:
logger.error(f"⚠️ Transcript verification failed: {e}")
return words
except Exception as e:
logger.error(f"❌ Speech-to-Text failed: {e}")
raise