|
|
import json |
|
|
import json5 |
|
|
from typing import List, Dict, Union |
|
|
from google.cloud import speech_v1 as speech |
|
|
from src.logger_config import logger |
|
|
import os |
|
|
from .gcs_utils import get_gcs_credentials |
|
|
from src.config import get_config_value |
|
|
|
|
|
class GoogleSTT:
    """Wrapper around Google Cloud Speech-to-Text that produces word-level
    timed transcripts, with an optional LLM-based verification pass."""

    def __init__(self, credentials=None):
        # Fall back to the project's GCS credentials when none are supplied.
        # NOTE(review): `if not credentials` also triggers the fallback for
        # falsy-but-non-None credential objects — presumably only None is ever
        # passed; confirm if custom credential types are introduced.
        if not credentials:
            credentials = get_gcs_credentials("final_data")
        self.client = speech.SpeechClient(credentials=credentials)

    def generate_timed_transcript(self, audio_input: Union[str, bytes], verify_with_text: Union[str, None] = None) -> List[Dict]:
        """
        Generate timed transcript using Google Cloud Speech-to-Text.

        Args:
            audio_input: File path (str) or audio content (bytes).
            verify_with_text: Optional reference text. If given and it does not
                already match the STT output (compared alphanumeric-only,
                case-insensitive), the transcript is sent through the
                "prompt/stt_verification.md" prompt via ai_studio_sdk for
                correction. On any verification failure the original
                (unverified) transcript is returned.

        Returns:
            List of dictionaries containing 'word', 'start_time', 'end_time', 'confidence'.

        Raises:
            FileNotFoundError: If audio_input is a path that does not exist.
            ValueError: If audio_input is neither str nor bytes.
            Exception: Any other STT failure is logged and re-raised.
        """
        try:
            content = None
            # Accept either a filesystem path or raw audio bytes.
            if isinstance(audio_input, str):
                if os.path.exists(audio_input):
                    with open(audio_input, "rb") as audio_file:
                        content = audio_file.read()
                else:
                    raise FileNotFoundError(f"Audio file not found: {audio_input}")
            elif isinstance(audio_input, bytes):
                content = audio_input
            else:
                raise ValueError("audio_input must be a file path string or bytes.")

            logger.debug("🎤 Generating timed transcript...")

            # Test-automation short-circuit: skip the (billed, slow) STT call
            # and return a canned transcript with realistic word timings.
            # Note this also bypasses the verify_with_text path below.
            if get_config_value("TEST_AUTOMATION"):
                logger.info("🧪 TEST_MODE: Skipping Google STT, returning dummy transcript")
                return [
                    { "word": "If", "start_time": 0.2, "end_time": 0.4, "confidence": 0.8659737706184387},
                    { "word": "you're", "start_time": 0.4, "end_time": 0.5, "confidence": 0.8659737706184387},
                    { "word": "creating", "start_time": 0.5, "end_time": 0.9, "confidence": 0.8659737706184387},
                    { "word": "content", "start_time": 0.9, "end_time": 1.3, "confidence": 0.8659737706184387},
                    { "word": "for", "start_time": 1.3, "end_time": 1.4, "confidence": 0.8659737706184387},
                    { "word": "social", "start_time": 1.4, "end_time": 1.8, "confidence": 0.8659737706184387},
                    { "word": "media,", "start_time": 1.8, "end_time": 2.3, "confidence": 0.8659737706184387},
                    { "word": "you", "start_time": 2.3, "end_time": 2.4, "confidence": 0.8659737706184387},
                    { "word": "need", "start_time": 2.4, "end_time": 2.7, "confidence": 0.8659737706184387},
                    { "word": "b-roll", "start_time": 2.7, "end_time": 3.3, "confidence": 0.8659737706184387},
                    { "word": "but", "start_time": 3.4, "end_time": 3.6, "confidence": 0.8659737706184387},
                    { "word": "filming,", "start_time": 3.6, "end_time": 3.9, "confidence": 0.8659737706184387},
                    { "word": "it", "start_time": 3.9, "end_time": 4.0, "confidence": 0.8659737706184387},
                    { "word": "yourself", "start_time": 4.0, "end_time": 4.4, "confidence": 0.8659737706184387},
                    { "word": "takes", "start_time": 4.4, "end_time": 4.7, "confidence": 0.8659737706184387},
                    { "word": "forever", "start_time": 4.7, "end_time": 5.3, "confidence": 0.8659737706184387},
                    { "word": "and", "start_time": 5.5, "end_time": 5.6, "confidence": 0.8659737706184387},
                    { "word": "stock", "start_time": 5.6, "end_time": 6.0, "confidence": 0.8659737706184387},
                    { "word": "sites", "start_time": 6.0, "end_time": 6.2, "confidence": 0.8659737706184387},
                    { "word": "charge", "start_time": 6.2, "end_time": 6.6, "confidence": 0.8659737706184387},
                    { "word": "$60", "start_time": 6.6, "end_time": 7.3, "confidence": 0.8659737706184387},
                    { "word": "per", "start_time": 7.3, "end_time": 7.4, "confidence": 0.8659737706184387},
                    { "word": "clip.", "start_time": 7.4, "end_time": 7.9, "confidence": 0.8659737706184387},
                    { "word": "I", "start_time": 8.1, "end_time": 8.3, "confidence": 0.8659737706184387},
                    { "word": "use", "start_time": 8.3, "end_time": 8.4, "confidence": 0.8659737706184387},
                    { "word": "this", "start_time": 8.4, "end_time": 8.6, "confidence": 0.8659737706184387},
                    { "word": "Library", "start_time": 8.6, "end_time": 9.0, "confidence": 0.8659737706184387},
                    { "word": "instead", "start_time": 9.0, "end_time": 9.6, "confidence": 0.8659737706184387},
                    { "word": "1,000", "start_time": 9.9, "end_time": 10.5, "confidence": 0.8659737706184387},
                    { "word": "luxury", "start_time": 10.5, "end_time": 10.9, "confidence": 0.8659737706184387},
                    { "word": "clips", "start_time": 10.9, "end_time": 11.4, "confidence": 0.8659737706184387},
                    { "word": "for", "start_time": 11.5, "end_time": 11.6, "confidence": 0.8659737706184387},
                    { "word": "$50", "start_time": 11.6, "end_time": 12.4, "confidence": 0.8659737706184387},
                    { "word": "Link", "start_time": 12.7, "end_time": 13.1, "confidence": 0.8659737706184387},
                    { "word": "Bio.", "start_time": 13.1, "end_time": 13.6, "confidence": 0.8659737706184387}
                ]

            audio = speech.RecognitionAudio(content=content)
            # NOTE(review): encoding/sample rate are hard-coded (MP3 @ 24 kHz) —
            # assumes upstream audio is always produced in this format; confirm
            # against the audio-generation pipeline.
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.MP3,
                sample_rate_hertz=24000,
                language_code="en-US",
                enable_automatic_punctuation=True,
                enable_word_time_offsets=True,  # required for per-word start/end times
                model="video",
            )

            logger.debug("🔄 Transcribing audio with word-level timing...")
            operation = self.client.long_running_recognize(config=config, audio=audio)
            # Blocks up to 90 s waiting for the long-running operation;
            # raises if it does not finish in time.
            response = operation.result(timeout=90)

            # Flatten results into one list of timed word dicts. Iterates every
            # alternative; with the default max_alternatives there is one per
            # result, so no duplication in practice.
            words = []
            for result in response.results:
                for alternative in result.alternatives:
                    for word_info in alternative.words:
                        words.append(
                            {
                                "word": word_info.word,
                                "start_time": word_info.start_time.total_seconds(),
                                "end_time": word_info.end_time.total_seconds(),
                                # Confidence is per-alternative, not per-word, so
                                # the same value repeats for each word.
                                "confidence": alternative.confidence,
                            }
                        )

            logger.debug(f"Generated timed transcript: {len(words)} words")
            logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")

            # Optional verification pass against the expected script text.
            if verify_with_text:
                import re

                def normalize_text(text):
                    # Compare alphanumeric content only, case-insensitively, so
                    # punctuation/whitespace differences don't force an LLM call.
                    return re.sub(r'[^a-zA-Z0-9]', '', text).lower()

                stt_text = "".join([w["word"] for w in words])
                normalized_stt = normalize_text(stt_text)
                normalized_verify = normalize_text(verify_with_text)

                if normalized_stt == normalized_verify:
                    logger.debug("STT transcript matches text (alphanumeric check), skipping verification.")
                else:
                    logger.debug("Verifying transcript with text...")
                    # Best-effort verification: any failure below is logged and
                    # the unverified transcript is returned instead of raising.
                    try:
                        # Prompt template lives alongside this package.
                        prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
                        if os.path.exists(prompt_path):
                            with open(prompt_path, "r") as f:
                                prompt_template = f.read()

                            prompt = prompt_template.format(
                                verify_with_text=verify_with_text,
                                timed_words_json=json.dumps(words)
                            )
                        else:
                            logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
                            return words

                        # Imported lazily to avoid a hard dependency when
                        # verification is not requested.
                        from . import ai_studio_sdk
                        response_text = ai_studio_sdk.generate(prompt)

                        if response_text:
                            # Strip markdown code fences; json5 tolerates minor
                            # formatting deviations in the LLM output.
                            clean_response = response_text.replace("```json", "").replace("```", "").strip()
                            corrected_words = json5.loads(clean_response)

                            # Only adopt the corrected transcript if it is a
                            # non-empty list; otherwise keep the original.
                            if isinstance(corrected_words, list) and len(corrected_words) > 0:
                                logger.debug(f"Verified transcript: {len(corrected_words)} words")
                                logger.debug(f"Verified Transcript:\n{json.dumps(corrected_words, indent=2)}")
                                words = corrected_words
                            else:
                                logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")
                        else:
                            logger.warning("⚠️ Verification failed (no response), keeping original transcript.")

                    except Exception as e:
                        logger.error(f"⚠️ Transcript verification failed: {e}")

            return words

        except Exception as e:
            # Log and re-raise so callers can handle/abort; nothing is swallowed.
            logger.error(f"❌ Speech-to-Text failed: {e}")
            raise
|
|
|