Spaces:

Elvoro
/

Tools

Running

App Files Community

jebin2 committed 8 days ago

Commit

7bfc215

1 Parent(s): b0d5eeb

feat: Add AI-driven verification and correction for speech-to-text timed transcripts.

Browse files

Files changed (4) hide show

src/google_src/ai_studio_sdk.py +17 -0
src/google_src/stt.py +43 -1
src/pipelines/ai_pipeline.py +1 -1
src/prompt/stt_verification.md +17 -0

src/google_src/ai_studio_sdk.py CHANGED Viewed

@@ -328,6 +328,23 @@ def _get_mock_response(prompt: str) -> str:
     "reason": "Pricing and CTA (\"Link in bio\") refer to the product purchase"
   }
 ]
 """
     # Default fallback

     "reason": "Pricing and CTA (\"Link in bio\") refer to the product purchase"
   }
 ]
+"""
+    # 6. Transcript Verification Prompt
+    if "verify and correct the timed words" in prompt_lower or "speech-to-text alignment" in prompt_lower:
+        return """
+[
+  { "word": "If", "start_time": 0.2, "end_time": 0.4, "confidence": 0.99 },
+  { "word": "you're", "start_time": 0.4, "end_time": 0.5, "confidence": 0.99 },
+  { "word": "creating", "start_time": 0.5, "end_time": 0.9, "confidence": 0.99 },
+  { "word": "content", "start_time": 0.9, "end_time": 1.3, "confidence": 0.99 },
+  { "word": "for", "start_time": 1.3, "end_time": 1.4, "confidence": 0.99 },
+  { "word": "social", "start_time": 1.4, "end_time": 1.8, "confidence": 0.99 },
+  { "word": "media,", "start_time": 1.8, "end_time": 2.3, "confidence": 0.99 },
+  { "word": "you", "start_time": 2.3, "end_time": 2.4, "confidence": 0.99 },
+  { "word": "need", "start_time": 2.4, "end_time": 2.7, "confidence": 0.99 },
+  { "word": "b-roll", "start_time": 2.7, "end_time": 3.3, "confidence": 0.99 }
+]
 """
     # Default fallback

src/google_src/stt.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import json
 from typing import List, Dict, Union
 from google.cloud import speech_v1 as speech
 from src.utils import logger
@@ -12,7 +13,7 @@ class GoogleSTT:
             credentials = get_gcs_credentials("final_data")
         self.client = speech.SpeechClient(credentials=credentials)
-    def generate_timed_transcript(self, audio_input: Union[str, bytes]) -> List[Dict]:
         """
         Generate timed transcript using Google Cloud Speech-to-Text.
@@ -106,6 +107,47 @@ class GoogleSTT:
             logger.info(f"✅ Generated timed transcript: {len(words)} words")
             logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")
             return words
         except Exception as e:

 import json
+import json5
 from typing import List, Dict, Union
 from google.cloud import speech_v1 as speech
 from src.utils import logger
             credentials = get_gcs_credentials("final_data")
         self.client = speech.SpeechClient(credentials=credentials)
+    def generate_timed_transcript(self, audio_input: Union[str, bytes], verify_with_text: str = None) -> List[Dict]:
         """
         Generate timed transcript using Google Cloud Speech-to-Text.
             logger.info(f"✅ Generated timed transcript: {len(words)} words")
             logger.debug(f"Timed Transcript:\n{json.dumps(words, indent=2)}")
+            if verify_with_text:
+                logger.info("🔍 Verifying transcript with text...")
+                try:
+                    # Construct prompt for verification
+                    prompt_path = os.path.join(os.path.dirname(__file__), "../prompt/stt_verification.md")
+                    if os.path.exists(prompt_path):
+                        with open(prompt_path, "r") as f:
+                            prompt_template = f.read()
+                        prompt = prompt_template.format(
+                            verify_with_text=verify_with_text,
+                            timed_words_json=json.dumps(words)
+                        )
+                    else:
+                        logger.warning(f"⚠️ Prompt file not found at {prompt_path}, skipping verification.")
+                        return words
+                    from . import ai_studio_sdk
+                    response_text = ai_studio_sdk.generate(prompt)
+                    if response_text:
+                        # Clean up response if it contains markdown code blocks
+                        clean_response = response_text.replace("```json", "").replace("```", "").strip()
+                        corrected_words = json5.loads(clean_response)
+                        # Basic validation
+                        if isinstance(corrected_words, list) and len(corrected_words) > 0:
+                            logger.info(f"✅ Verified transcript: {len(corrected_words)} words")
+                            words = corrected_words
+                        else:
+                             logger.warning("⚠️ Verification returned invalid format, keeping original transcript.")
+                    else:
+                        logger.warning("⚠️ Verification failed (no response), keeping original transcript.")
+                except Exception as e:
+                     logger.error(f"⚠️ Transcript verification failed: {e}")
+                     # Fallback to original words on failure
             return words
         except Exception as e:

src/pipelines/ai_pipeline.py CHANGED Viewed

@@ -128,7 +128,7 @@ class AIContentAutomationBase(ContentAutomationBase):
         # Generate timed transcript
         logger.info("\n⏱️ STEP 5a: Generate Timed Transcript")
-        timed_words = self.stt.generate_timed_transcript(tts_audio_data["local_path"])
         visual_assets = get_config_value("visual_assets", {})
         visual_assets["timed_transcript"] = timed_words
         set_config_value("visual_assets", visual_assets)

         # Generate timed transcript
         logger.info("\n⏱️ STEP 5a: Generate Timed Transcript")
+        timed_words = self.stt.generate_timed_transcript(tts_audio_data["local_path"], tts_audio_data["text"])
         visual_assets = get_config_value("visual_assets", {})
         visual_assets["timed_transcript"] = timed_words
         set_config_value("visual_assets", visual_assets)

src/prompt/stt_verification.md ADDED Viewed

	@@ -0,0 +1,17 @@

+You are an expert in speech-to-text alignment.
+I have a ground truth text and a list of timed words generated by an STT engine.
+The STT engine might have made spelling mistakes, missed words, or added extra words.
+Your task is to correct the 'word' field of each entry in the timed words list so that it matches the ground truth text,
+while preserving the 'start_time' and 'end_time' values as much as possible.
+If a word is present in the ground truth but missing from the STT output, you may interpolate its timings or merge/split the timings of neighboring words;
+if the STT output contains an extra word that is not in the ground truth, remove it.
+The most important requirement is that the sequence of 'word' values in your output EXACTLY matches the ground truth text.
+Ground Truth Text:
+"{verify_with_text}"
+Timed Words List (JSON):
+{timed_words_json}
+Return ONLY the corrected JSON list of objects with keys: "word", "start_time", "end_time", "confidence".
+Do not return any markdown formatting or explanation. Just the JSON.