""" Precision Patch: Post-transcription NER + Confidence correction service. This service identifies proper nouns and ambiguous tokens (ORG, PRODUCT, PERSON, GPE, LOC, CARDINAL) in transcribed text using spaCy, cross-references their confidence against Whisper's word-level probabilities, and sends only "suspicious" segments to the LLM for correction. Key design decisions: - CARDINAL is included because spaCy sometimes mis-tags unknown proper nouns (e.g. "NowCree") as CARDINAL - we still want to catch those. - URLs (e.g. "notebookklem.google.com") are NOT tagged by spaCy's NER at all. They are captured separately via a regex fallback. - The LLM correction pass is batched: all suspicious segments are sent in ONE call. """ import re import spacy # Regex to find URL-like tokens whisper may have garbled _URL_PATTERN = re.compile(r'\b[\w.-]+\.(?:com|org|net|io|ai|google|co)\b', re.IGNORECASE) class PrecisionPatch: """ Identifies and corrects low-confidence proper nouns in Whisper transcriptions. """ # Entity labels considered "name-like" - includes CARDINAL because spaCy # sometimes misclassifies unknown capitalized words (like brand names) as CARDINAL. ENTITY_LABELS = {"ORG", "PRODUCT", "PERSON", "GPE", "LOC", "CARDINAL"} # Confidence threshold - entities below this are considered suspicious CONFIDENCE_THRESHOLD = 0.85 def __init__(self): try: self.nlp = spacy.load("en_core_web_sm") except OSError: import subprocess, sys subprocess.run( [sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True ) self.nlp = spacy.load("en_core_web_sm") def find_entities(self, text: str) -> list[dict]: """ Identify named entities AND URL-like tokens in text that could be brand names or proper nouns worth verifying. Args: text: The transcript segment text. Returns: List of dicts with keys: text, start (char offset), end (char offset), label """ doc = self.nlp(text) entities = [ { "text": ent.text, "start": ent.start_char, "end": ent.end_char, "label": ent.label_, } for ent in doc.ents if ent.label_ in self.ENTITY_LABELS ] # Regex fallback: catch URL-like tokens spaCy's NER misses entirely seen_spans = {(e["start"], e["end"]) for e in entities} for m in _URL_PATTERN.finditer(text): span = (m.start(), m.end()) if span not in seen_spans: entities.append({ "text": m.group(), "start": m.start(), "end": m.end(), "label": "URL", }) seen_spans.add(span) return entities def map_entities_to_confidence(self, entities: list[dict], whisper_words: list, segment_text: str) -> list[dict]: """ Calculates average probability for each spaCy entity based on Whisper words. Uses character offset alignment between the text and whisper word objects. """ if not whisper_words: for ent in entities: ent["confidence"] = 0.0 return entities # Pre-calculate char offsets for each whisper word in the segment_text word_offsets = [] current_pos = 0 for w in whisper_words: # Whisper words usually have leading spaces, so we find where it appears # relative to our current position in the segment_text. start_idx = segment_text.find(w.word, current_pos) if start_idx == -1: # Fallback: if not found, just assume it follows immediately start_idx = current_pos end_idx = start_idx + len(w.word) word_offsets.append({ "start": start_idx, "end": end_idx, "prob": w.probability }) current_pos = end_idx for ent in entities: overlapping_probs = [] for w_off in word_offsets: # Check for any overlap between entity span and word span if max(ent["start"], w_off["start"]) < min(ent["end"], w_off["end"]): overlapping_probs.append(w_off["prob"]) if overlapping_probs: ent["confidence"] = sum(overlapping_probs) / len(overlapping_probs) else: ent["confidence"] = 0.0 return entities def get_suspicious_indices(self, segments: list) -> list[int]: """ Identifies indices of segments that contain low-confidence entities. """ suspicious_indices = [] for i, seg in enumerate(segments): entities = self.find_entities(seg.text) if not entities: continue entities = self.map_entities_to_confidence(entities, seg.words, seg.text) is_suspicious = any(e["confidence"] < self.CONFIDENCE_THRESHOLD for e in entities) if is_suspicious: suspicious_indices.append(i) return suspicious_indices def apply_patch(self, segments: list, suspicious_indices: list[int]): """ Takes segments and suspicious indices, uses Gemini to correct them, and updates segments in place. Includes surrounding context for better accuracy. """ if not suspicious_indices: return segments from app.services.translators.gemini_adapter import GeminiAdapter gemini = GeminiAdapter() # Build a set of indices to send, including 1 line of context indices_to_send = set() for idx in suspicious_indices: if idx > 0: indices_to_send.add(idx - 1) indices_to_send.add(idx) if idx < len(segments) - 1: indices_to_send.add(idx + 1) sorted_indices = sorted(list(indices_to_send)) original_lines = [segments[i].text for i in sorted_indices] # Call Gemini for batch correction corrected_lines = gemini.correct_batch(original_lines) # Apply corrections back to segments for i, corrected_text in zip(sorted_indices, corrected_lines): original_text = segments[i].text # Defensive check: If the correction is a fragment (e.g. just the word "Naukri") # we reject it to prevent massive context loss. # Rule: If original has > 2 words and correction has 1 word, it's likely a fragment. orig_words = original_text.split() corr_words = corrected_text.split() if len(orig_words) > 2 and len(corr_words) <= 1: print(f" ⚠️ Warning: Precision Patch rejected a fragmented response for line {i+1} to preserve context.") continue segments[i].text = corrected_text return segments def apply_precision_patch(segments: list): """ Convenience function to run the full Precision Patch workflow on a list of segments. """ patcher = PrecisionPatch() suspicious_indices = patcher.get_suspicious_indices(segments) if suspicious_indices: print(f" ✨ Precision Patch: Found {len(suspicious_indices)} segments with low-confidence entities. Correcting...") patcher.apply_patch(segments, suspicious_indices) else: print(" ✅ Precision Patch: No suspicious entities found.") return segments