#!/usr/bin/env python3
"""Procedural dictation → syntax processor.

No LLM needed. Just token scanning with a symbol vocabulary.

Rules:
  1. "space" → literal space
  2. Symbol words → their character (dash→-, dot→., etc.)
  3. Synonyms: minus→-, period→., forward slash→/, etc.
  4. Number words → digits (one→1, forty two→42, hundred→00, thousand→000)
  5. Casing directives: camel case, snake case, pascal case, kebab case
  6. "capital X" → X (uppercase), "all caps word" → WORD
  7. Everything else → pass through literally
"""

import json
import re

# ── Symbol vocabulary ────────────────────────────────────────────────────

# Single spoken word → emitted character(s). A value of None marks a word
# that only becomes a symbol as part of a two-word phrase.
SYMBOLS = {
    # Primary protocol words
    'dash': '-',
    'dot': '.',
    'slash': '/',
    'pipe': '|',
    'redirect': '>',
    'append': '>>',
    'less': None,  # needs lookahead for "less than"
    'star': '*',
    'bang': '!',
    'hash': '#',
    'tilde': '~',
    'at': '@',
    'dollar': '$',
    'percent': '%',
    'caret': '^',
    'ampersand': '&',
    'equals': '=',
    'plus': '+',
    'colon': ':',
    'semicolon': ';',
    'underscore': '_',
    'comma': ',',
    'backslash': '\\',
    'quote': '"',
    'backtick': '`',
    'question': None,  # needs lookahead for "question mark"
    # Synonyms — common alternatives people use
    'minus': '-',
    'hyphen': '-',
    'period': '.',
    'asterisk': '*',
    'hashtag': '#',
}

# Two-word symbols (checked before single-word)
TWO_WORD_SYMBOLS = {
    ('single', 'quote'): "'",
    ('open', 'paren'): '(',
    ('close', 'paren'): ')',
    ('open', 'brace'): '{',
    ('close', 'brace'): '}',
    ('open', 'bracket'): '[',
    ('close', 'bracket'): ']',
    ('open', 'angle'): '<',
    ('close', 'angle'): '>',
    ('open', 'curly'): '{',
    ('close', 'curly'): '}',
    ('less', 'than'): '<',
    ('question', 'mark'): '?',
    ('dash', 'dash'): '--',
    ('double', 'dash'): '--',
    ('minus', 'minus'): '--',
    ('and', 'and'): '&&',
    ('pipe', 'pipe'): '||',
    ('dot', 'dot'): '..',
    ('two', 'redirect'): '2>',
    ('forward', 'slash'): '/',
    ('back', 'slash'): '\\',
    ('equals', 'sign'): '=',
    ('at', 'sign'): '@',
    ('dollar', 'sign'): '$',
    ('open', 'parenthesis'): '(',
    ('close', 'parenthesis'): ')',
    ('new', 'line'): '\n',
}

# Three-word symbols
THREE_WORD_SYMBOLS = {
    ('two', 'redirect', 'ampersand'): '2>&',
}

# ── Number words ─────────────────────────────────────────────────────────

ONES = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
    'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9,
    'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13,
    'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17,
    'eighteen': 18, 'nineteen': 19,
}

TENS = {
    'twenty': 20, 'thirty': 30, 'forty': 40, 'fifty': 50,
    'sixty': 60, 'seventy': 70, 'eighty': 80, 'ninety': 90,
}

MULTIPLIERS = {
    'hundred': 100,
    'thousand': 1000,
}

ALL_NUMBER_WORDS = set(ONES) | set(TENS) | set(MULTIPLIERS)


def is_number_word(w):
    """Return True if *w* is a ones, teens, tens or multiplier word."""
    return w in ALL_NUMBER_WORDS


def consume_number(words, i):
    """Try to consume a number starting at position i.

    Handles:
    - Single: "five" → 5
    - Teens: "twelve" → 12
    - Compound: "forty two" → 42 (only 1-9 may follow a tens word)
    - Multipliers: "three thousand" → 3000, "one hundred" → 100
    - Bare multipliers: "hundred" → 00, "thousand" → 000
    - Digit sequences: "one nine two" → 192 (a ones/teens word followed
      by one or more single digits)

    Args:
        words: list of dictated tokens.
        i: index of the token to start at.

    Returns:
        (digits, new_i) where digits is the emitted string and new_i is the
        index of the first token not consumed, or (None, i) when words[i]
        is not a number word.
    """
    n = len(words)
    w = words[i]

    # Tens word: twenty, thirty, etc.
    if w in TENS:
        val = TENS[w]
        j = i + 1
        # "forty two" compound. Zero is excluded on purpose: "twenty zero"
        # must not silently swallow the dictated zero.
        if j < n and words[j] in ONES and 0 < ONES[words[j]] < 10:
            val += ONES[words[j]]
            j += 1
        # Check for multiplier: "forty thousand"
        if j < n and words[j] in MULTIPLIERS:
            val *= MULTIPLIERS[words[j]]
            j += 1
        return str(val), j

    # Single/teens: zero through nineteen
    if w in ONES:
        val = ONES[w]
        j = i + 1
        # Check for multiplier: "three thousand", "one hundred"
        if j < n and words[j] in MULTIPLIERS:
            # Larger compounds ("three thousand two hundred") are not
            # combined; each group is emitted on its own.
            return str(val * MULTIPLIERS[words[j]]), j + 1

        # Digit sequence: "one nine two" → "192". Only single digits (0-9)
        # extend the sequence.
        digits = [str(val)]
        while j < n and words[j] in ONES and ONES[words[j]] < 10:
            digits.append(str(ONES[words[j]]))
            j += 1
        return ''.join(digits), j

    # Bare multiplier: emit just its zeros ("hundred" → "00"), so that
    # "three hundred thousand" comes out as "300" + "000".
    if w in MULTIPLIERS:
        return str(MULTIPLIERS[w])[1:], i + 1

    return None, i


# ── Casing directives ───────────────────────────────────────────────────

CASING_DIRECTIVES = {'camel', 'snake', 'pascal', 'kebab', 'screaming'}


def consume_casing(words, i):
    """Try to consume a casing directive and its arguments.

    "camel case get user profile" → "getUserProfile"
    "snake case api key" → "api_key"
    "pascal case my component" → "MyComponent"
    "kebab case my component" → "my-component"
    "screaming case max size" → "MAX_SIZE"

    Consumes words until "space", a single-word or two-word symbol, another
    casing directive, "all", "capital", or end of input.

    Returns (result, new_i) or (None, i).
    """
    n = len(words)
    style = words[i].lower()
    if style not in CASING_DIRECTIVES:
        return None, i
    if i + 1 >= n or words[i + 1].lower() != 'case':
        return None, i

    j = i + 2
    parts = []
    while j < n:
        next_w = words[j]
        if next_w == 'space' or next_w in SYMBOLS:
            break
        if next_w == 'all' or next_w == 'capital':
            break
        if j + 1 < n:
            following = words[j + 1]
            # Another "<style> case" directive starts here.
            if next_w in CASING_DIRECTIVES and following == 'case':
                break
            # TWO_WORD_SYMBOLS is keyed by (word, word) tuples, so the pair
            # must be tested; a lone string is never a key.
            if (next_w, following) in TWO_WORD_SYMBOLS:
                break
        parts.append(next_w.lower())
        j += 1

    if not parts:
        return None, i

    if style == 'camel':
        result = parts[0] + ''.join(p.capitalize() for p in parts[1:])
    elif style == 'pascal':
        result = ''.join(p.capitalize() for p in parts)
    elif style == 'snake':
        result = '_'.join(parts)
    elif style == 'kebab':
        result = '-'.join(parts)
    elif style == 'screaming':
        result = '_'.join(p.upper() for p in parts)
    else:
        # Defensive: only reachable if CASING_DIRECTIVES gains a style
        # without a matching branch above.
        return None, i

    return result, j


# ── ML-based needs_llm detector ─────────────────────────────────────────
# Replaces the old heuristic is_protocol(). Loads weights from
# needs-llm-model.json and does feature extraction + dot product + sigmoid.
# No sklearn needed at runtime.
import math as _math

_NLM_MODEL = None  # cached on first call


def _load_nlm_model():
    """Load the trained model weights from JSON (cached).

    Reads needs-llm-model.json from the directory containing this file and
    adds frozenset copies of 'protocol_vocab', 'filler_words' and
    'casing_starters' under underscore-prefixed keys for fast lookup.

    Returns:
        The model dict; the same object is returned on every later call.

    Raises:
        OSError: if the model file cannot be opened.
        KeyError: if one of the three vocabulary lists is missing.
    """
    global _NLM_MODEL
    if _NLM_MODEL is not None:
        return _NLM_MODEL

    import os

    model_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'needs-llm-model.json'
    )
    with open(model_path, encoding='utf-8') as f:
        model = json.load(f)

    # Convert lists to frozensets for fast lookup. Work on a local and only
    # publish it once complete, so a malformed file never leaves a
    # half-initialised model in the cache.
    model['_protocol_vocab'] = frozenset(model['protocol_vocab'])
    model['_filler_words'] = frozenset(model['filler_words'])
    model['_casing_starters'] = frozenset(model['casing_starters'])

    _NLM_MODEL = model
    return _NLM_MODEL


def _extract_features(text, model):
    """Extract 10 numeric features from dictated text.

    Args:
        text: the raw dictated string.
        model: dict as returned by _load_nlm_model(); the keys read here are
            '_protocol_vocab', '_filler_words', '_casing_starters',
            'intent_phrases' and 'correction_phrases'.

    Returns:
        A list of 10 numbers, in order: space_ratio, space_present,
        protocol_ratio, filler_count, intent_count, correction_count,
        starts_casing, word_count (token count / 20), non_protocol_ratio,
        avg_word_len. All zeros when the text has no tokens.
    """
    text_lower = text.lower()
    words = text_lower.split()
    n = len(words)
    if n == 0:
        return [0.0] * 10

    protocol_vocab = model['_protocol_vocab']
    filler_words = model['_filler_words']
    casing_starters = model['_casing_starters']
    intent_phrases = model['intent_phrases']
    correction_phrases = model['correction_phrases']

    space_count = words.count('space')
    space_ratio = space_count / n
    space_present = 1.0 if space_count > 0 else 0.0

    protocol_count = sum(1 for w in words if w in protocol_vocab)
    protocol_ratio = protocol_count / n

    filler_count = sum(1 for w in words if w in filler_words)

    # Phrases are matched as substrings of the lowercased text; each phrase
    # counts at most once.
    intent_count = sum(1 for p in intent_phrases if p in text_lower)
    correction_count = sum(1 for p in correction_phrases if p in text_lower)

    starts_casing = 1.0 if words[0] in casing_starters else 0.0
    word_count = n / 20.0
    non_protocol_ratio = 1.0 - protocol_ratio
    avg_word_len = sum(len(w) for w in words) / n

    return [
        space_ratio,
        space_present,
        protocol_ratio,
        filler_count,
        intent_count,
        correction_count,
        starts_casing,
        word_count,
        non_protocol_ratio,
        avg_word_len,
    ]


def needs_llm(text):
    """Does this dictated input need LLM normalization?

    Returns True if the input is fuzzy/natural/chaotic and needs LLM.
    Returns False if the input is clean protocol and the processor can
    handle it.

    Uses a trained logistic regression model (10 features, dot product +
    sigmoid) and compares the probability against the model's 'threshold'.
    """
    model = _load_nlm_model()
    features = _extract_features(text, model)

    logit = sum(f * w for f, w in zip(features, model['weights'])) + model['bias']
    try:
        prob = 1.0 / (1.0 + _math.exp(-logit))
    except OverflowError:
        # exp() overflows for a very negative logit; the sigmoid's value
        # there is 0 to within float precision.
        prob = 0.0
    return prob >= model['threshold']


def process_dictation(text):
    """Convert dictated text to syntax using purely procedural rules.

    Tokens are split on whitespace and scanned left to right. At each
    position the first matching rule wins, in this order: "space",
    three-word symbols, casing directives, two-word symbols,
    "all caps <word>", "capital <word>", single-word symbols, number words,
    and finally literal pass-through. Emitted pieces are concatenated with
    no separator, except that consecutive pass-through words inside a
    quote are joined with a single space.

    Args:
        text: the dictated string.

    Returns:
        The resulting syntax string.
    """
    # Normalize alternative casing/caps forms before splitting
    for style in ('camel', 'pascal', 'snake', 'kebab', 'screaming'):
        text = re.sub(
            rf'\b{style}[-_]?case\b', f'{style} case', text, flags=re.IGNORECASE
        )
    text = re.sub(r'\ball[-_]caps\b', 'all caps', text, flags=re.IGNORECASE)

    words = text.split()
    output = []
    i = 0
    n = len(words)
    in_quote = False        # toggled by every emitted " or '
    last_was_word = False   # True only right after a pass-through word

    while i < n:
        w = words[i]

        # ── "space" → literal space ──
        if w == 'space':
            output.append(' ')
            last_was_word = False
            i += 1
            continue

        # ── Three-word symbols ──
        if i + 2 < n:
            triple = (words[i], words[i + 1], words[i + 2])
            if triple in THREE_WORD_SYMBOLS:
                output.append(THREE_WORD_SYMBOLS[triple])
                last_was_word = False
                i += 3
                continue

        # ── Casing directives ──
        cased, new_i = consume_casing(words, i)
        if cased is not None:
            output.append(cased)
            last_was_word = False
            i = new_i
            continue

        # ── Two-word symbols ──
        if i + 1 < n:
            pair = (words[i], words[i + 1])
            if pair in TWO_WORD_SYMBOLS:
                sym = TWO_WORD_SYMBOLS[pair]
                output.append(sym)
                if sym in ('"', "'"):
                    in_quote = not in_quote
                last_was_word = False
                i += 2
                continue

        # ── "all caps <word>" ──
        if w == 'all' and i + 2 < n and words[i + 1] == 'caps':
            output.append(words[i + 2].upper())
            last_was_word = False
            i += 3
            continue

        # ── "capital <word>" ──
        if w == 'capital' and i + 1 < n:
            next_w = words[i + 1]
            if len(next_w) == 1:
                output.append(next_w.upper())
            else:
                # Uppercase only the first character; the rest is kept.
                output.append(next_w[0].upper() + next_w[1:])
            last_was_word = False
            i += 2
            continue

        # ── Single-word symbols ──
        # Entries mapped to None ("less", "question") only count as part of
        # a two-word phrase and fall through here.
        if w in SYMBOLS and SYMBOLS[w] is not None:
            sym = SYMBOLS[w]
            output.append(sym)
            if sym in ('"', "'"):
                in_quote = not in_quote
            last_was_word = False
            i += 1
            continue

        # ── Number words ──
        if is_number_word(w):
            num_str, new_i = consume_number(words, i)
            if num_str is not None:
                output.append(num_str)
                last_was_word = False
                i = new_i
                continue

        # ── Regular word → pass through ──
        # Inside quotes, insert spaces between consecutive regular words
        if in_quote and last_was_word:
            output.append(' ')
        output.append(w)
        last_was_word = True
        i += 1

    return ''.join(output)


# ── Main: evaluate ──────────────────────────────────────────────────────

def _percent(part, total):
    """Return part/total as a percentage, or 0.0 when total is zero."""
    return part / total * 100 if total else 0.0


def _run_eval(eval_file):
    """Run process_dictation over a JSON eval set and print a report.

    The file must hold a JSON list of objects with 'dictated' and
    'expected' keys; 'difficulty' and 'category' are optional. Prints exact,
    whitespace-normalized and whitespace+case-normalized match rates, a
    per-difficulty breakdown when more than one difficulty is present, and
    the first 15 mismatches.
    """
    from collections import defaultdict

    with open(eval_file, encoding='utf-8') as f:
        data = json.load(f)

    n = len(data)
    exact = ws = wsc = 0
    errors = []

    # Group by difficulty if present
    by_difficulty = defaultdict(list)

    for d in data:
        got = process_dictation(d['dictated'])
        expected = d['expected']

        ws_got = re.sub(r'\s+', ' ', got.strip())
        ws_exp = re.sub(r'\s+', ' ', expected.strip())

        is_exact = got == expected
        is_ws = ws_got == ws_exp
        is_wsc = ws_got.lower() == ws_exp.lower()

        if is_exact:
            exact += 1
        if is_ws:
            ws += 1
        if is_wsc:
            wsc += 1

        diff = d.get('difficulty', 'unknown')
        by_difficulty[diff].append(is_exact)

        if not is_exact:
            errors.append({
                'dictated': d['dictated'][:80],
                'expected': expected[:60],
                'got': got[:60],
                'category': d.get('category', ''),
                'difficulty': diff,
            })

    print(f'PROCEDURAL PROCESSOR — {eval_file}')
    print('=' * 70)
    print(f' Exact: {exact}/{n} ({_percent(exact, n):.1f}%)')
    print(f' WS-norm: {ws}/{n} ({_percent(ws, n):.1f}%)')
    print(f' WS+case: {wsc}/{n} ({_percent(wsc, n):.1f}%)')
    print()

    if len(by_difficulty) > 1:
        print('BY DIFFICULTY:')
        for diff in ['clean', 'fuzzy', 'natural', 'chaotic', 'unknown']:
            if diff in by_difficulty:
                results = by_difficulty[diff]
                ex = sum(results)
                tot = len(results)
                print(f' {diff:>10}: {ex}/{tot} ({_percent(ex, tot):.0f}%)')
        print()

    print(f'ERRORS ({len(errors)}, showing first 15):')
    print('-' * 70)
    for e in errors[:15]:
        print(f' [{e["difficulty"]:>7}] [{e["category"]}]')
        print(f' expected: {e["expected"]}')
        print(f' got: {e["got"]}')
        print()


if __name__ == '__main__':
    import sys

    _run_eval(sys.argv[1] if len(sys.argv) > 1 else 'datasets/eval-independent.json')