File size: 15,130 Bytes

04558eb

#!/usr/bin/env python3
"""Zero-training normalizer pipeline.

Architecture:
  Raw transcript
    → Protocol detector (is it already in protocol format?)
    → IF protocol: strip filler procedurally → processor
    → IF NOT protocol: LLM normalize → processor
    → Final syntax output

The LLM only handles non-protocol input (fuzzy dictation, natural language).
Protocol-format input bypasses the LLM entirely for deterministic handling.
"""

import json
import sys
import time
import re
import os
import argparse
from collections import defaultdict

from mlx_lm import load, generate
from mlx_lm.sample_utils import make_sampler

# Import the procedural processor
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', 'processor'))
from procedural import process_dictation

# ── Protocol detection ───────────────────────────────────────────────────

# Words that are part of the protocol vocabulary (not filler)
PROTOCOL_VOCAB = {
    'space', 'dash', 'dot', 'slash', 'pipe', 'star', 'bang', 'hash',
    'tilde', 'at', 'dollar', 'percent', 'caret', 'ampersand', 'equals',
    'plus', 'colon', 'semicolon', 'underscore', 'comma', 'backslash',
    'quote', 'backtick', 'redirect', 'append',
    'capital', 'camel', 'snake', 'pascal', 'kebab', 'screaming',
}

# Common conversational filler patterns to strip
FILLER_PREFIXES = [
    r"^okay\s+so\s+(?:the\s+command\s+is\s+|like\s+)?",
    r"^so\s+(?:the\s+command\s+is\s+|like\s+|it's\s+)?",
    r"^um+\s+(?:so\s+)?(?:the\s+)?",
    r"^(?:I\s+wanna?|I\s+want\s+to)\s+(?:\w+\s+)*?(?:to\s+|is\s+)?",
    r"^can\s+you\s+(?:type\s+(?:out\s+)?)?",
    r"^(?:let's\s+(?:do|see|try)\s+)",
    r"^basically\s+(?:run\s+|do\s+|type\s+)?",
    r"^(?:and\s+then|then)\s+",
    r"^right\s+so\s+",
    r"^(?:type\s+(?:out\s+)?)",
    r"^okay\s+(?:let\s+me\s+type\s+)?(?:the\s+)?(?:\w+\s+)?(?:command\s+)?(?:so\s+)?(?:it's\s+)?",
    r"^I\s+think\s+we\s+need\s+",
    r"^(?:so\s+)?for\s+the\s+\w+\s+(?:variable\s+)?(?:it's\s+)?",
    r"^I\s+want\s+to\s+run\s+",
]

FILLER_SUFFIXES = [
    r"\s+I\s+think$",
    r"\s+right$",
    r"\s+yeah$",
]


FILLER_WORDS = {
    'okay', 'ok', 'so', 'um', 'uh', 'like', 'basically', 'actually',
    'i', 'the', 'can', 'right', 'wait', 'well', 'and',
    'we', 'you', 'hmm', "let's", 'just',
    'then', "i'm", "it's", "that's",
    'should', 'would', 'could', 'maybe',
}

SELF_CORRECTION = {'wait', 'no', 'actually', 'meant', 'not'}


def is_pure_protocol(text):
    """Check if text is pure protocol format (no filler, no corrections).

    Returns True only if:
    1. Input contains "space" as separator (protocol format)
    2. Does NOT start with filler words (conversational)
    3. Does NOT contain self-correction markers
    """
    words = text.lower().split()
    if not words:
        return False

    # Must contain "space" keyword
    if 'space' not in words:
        return False

    # Must not start with filler
    if words[0] in FILLER_WORDS:
        return False

    # Must not contain self-correction patterns
    word_set = set(words)
    if word_set & SELF_CORRECTION:
        return False

    return True


def strip_filler(text):
    """Procedurally strip conversational filler from text."""
    result = text
    for pattern in FILLER_PREFIXES:
        result = re.sub(pattern, '', result, flags=re.IGNORECASE)
    for pattern in FILLER_SUFFIXES:
        result = re.sub(pattern, '', result, flags=re.IGNORECASE)
    return result.strip()


# ── LLM prompt (optimized for non-protocol input) ───────────────────────

SYSTEM_PROMPT = """You normalize voice dictation into clean protocol format for a processor.

YOUR JOB:
1. If the input already contains "space" keywords with conversational filler → strip the filler, output the protocol content VERBATIM
2. If input is natural speech without "space" keywords → normalize it:
   a) Replace synonyms: minus→dash, hyphen→dash, period→dot, forward slash→slash, asterisk→star, hashtag→hash, double dash→dash dash
   b) Insert "space" between separate arguments/tokens
   c) Do NOT insert "space" within: paths (slash-separated), dotted names (file dot txt), compound flags (dash dash verbose)
3. Resolve self-corrections (no wait, actually, I meant) → keep only the FINAL intent
4. Output ONLY protocol words — never output actual symbols like - . / @ etc.

PROTOCOL KEYWORDS (output as words):
Separator: space
Symbols: dash dot slash pipe star bang hash tilde at dollar percent caret ampersand equals plus colon semicolon underscore comma backslash quote backtick redirect append
Multi-word: dash dash, single quote, open/close paren, open/close brace, open/close bracket, less than, question mark, and and, pipe pipe, dot dot, new line
Casing: camel case, snake case, pascal case, kebab case (followed by the words to transform)
Capitalization: capital (next word), all caps (next word)
Numbers: zero through nineteen, twenty/thirty/.../ninety, hundred, thousand

Output ONLY the normalized protocol text. Nothing else."""

FEW_SHOT = [
    # Fuzzy: missing spaces, synonym replacement needed
    {
        "input": "git commit minus m quote fix login bug quote",
        "output": "git space commit space dash m space quote fix space login space bug quote"
    },
    {
        "input": "cat file period txt",
        "output": "cat space file dot txt"
    },
    {
        "input": "ls minus l minus a slash var slash log",
        "output": "ls space dash l space dash a space slash var slash log"
    },
    {
        "input": "docker run minus minus rm minus it ubuntu",
        "output": "docker space run space dash dash rm space dash it space ubuntu"
    },
    {
        "input": "cd forward slash usr forward slash local forward slash bin",
        "output": "cd space slash usr slash local slash bin"
    },
    {
        "input": "python server period py double dash port eight thousand",
        "output": "python space server dot py space dash dash port space eight thousand"
    },
    {
        "input": "git push hyphen u origin main",
        "output": "git space push space dash u space origin space main"
    },
    {
        "input": "npm install hyphen hyphen save dev eslint",
        "output": "npm space install space dash dash save dash dev space eslint"
    },
    # Casing: pass through verbatim, no spaces between words after the directive
    {
        "input": "snake case api response handler",
        "output": "snake case api response handler"
    },
    {
        "input": "camel case is authenticated",
        "output": "camel case is authenticated"
    },
    # Natural: filler around protocol content, strip filler and pass through protocol
    {
        "input": "okay so the command is git space push space dash u space origin space main",
        "output": "git space push space dash u space origin space main"
    },
    {
        "input": "can you type out docker space run space dash dash rm space nginx",
        "output": "docker space run space dash dash rm space nginx"
    },
    {
        "input": "I wanna set the variable name to camel case get user profile",
        "output": "camel case get user profile"
    },
    {
        "input": "the path should be slash usr slash local slash bin",
        "output": "slash usr slash local slash bin"
    },
    {
        "input": "um the flag is dash dash verbose",
        "output": "dash dash verbose"
    },
    {
        "input": "so for the environment variable it's all caps AWS underscore SECRET underscore ACCESS underscore KEY",
        "output": "all caps AWS underscore SECRET underscore ACCESS underscore KEY"
    },
    # Chaotic: self-corrections
    {
        "input": "dash dash no wait just dash v",
        "output": "dash v"
    },
    {
        "input": "run it on port three thousand",
        "output": "three thousand"
    },
    {
        "input": "wait no not dash dash force I meant dash dash force dash with dash lease",
        "output": "dash dash force dash with dash lease"
    },
    {
        "input": "so we need to... actually let's just do git stash",
        "output": "git space stash"
    },
]


def build_prompt(tokenizer, user_input):
    """Build the full prompt with system instructions, few-shot examples, and the user input."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    for ex in FEW_SHOT:
        messages.append({"role": "user", "content": ex["input"]})
        messages.append({"role": "assistant", "content": ex["output"]})

    messages.append({"role": "user", "content": user_input})

    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )


def llm_normalize(model, tokenizer, raw_input, max_tokens=200):
    """Use the LLM to normalize raw dictation into protocol format."""
    prompt = build_prompt(tokenizer, raw_input)
    sampler = make_sampler(temp=0.0)
    output = generate(
        model, tokenizer, prompt=prompt,
        max_tokens=max_tokens, verbose=False,
        sampler=sampler,
    )
    # Clean up: strip whitespace, remove any wrapping quotes/backticks
    result = output.strip()
    result = result.strip('`').strip('"').strip("'")
    # Remove markdown code blocks if present
    result = re.sub(r'^```\w*\n?', '', result)
    result = re.sub(r'\n?```$', '', result)
    return result.strip()


def run_pipeline(model, tokenizer, raw_input):
    """Full pipeline: detect format → normalize if needed → processor."""
    t0 = time.perf_counter()

    if is_pure_protocol(raw_input):
        # Already in protocol format — strip filler procedurally, skip LLM
        protocol_text = strip_filler(raw_input)
        used_llm = False
    else:
        # Needs LLM normalization
        protocol_text = llm_normalize(model, tokenizer, raw_input)
        used_llm = True

    t_norm = time.perf_counter()
    final_output = process_dictation(protocol_text)
    t_proc = time.perf_counter()

    return {
        'protocol': protocol_text,
        'output': final_output,
        'used_llm': used_llm,
        'norm_ms': (t_norm - t0) * 1000,
        'proc_ms': (t_proc - t_norm) * 1000,
        'total_ms': (t_proc - t0) * 1000,
    }


def main():
    parser = argparse.ArgumentParser(description='Zero-training normalizer pipeline evaluation')
    parser.add_argument('eval_file', help='Path to evaluation JSON file')
    parser.add_argument('--model', default='mlx-community/Qwen2.5-1.5B-Instruct-4bit',
                        help='MLX model to use')
    parser.add_argument('--limit', type=int, default=0,
                        help='Limit number of entries to evaluate (0 = all)')
    parser.add_argument('--show-all', action='store_true',
                        help='Show all results, not just errors')
    parser.add_argument('--show-protocol', action='store_true',
                        help='Show normalized protocol output for each entry')
    args = parser.parse_args()

    # Load model
    print(f'Loading model: {args.model}')
    model, tokenizer = load(args.model)
    print(f'Model loaded.\n')

    # Load eval data
    data = json.load(open(args.eval_file))
    if args.limit:
        data = data[:args.limit]

    n = len(data)
    exact = ws = 0
    llm_calls = 0
    errors = []
    by_difficulty = defaultdict(list)
    latencies = []

    print(f'Evaluating {n} entries from {args.eval_file}')
    print(f'Pipeline: Protocol Detect → LLM ({args.model.split("/")[-1]}) / Filler Strip → Processor')
    print('=' * 70)

    for idx, d in enumerate(data):
        result = run_pipeline(model, tokenizer, d['dictated'])
        if result['used_llm']:
            llm_calls += 1

        expected = d['expected']
        got = result['output']

        ws_got = re.sub(r'\s+', ' ', got.strip())
        ws_exp = re.sub(r'\s+', ' ', expected.strip())
        is_exact = got == expected
        is_ws = ws_got == ws_exp

        if is_exact:
            exact += 1
        if is_ws:
            ws += 1

        diff = d.get('difficulty', 'unknown')
        by_difficulty[diff].append(is_exact)
        latencies.append(result['total_ms'])

        marker = '.' if is_exact else 'x'
        sys.stdout.write(marker)
        sys.stdout.flush()
        if (idx + 1) % 50 == 0:
            sys.stdout.write(f' [{idx+1}/{n}]\n')
            sys.stdout.flush()

        if args.show_all or (args.show_protocol and not is_exact):
            llm_tag = 'LLM' if result['used_llm'] else 'SKIP'
            print(f'\n  [{diff:>7}] [{d.get("category", "")}] {"PASS" if is_exact else "FAIL"} ({llm_tag})')
            print(f'    input:    {d["dictated"][:120]}')
            if args.show_protocol:
                print(f'    protocol: {result["protocol"][:120]}')
            print(f'    expected: {expected[:100]}')
            print(f'    got:      {got[:100]}')
            print(f'    latency:  {result["total_ms"]:.0f}ms')

        if not is_exact:
            errors.append({
                'dictated': d['dictated'][:120],
                'expected': expected[:100],
                'got': got[:100],
                'protocol': result['protocol'][:120],
                'category': d.get('category', ''),
                'difficulty': diff,
                'used_llm': result['used_llm'],
                'latency_ms': result['total_ms'],
            })

    # Ensure newline after progress dots
    if n % 50 != 0:
        print(f' [{n}/{n}]')
    print()

    # ── Results ──
    print(f'NORMALIZER PIPELINE — {args.eval_file}')
    print(f'Model: {args.model}')
    print('=' * 70)
    print(f'  Exact:   {exact}/{n} ({exact/n*100:.1f}%)')
    print(f'  WS-norm: {ws}/{n} ({ws/n*100:.1f}%)')
    print(f'  LLM calls: {llm_calls}/{n} ({llm_calls/n*100:.0f}% needed LLM)')
    print()

    if len(by_difficulty) > 1 or 'unknown' not in by_difficulty:
        print('BY DIFFICULTY:')
        for diff in ['clean', 'fuzzy', 'natural', 'chaotic', 'unknown']:
            if diff in by_difficulty:
                results = by_difficulty[diff]
                ex = sum(results)
                tot = len(results)
                print(f'  {diff:>10}: {ex}/{tot} ({ex/tot*100:.0f}%)')
        print()

    avg_lat = sum(latencies) / len(latencies) if latencies else 0
    p50 = sorted(latencies)[len(latencies) // 2] if latencies else 0
    p95 = sorted(latencies)[int(len(latencies) * 0.95)] if latencies else 0
    print(f'LATENCY:')
    print(f'  avg: {avg_lat:.0f}ms  p50: {p50:.0f}ms  p95: {p95:.0f}ms')
    print()

    print(f'ERRORS ({len(errors)}, showing first 25):')
    print('-' * 70)
    for e in errors[:25]:
        llm_tag = 'LLM' if e['used_llm'] else 'SKIP'
        print(f'  [{e["difficulty"]:>7}] [{e["category"]}] ({llm_tag})')
        print(f'    input:    {e["dictated"]}')
        print(f'    protocol: {e["protocol"]}')
        print(f'    expected: {e["expected"]}')
        print(f'    got:      {e["got"]}')
        print()


if __name__ == '__main__':
    main()