"""
Data Preprocessing for Memory Routing Training

This script converts synthetic JSONL conversations to Tinker-compatible
types.Datum objects for supervised fine-tuning.

Per Tinker docs (rendering.mdx):
- Use renderer.build_supervised_example() to get tokens and weights
- Weights indicate which tokens to train on (1.0 for completion, 0.0 for prompt)
- Target tokens are shifted by 1 (predicting next token)

Per PRD Section 6.6:
- Validate datum length <= 4096
- Ensure non-zero weights
- Verify token IDs are within vocab range
"""

import json
import os
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass

# Note: These imports require tinker and tinker-cookbook to be installed
# pip install git+https://github.com/thinking-machines-lab/tinker.git
# pip install git+https://github.com/thinking-machines-lab/tinker-cookbook.git

MODEL_NAME = "meta-llama/Llama-3.1-8B"
RENDERER_NAME = "llama3"
MAX_SEQUENCE_LENGTH = 4096

# Memory taxonomy for validation
VALID_CATEGORIES = {
    "company.brand_core",
    "company.strategic_signatures", 
    "company.knowledge_artifacts",
    "company.business_priorities",
    "company.tools_config",
    "company.performance_context",
    "user.communication_style",
    "user.strategic_approach",
    "user.role_context",
    "user.workflow_patterns",
    "user.session_history",
    "user.interaction_preferences",
    "none"
}

@dataclass
class PreprocessingStats:
    total_examples: int = 0
    valid_examples: int = 0
    skipped_too_long: int = 0
    skipped_zero_weights: int = 0
    skipped_invalid_tokens: int = 0
    skipped_invalid_categories: int = 0


def build_routing_prompt(conversation: List[Dict[str, str]], categories: List[str]) -> List[Dict[str, str]]:
    """
    Build the full conversation for training, including:
    1. System prompt with taxonomy
    2. User message with conversation
    3. Assistant response with categories
    
    Per PRD Section 6 - Student Prompt format.
    """
    # System prompt with taxonomy
    system_content = """You route marketing conversations into structured memory categories.

Available categories:
- company.brand_core: Voice, values, positioning, identity anchors (Long >1y)
- company.strategic_signatures: Decision frameworks, strategic heuristics (Long >1y)
- company.knowledge_artifacts: Docs, style guides, playbooks (Long >1y)
- company.business_priorities: Quarterly/seasonal goals, active campaigns (Short <3m)
- company.tools_config: Integrations, API keys, workflow settings (Medium ~6m)
- company.performance_context: Campaign metrics, retrospectives, learnings (Rolling ~6m)
- user.communication_style: Tone, verbosity, format expectations (Long >1y)
- user.strategic_approach: Personal priorities, success definitions (Long >1y)
- user.role_context: Title, scope, decision authority (Medium ~1y)
- user.workflow_patterns: Review cadence, collaboration norms (Medium ~1y)
- user.session_history: Immediate context, recent asks (Short <2w)
- user.interaction_preferences: Coaching style, feedback expectations (Evolving)
- none: Irrelevant, vague, or transactional content

Respond with comma-separated categories. Use 'none' only if no other category applies."""

    # Format the conversation for the user message
    conversation_text = ""
    for turn in conversation:
        # Handle malformed turns (string instead of dict)
        if isinstance(turn, str):
            conversation_text += f"UNKNOWN: {turn}\n"
            continue
        if not isinstance(turn, dict):
            continue
        role = turn.get("role", "unknown")
        content = turn.get("content", "")
        conversation_text += f"{role.upper()}: {content}\n"
    
    user_content = f"Conversation:\n{conversation_text.strip()}\n\nWhat memory categories apply?"
    
    # Assistant response is the comma-separated categories
    assistant_content = ", ".join(categories)
    
    return [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content}
    ]
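
# Example (hypothetical conversation, shown for illustration; the assistant
# turn is just the comma-separated label string):
#   msgs = build_routing_prompt(
#       [{"role": "user", "content": "Our brand voice is playful but precise."}],
#       ["company.brand_core", "user.communication_style"],
#   )
#   msgs[2]["content"]  # -> "company.brand_core, user.communication_style"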


def load_synthetic_data(filepath: str) -> List[Dict[str, Any]]:
    """Load synthetic data from JSONL file."""
    data = []
    with open(filepath, "r") as f:
        for line in f:
            if line.strip():
                item = json.loads(line)
                data.append(item)
    return data
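
# Assumed shape of one JSONL record, inferred from the field accesses in the
# preprocessing functions below; adjust if your generator emits a different
# schema:
#   {"scenario_id": "s-0001",
#    "conversation": [{"role": "user", "content": "..."},
#                     {"role": "assistant", "content": "..."}],
#    "labels": {"categories": ["company.brand_core"]}}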


def validate_categories(categories: List[str]) -> bool:
    """Validate that all categories are in the taxonomy."""
    return all(cat in VALID_CATEGORIES for cat in categories)


def preprocess_example_mock(example: Dict[str, Any], stats: PreprocessingStats) -> Optional[Dict[str, Any]]:
    """
    Mock preprocessing that validates structure without Tinker.
    Returns a dict representation of what would become a Datum.
    
    Use this for testing without Tinker installed.
    """
    conversation = example.get("conversation", [])
    labels = example.get("labels", {})
    categories = labels.get("categories", [])
    
    # Validate categories
    if not validate_categories(categories):
        stats.skipped_invalid_categories += 1
        return None
    
    # Build the full training conversation
    training_messages = build_routing_prompt(conversation, categories)
    
    # Rough token estimate (~4 characters per token); ignores chat-template overhead
    total_chars = sum(len(m["content"]) for m in training_messages)
    estimated_tokens = total_chars // 4
    
    if estimated_tokens > MAX_SEQUENCE_LENGTH:
        stats.skipped_too_long += 1
        return None
    
    stats.valid_examples += 1
    
    return {
        "messages": training_messages,
        "categories": categories,
        "estimated_tokens": estimated_tokens,
        "scenario_id": example.get("scenario_id", "unknown")
    }


def preprocess_with_tinker(example: Dict[str, Any], renderer, vocab_size: int, stats: PreprocessingStats):
    """
    Full preprocessing with Tinker renderer.
    
    Per Tinker docs (rendering.mdx):
    - build_supervised_example returns (tokens, weights)
    - weights=1.0 for completion tokens, weights=0.0 for prompt tokens
    
    Per Tinker docs (training-sampling.mdx):
    - input_tokens = tokens[:-1]
    - target_tokens = tokens[1:]  # Shifted for next-token prediction
    - weights = weights[1:]
    """
    from tinker import types
    
    conversation = example.get("conversation", [])
    labels = example.get("labels", {})
    categories = labels.get("categories", [])
    
    # Validate categories
    if not validate_categories(categories):
        stats.skipped_invalid_categories += 1
        return None
    
    # Build the full training conversation
    training_messages = build_routing_prompt(conversation, categories)
    
    # Use renderer to tokenize and get weights
    # Per Tinker rendering.mdx: build_supervised_example returns tokens and weights
    tokens, weights = renderer.build_supervised_example(training_messages)
    
    # Check sequence length
    if len(tokens) > MAX_SEQUENCE_LENGTH:
        stats.skipped_too_long += 1
        return None
    
    # Prepare for next-token prediction
    # Per Tinker training-sampling.mdx example
    input_tokens = tokens[:-1]
    target_tokens = tokens[1:]
    loss_weights = weights[1:]
    
    # Validate non-zero weights
    if sum(loss_weights) == 0:
        stats.skipped_zero_weights += 1
        return None
    
    # Validate token IDs
    if not all(0 <= t < vocab_size for t in target_tokens):
        stats.skipped_invalid_tokens += 1
        return None
    
    # Create Datum object
    # Per Tinker types (Datum class)
    datum = types.Datum(
        model_input=types.ModelInput.from_ints(input_tokens),
        loss_fn_inputs=dict(
            target_tokens=target_tokens,
            weights=loss_weights
        )
    )
    
    stats.valid_examples += 1
    return datum
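
# Hedged sketch of how the resulting Datum list might be consumed downstream.
# The client and method names (ServiceClient, create_lora_training_client,
# forward_backward, optim_step) follow Tinker's documented training loop but
# are assumptions here, not part of this script; verify against current docs.
#
#   import tinker
#   from tinker import types
#
#   service_client = tinker.ServiceClient()
#   training_client = service_client.create_lora_training_client(base_model=MODEL_NAME)
#   fb = training_client.forward_backward(datums, loss_fn="cross_entropy")
#   opt = training_client.optim_step(types.AdamParams(learning_rate=1e-4))
#   fb.result(); opt.result()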


def preprocess_dataset(
    input_path: str,
    output_dir: str,
    use_tinker: bool = False,
    train_split: float = 0.8
) -> Tuple[PreprocessingStats, str, str]:
    """
    Preprocess the full dataset.
    
    Args:
        input_path: Path to training_dataset_1000.jsonl
        output_dir: Directory to save processed data
        use_tinker: Whether to use actual Tinker (requires installation)
        train_split: Fraction for training (rest is test)
    
    Returns:
        stats, train_path, test_path
    """
    os.makedirs(output_dir, exist_ok=True)
    
    # Load data
    print(f"Loading data from {input_path}...")
    raw_data = load_synthetic_data(input_path)
    print(f"Loaded {len(raw_data)} examples")
    
    stats = PreprocessingStats(total_examples=len(raw_data))
    
    if use_tinker:
        # Import Tinker components
        from tinker_cookbook import renderers, tokenizer_utils
        
        print(f"Initializing tokenizer for {MODEL_NAME}...")
        tokenizer = tokenizer_utils.get_tokenizer(MODEL_NAME)
        renderer = renderers.get_renderer(name=RENDERER_NAME, tokenizer=tokenizer)
        vocab_size = len(tokenizer)
        print(f"Vocab size: {vocab_size}")
        
        processed_data = []
        for i, example in enumerate(raw_data):
            if i % 100 == 0:
                print(f"Processing {i}/{len(raw_data)}...")
            datum = preprocess_with_tinker(example, renderer, vocab_size, stats)
            if datum is not None:
                processed_data.append(datum)
    else:
        # Mock preprocessing for testing
        print("Running mock preprocessing (no Tinker)...")
        processed_data = []
        for i, example in enumerate(raw_data):
            if i % 100 == 0:
                print(f"Processing {i}/{len(raw_data)}...")
            result = preprocess_example_mock(example, stats)
            if result is not None:
                processed_data.append(result)
    
    # Sequential train/test split; assumes the input JSONL is already shuffled
    # (e.g. by the synthetic data generator), otherwise the split may be skewed
    split_idx = int(len(processed_data) * train_split)
    train_data = processed_data[:split_idx]
    test_data = processed_data[split_idx:]
    
    # Save processed data
    train_path = os.path.join(output_dir, "train_data.json")
    test_path = os.path.join(output_dir, "test_data.json")
    
    # Datum objects serialize via model_dump(); mock-mode results are plain dicts
    with open(train_path, "w") as f:
        json.dump([d if isinstance(d, dict) else d.model_dump() for d in train_data], f)
    
    with open(test_path, "w") as f:
        json.dump([d if isinstance(d, dict) else d.model_dump() for d in test_data], f)
    
    print(f"\n=== Preprocessing Complete ===")
    print(f"Total examples: {stats.total_examples}")
    print(f"Valid examples: {stats.valid_examples}")
    print(f"Skipped (too long): {stats.skipped_too_long}")
    print(f"Skipped (zero weights): {stats.skipped_zero_weights}")
    print(f"Skipped (invalid tokens): {stats.skipped_invalid_tokens}")
    print(f"Skipped (invalid categories): {stats.skipped_invalid_categories}")
    print(f"\nTrain set: {len(train_data)} examples")
    print(f"Test set: {len(test_data)} examples")
    print(f"\nSaved to:")
    print(f"  Train: {train_path}")
    print(f"  Test: {test_path}")
    
    return stats, train_path, test_path


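# Example invocations (script name is illustrative):
#   python preprocess.py                           # mock mode, default paths
#   python preprocess.py data.jsonl out/ --tinker  # full Tinker preprocessing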
if __name__ == "__main__":
    import sys
    
    # Filter out flags so "--tinker" is never mistaken for a positional path
    args = [a for a in sys.argv[1:] if not a.startswith("--")]
    input_path = args[0] if len(args) > 0 else "synthetic_data/training_dataset_1000.jsonl"
    output_dir = args[1] if len(args) > 1 else "training/processed_data"
    use_tinker = "--tinker" in sys.argv
    
    preprocess_dataset(input_path, output_dir, use_tinker=use_tinker)