MuratcanKoylan commited on
Commit
685d968
·
verified ·
1 Parent(s): ec5d796

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +48 -0
  3. MODEL_CARD.md +128 -0
  4. README.md +274 -3
  5. assets/difficulty_comparison.png +0 -0
  6. assets/generate_charts.py +207 -0
  7. assets/model_comparison.png +0 -0
  8. assets/rl_components.png +3 -0
  9. assets/rl_reward.png +0 -0
  10. assets/sft_loss.png +0 -0
  11. docs/PRD.md +498 -0
  12. docs/Synthetic-Data-Gen/synthetic_data.md +784 -0
  13. docs/tinker_docs.md +0 -0
  14. huggingface/README.md +140 -0
  15. huggingface/upload_to_hf.py +220 -0
  16. requirements.txt +14 -0
  17. synthetic_data/README.md +58 -0
  18. synthetic_data/all_generated_data_1000.jsonl +0 -0
  19. synthetic_data/balanced_async_log.txt +67 -0
  20. synthetic_data/balanced_generation_log.txt +0 -0
  21. synthetic_data/balanced_generation_log_20251124_184530.txt +0 -0
  22. synthetic_data/clean_batch.py +38 -0
  23. synthetic_data/clean_data.py +72 -0
  24. synthetic_data/debug_key.py +28 -0
  25. synthetic_data/debug_key_raw.py +13 -0
  26. synthetic_data/diverse_dataset_20251124_192207.jsonl +0 -0
  27. synthetic_data/diverse_generation_log.txt +9 -0
  28. synthetic_data/diverse_log.txt +0 -0
  29. synthetic_data/generate_sample.py +15 -0
  30. synthetic_data/generation_log_100.txt +702 -0
  31. synthetic_data/generation_log_100_v2.txt +0 -0
  32. synthetic_data/generation_log_async.txt +0 -0
  33. synthetic_data/generation_log_final.txt +0 -0
  34. synthetic_data/merged_training_dataset_2001.jsonl +0 -0
  35. synthetic_data/pipeline.py +311 -0
  36. synthetic_data/quick_test_diverse.py +62 -0
  37. synthetic_data/run_balanced_async.py +259 -0
  38. synthetic_data/run_balanced_generation.py +391 -0
  39. synthetic_data/run_batch.py +116 -0
  40. synthetic_data/run_batch_async.py +198 -0
  41. synthetic_data/run_diverse_generation.py +310 -0
  42. synthetic_data/sample_batch.json +841 -0
  43. synthetic_data/test_balanced.py +81 -0
  44. synthetic_data/test_connection.py +40 -0
  45. synthetic_data/test_diverse.py +58 -0
  46. synthetic_data/test_pipeline.py +100 -0
  47. synthetic_data/training_dataset_1000.jsonl +0 -0
  48. synthetic_data/validate.py +85 -0
  49. synthetic_data/verify_key.py +15 -0
  50. training/benchmark.py +328 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/rl_components.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Virtual environment
2
+ venv/
3
+ .venv/
4
+ env/
5
+
6
+ # Environment variables
7
+ .env
8
+ .env.local
9
+
10
+ # Python
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+ *.so
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+
32
+ # IDE
33
+ .idea/
34
+ .vscode/
35
+ *.swp
36
+ *.swo
37
+ .DS_Store
38
+
39
+ # Jupyter
40
+ .ipynb_checkpoints/
41
+
42
+ # Logs (keep structure, ignore large files)
43
+ training/logs/**/nohup.out
44
+
45
+ # Temporary files
46
+ *.tmp
47
+ *.temp
48
+
MODEL_CARD.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card: Memory Routing Agent (Llama-8B + LoRA)
2
+
3
+ ## Model Details
4
+
5
+ - **Model Name**: memory-routing-llama-8b-lora
6
+ - **Base Model**: meta-llama/Llama-3.1-8B
7
+ - **Architecture**: LoRA (Low-Rank Adaptation), rank 32
8
+ - **Training Platform**: Tinker (Thinking Machines)
9
+ - **Training Method**: SFT (Supervised Fine-Tuning) + RL (Reinforcement Learning)
10
+ - **Parameters**: ~8B base + ~100M LoRA adapters
11
+ - **License**: Apache 2.0
12
+
13
+ ## Intended Use
14
+
15
+ This model classifies marketing conversations into memory categories for AI assistant systems. It determines which pieces of information from a conversation should be stored in long-term memory and how they should be categorized.
16
+
17
+ ### Primary Use Cases
18
+ - Marketing AI assistants that need to remember user preferences
19
+ - CRM systems that extract structured data from conversations
20
+ - Knowledge management systems for marketing teams
21
+
22
+ ### Out-of-Scope Uses
23
+ - General-purpose chatbots
24
+ - Non-marketing domains (healthcare, legal, finance)
25
+ - Real-time conversation generation
26
+
27
+ ## Training Data
28
+
29
+ ### Synthetic Dataset
30
+ - **Size**: 2,001 conversations
31
+ - **Generation**: Cohere Command-R-Plus (104B) as teacher model
32
+ - **Format**: Multi-turn marketing conversations with category labels
33
+
34
+ ### Category Taxonomy (13 categories)
35
+ | Category | Description | Persistence |
36
+ |----------|-------------|-------------|
37
+ | company.brand_core | Voice, values, positioning | Long (>1y) |
38
+ | company.strategic_signatures | Decision frameworks | Long (>1y) |
39
+ | company.knowledge_artifacts | Docs, style guides | Long (>1y) |
40
+ | company.business_priorities | Quarterly goals | Short (<3m) |
41
+ | company.tools_config | Integrations, APIs | Medium (~6m) |
42
+ | company.performance_context | Campaign metrics | Rolling (~6m) |
43
+ | user.communication_style | Tone, format preferences | Long (>1y) |
44
+ | user.strategic_approach | Personal priorities | Long (>1y) |
45
+ | user.role_context | Title, scope | Medium (~1y) |
46
+ | user.workflow_patterns | Review cadence | Medium (~1y) |
47
+ | user.session_history | Immediate context | Short (<2w) |
48
+ | user.interaction_preferences | Coaching style | Evolving |
49
+ | none | Irrelevant content | N/A |
50
+
51
+ ## Training Procedure
52
+
53
+ ### Phase 1: Supervised Fine-Tuning (SFT)
54
+ - **Steps**: 100
55
+ - **Batch Size**: 128
56
+ - **Learning Rate**: 2.86e-4 (Tinker default for Llama-8B)
57
+ - **Optimizer**: Adam (β1=0.9, β2=0.95)
58
+ - **Loss Function**: Cross-entropy
59
+
60
+ ### Phase 2: Reinforcement Learning (RL)
61
+ - **Iterations**: 12
62
+ - **Groups per Batch**: 64
63
+ - **Group Size**: 32
64
+ - **Learning Rate**: 2e-5
65
+ - **Loss Function**: Importance sampling policy gradient
66
+ - **Reward Function**:
67
+ - R_F1 (60%): F1 score vs gold labels
68
+ - R_temp (20%): Temporal alignment
69
+ - R_parity (10%): Company/user scope
70
+ - R_eff (10%): Storage efficiency
71
+
72
+ ## Evaluation Results
73
+
74
+ ### Marketing Routing Benchmark (50 scenarios)
75
+
76
+ | Model | Any Match | Exact Match | Avg F1 |
77
+ |-------|-----------|-------------|--------|
78
+ | **Ours (8B + LoRA)** | 72% | **60%** | **0.68** |
79
+ | Cohere Command-R-Plus (104B) | 82% | 26% | 0.61 |
80
+
81
+ ### Key Findings
82
+ - **11.1% higher F1** than the 104B teacher model
83
+ - **2.3x better exact match** accuracy
84
+ - **13x smaller** than the teacher model
85
+ - Excels at single-category classification (86% exact on easy cases)
86
+ - Struggles with multi-label scenarios (10% exact on hard cases)
87
+
88
+ ### Performance by Difficulty
89
+ | Difficulty | Our Model (F1) | Cohere (F1) | Delta |
90
+ |------------|----------------|-------------|-------|
91
+ | Easy | 0.86 | 0.48 | +79% |
92
+ | Medium | 0.65 | 0.64 | +2% |
93
+ | Hard | 0.50 | 0.72 | -31% |
94
+
95
+ ## Limitations
96
+
97
+ 1. **Multi-label Detection**: Under-predicts when multiple categories apply
98
+ 2. **Company vs User Confusion**: Sometimes confuses `company.strategic_signatures` with `user.strategic_approach`
99
+ 3. **Hard Cases**: Performance drops on complex overlapping categories
100
+ 4. **Domain Specificity**: Trained only on marketing scenarios
101
+
102
+ ## Ethical Considerations
103
+
104
+ - Model trained on synthetic data; may not capture all real-world edge cases
105
+ - Should be used with human oversight for critical decisions
106
+ - Privacy: Does not store or transmit conversation data
107
+
108
+ ## Citation
109
+
110
+ ```bibtex
111
+ @misc{memory-routing-agent-2025,
112
+ title={Memory Routing Agent: Prompt Distillation for Marketing AI},
113
+ author={Muratcan Koylan},
114
+ year={2025},
115
+ howpublished={\url{https://github.com/muratcankoylan/memory-routing-agent}},
116
+ }
117
+ ```
118
+
119
+ ## Model Files
120
+
121
+ - `training/checkpoints/rl_iter_012/` - Final RL checkpoint
122
+ - `training/benchmarks/marketing_routing_benchmark.json` - Benchmark dataset
123
+ - `synthetic_data/merged_training_dataset_2001.jsonl` - Training data
124
+
125
+ ## Contact
126
+
127
+ For questions or issues, please open a GitHub issue.
128
+
README.md CHANGED
@@ -1,3 +1,274 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Memory Routing Agent
2
+
3
+ A specialized 8B parameter model that **outperforms 104B models** on marketing conversation classification.
4
+
5
+ ## Key Results
6
+
7
+ ![Model Comparison](assets/model_comparison.png)
8
+
9
+ | Metric | Our Model (8B) | Cohere (104B) |
10
+ |--------|----------------|---------------|
11
+ | **Avg F1** | **0.68** | 0.61 |
12
+ | Exact Match | **60%** | 26% |
13
+ | Model Size | 8B | 104B |
14
+ | **Improvement** | **+11.1% F1** | baseline |
15
+
16
+ Our 8B model achieves an **11.1% higher F1 score** than the 104B teacher model that generated its training data, while being **13x smaller**.
17
+
18
+ ## Training Results
19
+
20
+ ### Phase 1: Supervised Fine-Tuning (SFT)
21
+
22
+ ![SFT Loss](assets/sft_loss.png)
23
+
24
+ - **100 training steps** on 2,001 synthetic conversations
25
+ - Loss dropped from **5.47 → 0.26** (95% reduction)
26
+ - Best test loss: **0.105** at step 90
27
+
28
+ ### Phase 2: Reinforcement Learning (RL)
29
+
30
+ ![RL Reward](assets/rl_reward.png)
31
+
32
+ - **30 RL iterations** with importance sampling policy gradient
33
+ - Mean reward improved from **0.73 → 0.93** (+27%)
34
+ - Accuracy maintained at **99.9%+** throughout
35
+
36
+ ### Reward Components
37
+
38
+ ![RL Components](assets/rl_components.png)
39
+
40
+ | Component | Start | End | Description |
41
+ |-----------|-------|-----|-------------|
42
+ | R_F1 | 0.64 | 0.90 | F1 score vs gold labels |
43
+ | R_temp | 0.81 | 0.95 | Temporal alignment |
44
+ | R_parity | 0.86 | 1.00 | Company/user scope |
45
+ | R_eff | 1.00 | 1.00 | Storage efficiency |
46
+
47
+ ## Performance by Difficulty
48
+
49
+ ![Difficulty Comparison](assets/difficulty_comparison.png)
50
+
51
+ | Difficulty | Our Model | Cohere (104B) | Winner |
52
+ |------------|-----------|---------------|--------|
53
+ | Easy | **0.86** | 0.48 | Ours (+79%) |
54
+ | Medium | **0.65** | 0.64 | Ours (+2%) |
55
+ | Hard | 0.50 | **0.72** | Cohere |
56
+
57
+ Our model excels at clear-cut cases, but the larger model handles ambiguous multi-label scenarios better.
58
+
59
+ ## What It Does
60
+
61
+ The Memory Routing Agent classifies marketing conversations into 13 categories to determine what information should be stored in an AI assistant's long-term memory:
62
+
63
+ - **Company categories**: brand_core, strategic_signatures, knowledge_artifacts, business_priorities, tools_config, performance_context
64
+ - **User categories**: communication_style, strategic_approach, role_context, workflow_patterns, session_history, interaction_preferences
65
+ - **None**: Transactional or irrelevant content
66
+
67
+ ## Training Pipeline
68
+
69
+ ```
70
+ ┌─────────────────────────────────────────────────────────────────┐
71
+ │ TRAINING PIPELINE │
72
+ ├─────────────────────────────────────────────────────────────────┤
73
+ │ │
74
+ │ 1. SYNTHETIC DATA GENERATION │
75
+ │ ├── Cohere Command-R-Plus (104B) as teacher │
76
+ │ ├── 2,001 marketing conversations │
77
+ │ └── 13 category labels + persistence horizons │
78
+ │ │
79
+ │ 2. SUPERVISED FINE-TUNING (SFT) │
80
+ │ ├── Base: meta-llama/Llama-3.1-8B │
81
+ │ ├── LoRA rank 32 │
82
+ │ ├── 100 steps, batch size 128 │
83
+ │ └── Cross-entropy loss │
84
+ │ │
85
+ │ 3. REINFORCEMENT LEARNING (RL) │
86
+ │ ├── 30 iterations, 64 groups × 32 samples │
87
+ │ ├── Importance sampling policy gradient │
88
+ │ └── Composite reward: F1 + temporal + parity + efficiency │
89
+ │ │
90
+ └─────────────────────────────────────────────────────────────────┘
91
+ ```
92
+
93
+ ## Quick Start
94
+
95
+ ### Installation
96
+
97
+ ```bash
98
+ # Clone the repository
99
+ git clone https://github.com/muratcankoylan/memory-routing-agent.git
100
+ cd memory-routing-agent
101
+
102
+ # Create virtual environment
103
+ python -m venv venv
104
+ source venv/bin/activate
105
+
106
+ # Install dependencies
107
+ pip install tinker-toolkit python-dotenv cohere
108
+ pip install -e ".[envs]"
109
+ ```
110
+
111
+ ### Environment Setup
112
+
113
+ ```bash
114
+ # Create .env file
115
+ echo "TINKER_API_KEY=your_tinker_key" >> .env
116
+ echo "COHERE_API_KEY=your_cohere_key" >> .env
117
+ echo "HF_TOKEN=your_huggingface_token" >> .env
118
+ ```
119
+
120
+ ### Run Inference
121
+
122
+ ```python
123
+ import tinker
124
+ from tinker import types
125
+ from tinker_cookbook import renderers
126
+ from tinker_cookbook.tokenizer_utils import get_tokenizer
127
+
128
+ # Load model
129
+ service_client = tinker.ServiceClient()
130
+ checkpoint = "tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012"
131
+ sampling_client = service_client.create_sampling_client(model_path=checkpoint)
132
+
133
+ # Setup tokenizer and renderer
134
+ tokenizer = get_tokenizer("meta-llama/Llama-3.1-8B")
135
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
136
+
137
+ # Classify a conversation
138
+ conversation = """
139
+ USER: Our brand voice is professional but approachable. Think Harvard Business Review meets Slack.
140
+ ASSISTANT: So authoritative content with a conversational tone?
141
+ USER: Exactly. We never use jargon without explaining it first.
142
+ """
143
+
144
+ messages = [
145
+ {"role": "system", "content": "You route marketing conversations into structured memory categories..."},
146
+ {"role": "user", "content": f"Analyze this conversation:\n\n{conversation}"}
147
+ ]
148
+
149
+ prompt = renderer.build_generation_prompt(messages)
150
+ params = types.SamplingParams(max_tokens=100, temperature=0.1, stop=renderer.get_stop_sequences())
151
+ result = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1).result()
152
+
153
+ response, _ = renderer.parse_response(result.sequences[0].tokens)
154
+ print(f"Categories: {response['content']}")
155
+ # Output: company.brand_core
156
+ ```
157
+
158
+ ## Project Structure
159
+
160
+ ```
161
+ memory-routing-agent/
162
+ ├── assets/ # Training visualizations
163
+ │ ├── sft_loss.png
164
+ │ ├── rl_reward.png
165
+ │ ├── rl_components.png
166
+ │ ├── model_comparison.png
167
+ │ └── difficulty_comparison.png
168
+ ├── synthetic_data/ # Data generation pipeline
169
+ │ ├── pipeline.py # Cohere-based conversation generator
170
+ │ ├── run_diverse_generation.py
171
+ │ └── merged_training_dataset_2001.jsonl
172
+ ├── training/ # Training scripts
173
+ │ ├── train_v2.py # Main training script (SFT + RL)
174
+ │ ├── preprocess.py # Data preprocessing
175
+ │ ├── rl_env.py # RL environment and reward function
176
+ │ ├── final_benchmark.py # Benchmark evaluation
177
+ │ ├── logs/ # Training logs (JSONL)
178
+ │ └── benchmarks/ # Benchmark results
179
+ ├── huggingface/ # HuggingFace upload scripts
180
+ ├── docs/ # Documentation
181
+ │ ├── PRD.md # Product requirements
182
+ │ └── tinker_docs.md # Tinker reference
183
+ ├── MODEL_CARD.md # Model card
184
+ └── README.md # This file
185
+ ```
186
+
187
+ ## Benchmark
188
+
189
+ The Marketing Routing Benchmark contains 50 challenging scenarios across 7 domains:
190
+
191
+ | Domain | Scenarios | Description |
192
+ |--------|-----------|-------------|
193
+ | Brand & Positioning | 8 | Brand voice, values, identity |
194
+ | Strategic Decisions | 8 | Decision frameworks, heuristics |
195
+ | Performance & Metrics | 8 | Campaign metrics, learnings |
196
+ | Tools & Integrations | 6 | Tech stack, APIs |
197
+ | User Preferences | 10 | Communication style, workflow |
198
+ | Business Priorities | 6 | Goals, focus areas |
199
+ | Knowledge Artifacts | 4 | Docs, playbooks, templates |
200
+
201
+ ### Run Benchmark
202
+
203
+ ```bash
204
+ python training/final_benchmark.py
205
+ ```
206
+
207
+ ## Training Your Own Model
208
+
209
+ ### 1. Generate Synthetic Data
210
+
211
+ ```bash
212
+ cd synthetic_data
213
+ python run_diverse_generation.py --num_items 1000
214
+ ```
215
+
216
+ ### 2. Preprocess Data
217
+
218
+ ```bash
219
+ python training/prepare_data.py
220
+ ```
221
+
222
+ ### 3. Run Training
223
+
224
+ ```bash
225
+ python training/train_v2.py
226
+ ```
227
+
228
+ ### 4. Evaluate
229
+
230
+ ```bash
231
+ python training/final_benchmark.py
232
+ ```
233
+
234
+ ## Reward Function
235
+
236
+ The RL phase uses a composite reward:
237
+
238
+ ```
239
+ R_total = 0.6 × R_F1 + 0.2 × R_temp + 0.1 × R_parity + 0.1 × R_eff
240
+ ```
241
+
242
+ | Component | Weight | Description |
243
+ |-----------|--------|-------------|
244
+ | R_F1 | 60% | F1 score vs gold labels |
245
+ | R_temp | 20% | Persistence horizon alignment |
246
+ | R_parity | 10% | Company/user scope correctness |
247
+ | R_eff | 10% | Storage efficiency (≤3 categories) |
248
+
249
+ ## Limitations
250
+
251
+ - **Multi-label**: Under-predicts when multiple categories apply
252
+ - **Overlap**: Struggles with company/user category overlap
253
+ - **Domain**: Marketing-specific; not tested on other domains
254
+
255
+ ## Citation
256
+
257
+ ```bibtex
258
+ @misc{memory-routing-agent-2025,
259
+ title={Memory Routing Agent: Prompt Distillation for Marketing AI},
260
+ author={Muratcan Koylan},
261
+ year={2025},
262
+ howpublished={\url{https://github.com/muratcankoylan/memory-routing-agent}},
263
+ }
264
+ ```
265
+
266
+ ## License
267
+
268
+ Apache 2.0
269
+
270
+ ## Acknowledgments
271
+
272
+ - [Thinking Machines](https://thinkingmachines.ai/) for Tinker training platform
273
+ - [Cohere](https://cohere.com/) for Command-R-Plus teacher model
274
+ - [Meta](https://ai.meta.com/) for Llama 3.1 base model
assets/difficulty_comparison.png ADDED
assets/generate_charts.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate training visualization charts for README
3
+ """
4
+ import json
5
+ import matplotlib.pyplot as plt
6
+ import matplotlib
7
+ matplotlib.use('Agg')
8
+ import numpy as np
9
+ import os
10
+
11
+ # Create assets directory
12
+ os.makedirs('assets', exist_ok=True)
13
+
14
+ # Load SFT metrics
15
+ sft_metrics = []
16
+ with open('training/logs/run_20251124_200256/sft_metrics.jsonl', 'r') as f:
17
+ for line in f:
18
+ sft_metrics.append(json.loads(line))
19
+
20
+ # Load RL metrics
21
+ rl_metrics = []
22
+ with open('training/logs/run_20251124_200256/rl_metrics.jsonl', 'r') as f:
23
+ for line in f:
24
+ rl_metrics.append(json.loads(line))
25
+
26
+ # Style settings
27
+ plt.style.use('seaborn-v0_8-whitegrid')
28
+ colors = {
29
+ 'train': '#2563eb',
30
+ 'test': '#dc2626',
31
+ 'reward': '#059669',
32
+ 'f1': '#7c3aed',
33
+ 'our_model': '#2563eb',
34
+ 'cohere': '#dc2626'
35
+ }
36
+
37
+ # ============ Chart 1: SFT Loss Curve ============
38
+ fig, ax = plt.subplots(figsize=(10, 5))
39
+
40
+ steps = [m['step'] for m in sft_metrics]
41
+ train_loss = [m['train_loss'] for m in sft_metrics]
42
+
43
+ # Get test loss points
44
+ test_steps = [m['step'] for m in sft_metrics if 'test_loss' in m]
45
+ test_loss = [m['test_loss'] for m in sft_metrics if 'test_loss' in m]
46
+
47
+ ax.plot(steps, train_loss, color=colors['train'], linewidth=2, label='Train Loss', alpha=0.8)
48
+ ax.scatter(test_steps, test_loss, color=colors['test'], s=80, zorder=5, label='Test Loss', marker='o')
49
+ ax.plot(test_steps, test_loss, color=colors['test'], linewidth=2, linestyle='--', alpha=0.5)
50
+
51
+ ax.set_xlabel('Training Step', fontsize=12)
52
+ ax.set_ylabel('Loss', fontsize=12)
53
+ ax.set_title('SFT Training: Loss Convergence', fontsize=14, fontweight='bold')
54
+ ax.legend(loc='upper right', fontsize=10)
55
+ ax.set_ylim(0, 6)
56
+
57
+ # Add annotations
58
+ ax.annotate(f'Final: {train_loss[-1]:.3f}', xy=(steps[-1], train_loss[-1]),
59
+ xytext=(steps[-1]-15, train_loss[-1]+0.5),
60
+ fontsize=9, color=colors['train'])
61
+ ax.annotate(f'Best Test: {min(test_loss):.3f}', xy=(test_steps[test_loss.index(min(test_loss))], min(test_loss)),
62
+ xytext=(test_steps[test_loss.index(min(test_loss))]+5, min(test_loss)+0.3),
63
+ fontsize=9, color=colors['test'])
64
+
65
+ plt.tight_layout()
66
+ plt.savefig('assets/sft_loss.png', dpi=150, bbox_inches='tight')
67
+ plt.close()
68
+ print("Saved: assets/sft_loss.png")
69
+
70
+ # ============ Chart 2: RL Reward Progression ============
71
+ fig, ax = plt.subplots(figsize=(10, 5))
72
+
73
+ iterations = [m['iteration'] for m in rl_metrics]
74
+ mean_reward = [m['mean_reward'] for m in rl_metrics]
75
+ std_reward = [m['std_reward'] for m in rl_metrics]
76
+
77
+ # Plot with confidence band
78
+ ax.fill_between(iterations,
79
+ [r - s for r, s in zip(mean_reward, std_reward)],
80
+ [r + s for r, s in zip(mean_reward, std_reward)],
81
+ alpha=0.2, color=colors['reward'])
82
+ ax.plot(iterations, mean_reward, color=colors['reward'], linewidth=2.5, label='Mean Reward')
83
+
84
+ ax.set_xlabel('RL Iteration', fontsize=12)
85
+ ax.set_ylabel('Reward', fontsize=12)
86
+ ax.set_title('RL Training: Reward Progression', fontsize=14, fontweight='bold')
87
+ ax.legend(loc='lower right', fontsize=10)
88
+ ax.set_ylim(0.5, 1.0)
89
+
90
+ # Add annotations
91
+ ax.annotate(f'Start: {mean_reward[0]:.3f}', xy=(0, mean_reward[0]),
92
+ xytext=(2, mean_reward[0]-0.05), fontsize=9, color=colors['reward'])
93
+ ax.annotate(f'Peak: {max(mean_reward):.3f}', xy=(mean_reward.index(max(mean_reward)), max(mean_reward)),
94
+ xytext=(mean_reward.index(max(mean_reward))+2, max(mean_reward)+0.02),
95
+ fontsize=9, color=colors['reward'])
96
+
97
+ plt.tight_layout()
98
+ plt.savefig('assets/rl_reward.png', dpi=150, bbox_inches='tight')
99
+ plt.close()
100
+ print("Saved: assets/rl_reward.png")
101
+
102
+ # ============ Chart 3: Reward Components ============
103
+ fig, ax = plt.subplots(figsize=(10, 5))
104
+
105
+ r_f1 = [m['mean_r_f1'] for m in rl_metrics]
106
+ r_temp = [m['mean_r_temp'] for m in rl_metrics]
107
+ r_parity = [m['mean_r_parity'] for m in rl_metrics]
108
+ r_eff = [m['mean_r_eff'] for m in rl_metrics]
109
+
110
+ ax.plot(iterations, r_f1, label='R_F1 (60%)', linewidth=2, color='#2563eb')
111
+ ax.plot(iterations, r_temp, label='R_temp (20%)', linewidth=2, color='#7c3aed')
112
+ ax.plot(iterations, r_parity, label='R_parity (10%)', linewidth=2, color='#059669')
113
+ ax.plot(iterations, r_eff, label='R_eff (10%)', linewidth=2, color='#f59e0b')
114
+
115
+ ax.set_xlabel('RL Iteration', fontsize=12)
116
+ ax.set_ylabel('Reward Component', fontsize=12)
117
+ ax.set_title('RL Training: Reward Components', fontsize=14, fontweight='bold')
118
+ ax.legend(loc='lower right', fontsize=10)
119
+ ax.set_ylim(0.5, 1.05)
120
+
121
+ plt.tight_layout()
122
+ plt.savefig('assets/rl_components.png', dpi=150, bbox_inches='tight')
123
+ plt.close()
124
+ print("Saved: assets/rl_components.png")
125
+
126
+ # ============ Chart 4: Model Comparison ============
127
+ fig, ax = plt.subplots(figsize=(8, 5))
128
+
129
+ metrics = ['Avg F1', 'Exact Match', 'Any Match']
130
+ our_model = [0.68, 0.60, 0.72]
131
+ cohere = [0.61, 0.26, 0.82]
132
+
133
+ x = np.arange(len(metrics))
134
+ width = 0.35
135
+
136
+ bars1 = ax.bar(x - width/2, our_model, width, label='Ours (8B)', color=colors['our_model'])
137
+ bars2 = ax.bar(x + width/2, cohere, width, label='Cohere (104B)', color=colors['cohere'])
138
+
139
+ ax.set_ylabel('Score', fontsize=12)
140
+ ax.set_title('Model Comparison: 50 Marketing Scenarios', fontsize=14, fontweight='bold')
141
+ ax.set_xticks(x)
142
+ ax.set_xticklabels(metrics, fontsize=11)
143
+ ax.legend(loc='upper right', fontsize=10)
144
+ ax.set_ylim(0, 1.0)
145
+
146
+ # Add value labels
147
+ for bar in bars1:
148
+ height = bar.get_height()
149
+ ax.annotate(f'{height:.0%}',
150
+ xy=(bar.get_x() + bar.get_width() / 2, height),
151
+ xytext=(0, 3), textcoords="offset points",
152
+ ha='center', va='bottom', fontsize=10, fontweight='bold')
153
+
154
+ for bar in bars2:
155
+ height = bar.get_height()
156
+ ax.annotate(f'{height:.0%}',
157
+ xy=(bar.get_x() + bar.get_width() / 2, height),
158
+ xytext=(0, 3), textcoords="offset points",
159
+ ha='center', va='bottom', fontsize=10)
160
+
161
+ plt.tight_layout()
162
+ plt.savefig('assets/model_comparison.png', dpi=150, bbox_inches='tight')
163
+ plt.close()
164
+ print("Saved: assets/model_comparison.png")
165
+
166
+ # ============ Chart 5: Performance by Difficulty ============
167
+ fig, ax = plt.subplots(figsize=(8, 5))
168
+
169
+ difficulties = ['Easy', 'Medium', 'Hard']
170
+ our_f1 = [0.86, 0.65, 0.50]
171
+ cohere_f1 = [0.48, 0.64, 0.72]
172
+
173
+ x = np.arange(len(difficulties))
174
+ width = 0.35
175
+
176
+ bars1 = ax.bar(x - width/2, our_f1, width, label='Ours (8B)', color=colors['our_model'])
177
+ bars2 = ax.bar(x + width/2, cohere_f1, width, label='Cohere (104B)', color=colors['cohere'])
178
+
179
+ ax.set_ylabel('F1 Score', fontsize=12)
180
+ ax.set_title('F1 Score by Difficulty Level', fontsize=14, fontweight='bold')
181
+ ax.set_xticks(x)
182
+ ax.set_xticklabels(difficulties, fontsize=11)
183
+ ax.legend(loc='upper right', fontsize=10)
184
+ ax.set_ylim(0, 1.0)
185
+
186
+ # Add value labels
187
+ for bar in bars1:
188
+ height = bar.get_height()
189
+ ax.annotate(f'{height:.2f}',
190
+ xy=(bar.get_x() + bar.get_width() / 2, height),
191
+ xytext=(0, 3), textcoords="offset points",
192
+ ha='center', va='bottom', fontsize=10, fontweight='bold')
193
+
194
+ for bar in bars2:
195
+ height = bar.get_height()
196
+ ax.annotate(f'{height:.2f}',
197
+ xy=(bar.get_x() + bar.get_width() / 2, height),
198
+ xytext=(0, 3), textcoords="offset points",
199
+ ha='center', va='bottom', fontsize=10)
200
+
201
+ plt.tight_layout()
202
+ plt.savefig('assets/difficulty_comparison.png', dpi=150, bbox_inches='tight')
203
+ plt.close()
204
+ print("Saved: assets/difficulty_comparison.png")
205
+
206
+ print("\nAll charts generated successfully!")
207
+
assets/model_comparison.png ADDED
assets/rl_components.png ADDED

Git LFS Details

  • SHA256: 3be9068cd428b4126d96f3bc1c93071cfdd44eb089c86501de90fd3f17e31c86
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
assets/rl_reward.png ADDED
assets/sft_loss.png ADDED
docs/PRD.md ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Product Requirements Document: Intelligent Memory Routing System (Tinker Implementation)
2
+
3
+ ## 1. Executive Summary
4
+ Deliver a production memory-routing agent for marketing AI systems using Tinker as the exclusive training platform. The agent ingests conversation snippets, decides if the content merits storage, assigns the correct ontology slot, and respects persistence horizons. We follow a two-stage pipeline: supervised prompt distillation for initialization and reinforcement learning to optimize downstream retrieval utility. All code must rely on Tinker’s asynchronous APIs and built-in loss functions (cross-entropy and importance sampling) to stay within supported patterns.
5
+
6
+ ## 2. Goals and Non-Goals
7
+ - **Goals**
8
+ - Maintain selective, temporally-aware storage across the 12-category taxonomy plus `none`.
9
+ - Achieve reliable multi-label routing with tight output formatting compatible with Tinker renderers.
10
+ - Support RL reward shaping for retrieval F1, temporal correctness, company/user separation, and storage efficiency.
11
+ - Produce checkpoints consumable by downstream services via Tinker sampling clients.
12
+ - **Non-Goals**
13
+ - Building a retrieval engine or serving layer.
14
+ - Extending Tinker beyond LoRA or supported loss functions.
15
+
16
+ ## 3. Success Metrics
17
+ - Exact-match accuracy ≥80% on held-out labeled data.
18
+ - Macro F1 ≥90% across categories; `none` precision ≥90%, recall ≥85%.
19
+ - Average predicted categories per utterance ≤2.0.
20
+ - Temporal alignment accuracy ≥90% (long/medium/short mapping).
21
+ - KL divergence: target <0.005, warn 0.005–0.01, critical >0.01.
22
+
23
+ ## 4. System Architecture Overview
24
+ 1. **Synthetic Conversation Library** – Scenario templates drive GPT-5 generation to cover ontology breadth and noise patterns.
25
+ 2. **Teacher Labeling** – GPT-5, prompted with taxonomy/persistence guidance, produces gold labels (multi-label + `none`).
26
+ 3. **Prompt Distillation (SFT)** – Llama-3.1-8B LoRA (rank 32) is trained asynchronously via Tinker `forward_backward_async(..., loss_fn="cross_entropy")`.
27
+ 4. **RL Optimization** – Same model undergoes importance sampling policy gradient loss with a custom `MemoryRoutingEnv`.
28
+ 5. **Evaluation Harness** – Tinker evaluator builders and offline scripts verify accuracy, pruning behavior, and reward stability.
29
+
30
+ ### Model Selection Rationale
31
+ - Using `meta-llama/Llama-3.1-8B` (🐙 Base, 🧱 Dense, 🦆 Small) as the foundation model for this classification task. While Tinker recommends MoE models for cost efficiency and instruction-tuned models for task-specific work, we choose the base model for three reasons:
32
+ 1. **Routing Neutrality**: Instruction-tuned models may have ingrained biases toward helpfulness/verbosity that conflict with selective storage decisions. The base model learns routing behavior purely from our synthetic data.
33
+ 2. **Prompt Distillation Alignment**: Our two-stage pipeline (teacher labels → SFT → RL) is a classic prompt distillation setup where starting from a base model ensures we're not fighting pre-existing instruction-following patterns.
34
+ 3. **Evaluation Baseline**: Establishes a clean baseline for comparing LoRA vs full fine-tuning effects without confounding variables from prior post-training.
35
+ - LoRA rank 32 mirrors Tinker defaults for classification-style tasks. Higher ranks can be evaluated later if capacity becomes a bottleneck.
36
+ - **Post-MVP**: Once baseline performance is established, evaluate `meta-llama/Llama-3.1-8B-Instruct` (to measure instruction-tuning impact) and `Qwen/Qwen3-30B-A3B` (MoE cost efficiency) as alternative starting points.
37
+
38
+ ### Environment Design Notes
39
+ - Each `MemoryRoutingEnv` is a single-step bandit: `initial_observation()` returns a tokenized conversation + stop conditions, `step()` receives the model's generated classification tokens and terminates immediately with reward.
40
+ - EnvGroupBuilder clones each conversation across `group_size` rollouts for variance reduction; dataset builder provides `batch_size` EnvGroupBuilders per iteration.
41
+ - No multi-turn transitions, which matches Tinker's Env definitions and keeps reward computation simple.
42
+
43
+ ### Reward Computation Details
44
+ The `step()` method in `MemoryRoutingEnv` performs the following sequence:
45
+ 1. **Parse Model Output**: Extract predicted categories from generated tokens using renderer stop sequences. Expected format: `category1, category2, category3` (comma-separated, from valid taxonomy).
46
+ 2. **Format Validation**: If parsing fails or any category is invalid, assign `R_format = -1.0` and return immediately (zero for all other reward components).
47
+ 3. **Component Calculation**:
48
+ - `R_F1`: Token-level F1 between predicted and gold category sets. Use macro-averaging if multi-label.
49
+ - `R_temp`: Persistence alignment. +1.0 if predicted persistence matches gold (long/medium/short), +0.5 if adjacent (long↔medium or medium↔short), 0.0 otherwise. Use majority vote if multi-label predictions span multiple persistence horizons.
50
+ - `R_parity`: Company/user scope alignment. +1.0 if predicted scope (company/user/mixed/none) matches gold scope exactly, 0.0 otherwise.
51
+ - `R_eff`: Storage efficiency. `1.0` if ≤3 categories predicted, `0.7` if 4 categories, `0.4` if 5 categories, `0.0` if ≥6 categories.
52
+ 4. **Composite Reward**: `R_total = 0.6 * R_F1 + 0.2 * R_temp + 0.1 * R_parity + 0.1 * R_eff` (unless format validation failed, then `R_total = -1.0`).
53
+
54
+ **Edge Cases**:
55
+ - Model outputs empty string or only stop tokens → format validation failure.
56
+ - Model outputs `none` + other categories → invalid, format failure (none must be exclusive).
57
+ - Model outputs duplicate categories → deduplicate before computing metrics.
58
+ - Model exceeds max_tokens without hitting stop sequence → truncate and attempt parse, format failure if no valid categories extracted.
59
+
60
+ ## 5. Memory Ontology
61
+ | Category | Description | Persistence |
62
+ | --- | --- | --- |
63
+ | `company.brand_core` | Voice, values, positioning, identity anchors. | Long (>1y) |
64
+ | `company.strategic_signatures` | Decision frameworks, strategic heuristics. | Long (>1y) |
65
+ | `company.knowledge_artifacts` | Docs, style guides, playbooks. | Long (>1y) |
66
+ | `company.business_priorities` | Quarterly/seasonal goals, active campaigns. | Short (<3m) |
67
+ | `company.tools_config` | Integrations, API keys, workflow settings. | Medium (~6m) |
68
+ | `company.performance_context` | Campaign metrics, retrospectives, learnings. | Rolling (~6m) |
69
+ | `user.communication_style` | Tone, verbosity, format expectations. | Long (>1y) |
70
+ | `user.strategic_approach` | Personal priorities, success definitions. | Long (>1y) |
71
+ | `user.role_context` | Title, scope, decision authority. | Medium (~1y) |
72
+ | `user.workflow_patterns` | Review cadence, collaboration norms. | Medium (~1y) |
73
+ | `user.session_history` | Immediate context, recent asks. | Short (<2w) |
74
+ | `user.interaction_preferences` | Coaching style, feedback expectations. | Evolving |
75
+ | `none` | Irrelevant, vague, or transactional content. | Critical for noise reduction |
76
+
77
+ ## 6. Data & Prompt Strategy
78
+
79
+ ### Scenario Generation
80
+ - Script: customize `tinker_cookbook/recipes/prompt_distillation/create_data.py`.
81
+ - Inputs: category focus, distractor category, emotional tone, required signal; 4–10 turns per dialogue.
82
+ - Outputs: JSONL with scenario metadata, teacher confidence, persistence hints.
83
+
84
+ ### Teacher Prompt
85
+ ```
86
+ System: You route marketing conversations into persistent memory. Consider each utterance and decide if it conveys a durable fact. Prefer `none` unless confident.
87
+ Ontology: <category table with definitions + persistence>
88
+ Rules:
89
+ 1. Distinguish company.* from user.* details.
90
+ 2. Match persistence horizon (long/medium/short) to signal lifetime.
91
+ 3. Predict ≤3 categories unless strictly necessary.
92
+ Output:
93
+ categories: cat1, cat2 (use `none` for no storage)
94
+ ```
95
+ - Temperature 0.2, max tokens 256, stop newline.
96
+
97
+ ### Student Prompt
98
+ ```
99
+ System: You route marketing conversations into structured memory categories.
100
+ User: Conversation:
101
+ {dialogue}
102
+
103
+ Available categories:
104
+ - company.brand_core ...
105
+ - ...
106
+ - none
107
+
108
+ Respond with comma-separated categories.
109
+ ```
110
+
111
+ ### Renderer Configuration
112
+ ```python
113
+ from tinker_cookbook import renderers, tokenizer_utils
114
+
115
+ tokenizer = tokenizer_utils.get_tokenizer("meta-llama/Llama-3.1-8B")
116
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
117
+ stop_sequences = renderer.get_stop_sequences()
118
+
119
+ sampling_params = types.SamplingParams(
120
+ max_tokens=150,
121
+ temperature=0.0,
122
+ stop=stop_sequences,
123
+ )
124
+ ```
125
+
126
+ ### Parsing & Validation
127
+ - Normalize whitespace/case, strip bullets, deduplicate, enforce taxonomy membership.
128
+ - Validation helper:
129
+ ```python
130
+ def validate_datum(datum: types.Datum, vocab_size: int) -> bool:
131
+ if datum.model_input.length > 512:
132
+ return False
133
+ weights = datum.loss_fn_inputs["weights"].tolist()
134
+ if sum(weights) == 0:
135
+ return False
136
+ target_tokens = datum.loss_fn_inputs["target_tokens"].tolist()
137
+ if not all(0 <= t < vocab_size for t in target_tokens):
138
+ return False
139
+ return True
140
+ ```
141
+
142
+ ## 6.5 Synthetic Data Quality Assurance
143
+ - **Coverage:** ≥20 examples per category × persistence; ≥20% multi-label, ≥10% `none`-only.
144
+ - **Noise:** ≥30% dialogues include distractors to stress selectivity.
145
+ - **Signal Density:** >60% of turns include relevant info; length 6.5 ± 1.5 turns.
146
+ - **Human Audit:** Spot-check 100 samples per refresh; require ≥95% teacher agreement.
147
+ - **Continuous Improvement:** Log production misses, refresh quarterly, retrain teacher prompt if accuracy drops >10%.
148
+
149
+ ## 6.6 Data Preprocessing Pipeline
150
+
151
+ Before SFT training, synthetic JSONL conversations must be converted to Tinker-compatible `types.Datum` objects:
152
+
153
+ **Step 1: Load Synthetic Data**
154
+ ```python
155
+ import json
156
+ with open("train.jsonl", "r") as f:
157
+ conversations = [json.loads(line) for line in f]
158
+ ```
159
+
160
+ **Step 2: Convert to Datum Objects**
161
+ ```python
162
+ from tinker import types
163
+ from tinker_cookbook import renderers, tokenizer_utils
164
+
165
+ tokenizer = tokenizer_utils.get_tokenizer("meta-llama/Llama-3.1-8B")
166
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
167
+
168
+ def conversation_to_datum(conversation_json: dict) -> types.Datum:
169
+ """Convert synthetic conversation to training datum."""
170
+ tokens, weights = renderer.build_supervised_example(
171
+ conversation_json["conversation"]
172
+ )
173
+ model_input = types.ModelInput.from_ints(tokens[:-1])
174
+ datum = types.Datum(
175
+ model_input=model_input,
176
+ loss_fn_inputs=dict(
177
+ target_tokens=tokens[1:],
178
+ weights=weights[1:],
179
+ ),
180
+ )
181
+ return datum
182
+
183
+ train_data = [conversation_to_datum(conv) for conv in conversations]
184
+ ```
185
+
186
+ **Step 3: Validate Datum Objects**
187
+ ```python
188
+ vocab_size = len(tokenizer)
189
+ valid_data = []
190
+ for datum in train_data:
191
+ if datum.model_input.length > 4096:
192
+ print(f"Warning: Skipping example with length {datum.model_input.length}")
193
+ continue
194
+ weights = datum.loss_fn_inputs["weights"].tolist()
195
+ if sum(weights) == 0:
196
+ print("Warning: Skipping example with zero loss weights")
197
+ continue
198
+ target_tokens = datum.loss_fn_inputs["target_tokens"].tolist()
199
+ if not all(0 <= t < vocab_size for t in target_tokens):
200
+ print(f"Warning: Invalid token IDs found")
201
+ continue
202
+ valid_data.append(datum)
203
+
204
+ print(f"Preprocessed {len(valid_data)}/{len(train_data)} examples")
205
+ ```
206
+
207
+ **Step 4: Split and Save**
208
+ ```python
209
+ train_size = int(0.8 * len(valid_data))
210
+ train_dataset = valid_data[:train_size]
211
+ test_dataset = valid_data[train_size:]
212
+ ```
213
+
214
+ ## 7. Stage 1 – Prompt Distillation (Supervised Learning)
215
+
216
+ ### Dataset & Batch Size
217
+ - 1–2k labeled conversations (80/20 split after preprocessing per Section 6.6).
218
+ - Batch size 128 (per Tinker SL guidance) balances stability/throughput; if changed, scale LR ∝ √batch_size.
219
+ - Expected preprocessing yield: ~90-95% of raw JSONL (some examples filtered for length/validity).
220
+
221
+ ### Hyperparameter Selection
222
+ ```python
223
+ from tinker_cookbook.hyperparam_utils import get_lr
224
+
225
+ model_name = "meta-llama/Llama-3.1-8B"
226
+ learning_rate = get_lr(model_name) # Returns LoRA-adjusted LR: ~2.86e-4
227
+ ```
228
+ - Tinker's `get_lr()` utility already returns the LoRA-optimized learning rate for the specified model, accounting for model size and architecture. No manual scaling needed.
229
+ - Use Adam β1=0.9, β2=0.95, ε=1e-8 (Tinker SL defaults).
230
+ - **Training Duration**: Start with 300 steps minimum (≈20-25 epochs for 1.5k samples at batch_size=128). Tinker SL guidance recommends "at least 100 steps but usually best results with 1000 or more" - for LoRA classification tasks, 300-500 steps typically ensures convergence.
231
+ - **Early Stopping**: Validate every 20 steps on test set. Stop if test loss doesn't improve for 5 consecutive evaluations (100 steps patience).
232
+ - **Convergence Check**: Plot train/test loss curves. If test loss hasn't plateaued by step 300, extend to 500 steps before RL initialization.
233
+
234
+ ### Async Training Loop
235
+ ```python
236
+ import tinker
237
+ from tinker import types
238
+ from tinker_cookbook.hyperparam_utils import get_lr
239
+
240
+ service_client = tinker.ServiceClient()
241
+ training_client = await service_client.create_lora_training_client_async(
242
+ base_model="meta-llama/Llama-3.1-8B",
243
+ rank=32,
244
+ )
245
+
246
+ learning_rate = get_lr("meta-llama/Llama-3.1-8B")
247
+
248
+ for step in range(num_steps):
249
+ # Submit forward-backward pass
250
+ fwd_bwd_future = await training_client.forward_backward_async(
251
+ batch_data,
252
+ loss_fn="cross_entropy",
253
+ )
254
+
255
+ # Submit optimizer step (can overlap with forward-backward)
256
+ adam_params = types.AdamParams(
257
+ learning_rate=learning_rate,
258
+ beta1=0.9,
259
+ beta2=0.95,
260
+ eps=1e-8,
261
+ )
262
+ optim_future = await training_client.optim_step_async(adam_params)
263
+
264
+ # Wait for both operations to complete
265
+ fwd_bwd_result = await fwd_bwd_future.result_async()
266
+ optim_result = await optim_future.result_async()
267
+
268
+ # Log metrics from both operations
269
+ log_metrics(step, fwd_bwd_result, optim_result)
270
+ ```
271
+
272
+ ### Checkpointing & Sampling
273
+ ```python
274
+ # Save checkpoint for sampling (every 20 steps)
275
+ checkpoint_future = await training_client.save_weights_for_sampler_async(
276
+ name=f"sft_{step:04d}"
277
+ )
278
+ checkpoint_result = await checkpoint_future.result_async()
279
+ sampling_path = checkpoint_result.path
280
+
281
+ # Create sampling client with the checkpoint
282
+ sampling_client = service_client.create_sampling_client(
283
+ model_path=sampling_path
284
+ )
285
+
286
+ # Use with renderer stop sequences for evaluation
287
+ stop_sequences = renderer.get_stop_sequences()
288
+ sampling_params = types.SamplingParams(
289
+ max_tokens=150,
290
+ temperature=0.0,
291
+ stop=stop_sequences,
292
+ )
293
+ ```
294
+ - Save weights every 20 steps for periodic evaluation.
295
+ - Critical: Must call `.result_async()` on the checkpoint future to get the path before creating sampling client.
296
+
297
+ ## 8. Stage 2 – Reinforcement Learning
298
+
299
+ ### Environment & Reward
300
+ - `MemoryRoutingEnv` implements single-step episodes; EnvGroupBuilder replicates conversations across `group_size=8`.
301
+ - Reward: `0.6 * R_F1 + 0.2 * R_temp + 0.1 * R_parity + 0.1 * R_eff`.
302
+ - `R_F1`: F1 overlap with teacher labels.
303
+ - `R_temp`: +1 (correct persistence), +0.5 (adjacent), 0 otherwise.
304
+ - `R_parity`: +1 when company/user presence matches ground truth.
305
+ - `R_eff`: 1.0 (≤3 cats), 0.7 (4), 0.4 (5), 0 (≥6) with hard penalty for parser failures.
306
+
307
+ ### Policy & Sampling Workflow
308
+ ```python
309
+ # Save current policy weights for sampling
310
+ checkpoint_future = await training_client.save_weights_for_sampler_async(
311
+ name=f"rl_step_{step:04d}"
312
+ )
313
+ checkpoint_result = await checkpoint_future.result_async()
314
+ sampling_path = checkpoint_result.path
315
+
316
+ # Create sampling client with current policy
317
+ sampling_client = service_client.create_sampling_client(
318
+ model_path=sampling_path,
319
+ )
320
+
321
+ # Wrap in policy completer for RL rollouts
322
+ policy = TinkerTokenCompleter(
323
+ sampling_client=sampling_client,
324
+ max_tokens=150,
325
+ temperature=0.0,
326
+ stop=renderer.get_stop_sequences(),
327
+ )
328
+ ```
329
+
330
+ ### Async Training Loop
331
+ ```python
332
+ for iteration in range(num_iterations):
333
+ # 1. Gather rollouts concurrently
334
+ trajectory_groups = await asyncio.gather(
335
+ *[do_group_rollout(env_builder, policy) for env_builder in env_builders]
336
+ )
337
+
338
+ # 2. Process trajectories
339
+ filtered_groups = remove_constant_reward_groups(trajectory_groups)
340
+ advantages = compute_advantages(filtered_groups)
341
+ train_data, metadata = assemble_training_data(filtered_groups, advantages)
342
+
343
+ # 3. Submit forward-backward pass
344
+ fwd_bwd_future = await training_client.forward_backward_async(
345
+ train_data,
346
+ loss_fn="importance_sampling"
347
+ )
348
+
349
+ # 4. Submit optimizer step
350
+ adam_params = types.AdamParams(
351
+ learning_rate=2e-5,
352
+ beta1=0.9,
353
+ beta2=0.95,
354
+ eps=1e-8
355
+ )
356
+ optim_future = await training_client.optim_step_async(adam_params)
357
+
358
+ # 5. Wait for both operations to complete
359
+ fwd_bwd_result = await fwd_bwd_future.result_async()
360
+ optim_result = await optim_future.result_async()
361
+
362
+ # 6. Log metrics including KL divergence
363
+ log_metrics(iteration, fwd_bwd_result, optim_result, metadata)
364
+ ```
365
+ - Run ≈25 iterations (256 rollouts each). Adjust based on convergence and KL monitoring.
366
+
367
+ ### KL Monitoring
368
+ | Status | KL Range | Action |
369
+ | --- | --- | --- |
370
+ | Target | <0.005 | Optimal on-policy stability |
371
+ | Warning | 0.005–0.01 | Log warning, monitor closely; still stable per Tinker guidance |
372
+ | Critical | >0.01 | Halt run immediately, inspect sampler vs learner drift |
373
+
374
+ **Implementation Notes**:
375
+ - Always log `kl_sample_train_v1` and `kl_sample_train_v2` (two KL estimators per Tinker RL docs).
376
+ - Per Tinker: "training is stable with KL divergence below 0.01" - values above this threshold indicate numerical instability or off-policy issues.
377
+ - Even with full on-policy training, KL won't be exactly zero due to [non-determinism](https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/) in batched inference.
378
+ - Keep sampling temperature at 0.0 for deterministic rollouts.
379
+ - Enable gradient clipping (max_norm=1.0) if KL repeatedly exceeds 0.005.
380
+ - If KL exceeds 0.01, halt training, inspect checkpoint drift, and verify sampling client is using correct weights.
381
+
382
+ ### Future Throughput Optimizations
383
+ - After MVP, consider `StreamMinibatchConfig` to overlap sampling/training per Tinker RL docs (20–30% throughput gain).
384
+
385
+ ## 9. Evaluation & Monitoring
386
+
387
+ ### Inline
388
+ - SFT: track train/test loss, exact-match, macro/micro F1, avg categories.
389
+ - RL: log reward components, KL metrics, entropy, avg categories, stop reasons.
390
+
391
+ ### Evaluators
392
+ ```python
393
+ from tinker_cookbook.evaluators import SamplingClientEvaluator
394
+ from tinker import types
395
+
396
+ class MemoryRoutingEvaluator(SamplingClientEvaluator):
397
+ """Evaluates memory routing classification on held-out test set."""
398
+
399
+ def __init__(self, test_set, renderer, tokenizer):
400
+ self.test_set = test_set # List of preprocessed conversations with gold labels
401
+ self.renderer = renderer
402
+ self.tokenizer = tokenizer
403
+
404
+ async def __call__(self, sampling_client):
405
+ """Run holdout evaluation on the test set."""
406
+ predictions = []
407
+ gold_labels = []
408
+
409
+ # Sample predictions for each test conversation
410
+ for example in self.test_set:
411
+ # Build generation prompt from conversation
412
+ prompt = self.renderer.build_generation_prompt(
413
+ example["conversation"]
414
+ )
415
+
416
+ # Generate classification
417
+ sampling_params = types.SamplingParams(
418
+ max_tokens=150,
419
+ temperature=0.0,
420
+ stop=self.renderer.get_stop_sequences(),
421
+ )
422
+ result = await sampling_client.sample_async(
423
+ prompt=prompt,
424
+ num_samples=1,
425
+ sampling_params=sampling_params
426
+ )
427
+
428
+ # Parse model output into categories
429
+ pred_tokens = result.sequences[0].tokens
430
+ pred_text = self.tokenizer.decode(pred_tokens)
431
+ pred_categories = self._parse_categories(pred_text)
432
+
433
+ predictions.append(pred_categories)
434
+ gold_labels.append(set(example["labels"]["categories"]))
435
+
436
+ # Compute metrics
437
+ return {
438
+ "exact_match": self._compute_exact_match(predictions, gold_labels),
439
+ "macro_f1": self._compute_macro_f1(predictions, gold_labels),
440
+ "none_precision": self._compute_none_precision(predictions, gold_labels),
441
+ "temporal_accuracy": self._compute_temporal_accuracy(predictions, gold_labels),
442
+ }
443
+
444
+ def _parse_categories(self, text: str) -> set:
445
+ """Parse comma-separated categories from model output."""
446
+ # Implementation: split on comma, strip whitespace, validate against taxonomy
447
+ # Return set of valid categories or {"none"} if parsing fails
448
+ pass
449
+
450
+ def _compute_exact_match(self, preds, golds) -> float:
451
+ """Fraction of examples where predicted set exactly matches gold set."""
452
+ pass
453
+
454
+ def _compute_macro_f1(self, preds, golds) -> float:
455
+ """Macro-averaged F1 across all categories."""
456
+ pass
457
+
458
+ def _compute_none_precision(self, preds, golds) -> float:
459
+ """Precision of 'none' category predictions."""
460
+ pass
461
+
462
+ def _compute_temporal_accuracy(self, preds, golds) -> float:
463
+ """Accuracy of persistence horizon alignment (requires loading full examples)."""
464
+ pass
465
+ ```
466
+ - Register evaluator builders with `eval_every=20` for SFT (every checkpoint) and RL loops.
467
+ - Consider Inspect AI tasks after MVP for standardized benchmarking.
468
+
469
+ ### Offline & Compliance
470
+ - Offline script computes exact-match, macro/micro F1, `none` precision/recall, temporal accuracy, confusion matrix.
471
+ - Regression suite: 100 held-out dialogues rerun after each checkpoint.
472
+ - Format validator ensures comma-separated taxonomy outputs and ≤3 categories typical.
473
+
474
+ ## 10. Implementation Plan
475
+ 1. **Scenario Refresh & QA** – Generate new datasets, run teacher labeling, enforce Section 6.5 checks.
476
+ 2. **Preprocessing & Validation** – Convert to `Datum`, run parser + validator.
477
+ 3. **SFT Training** – 300–500 async steps (per Section 7 training-duration guidance) with early stopping, checkpoint weights.
478
+ 4. **RL Environment Build** – Implement env/reward/evaluators, add unit tests.
479
+ 5. **RL Training** – 25 iteration importance sampling run with KL monitoring.
480
+ 6. **Evaluation & Sign-off** – Execute evaluator builders + offline scripts, capture qualitative samples, document results.
481
+ 7. **Future Optimization** – Investigate streaming minibatch and Inspect AI integration after MVP.
482
+
483
+ ## 11. Risks & Mitigations
484
+ - **Format Drift:** reward penalty + strict parser; renderer stop sequences enforce termination.
485
+ - **`none` Collapse:** reward weights emphasize recall, track per-category confusion, rebalance data.
486
+ - **Off-Policy Instability:** monitor KL each step, warn at 0.005–0.01, halt above 0.01 (per Section 8 KL monitoring table), keep temperature=0.0, clip gradients.
487
+ - **Temporal Mislabeling:** targeted scenario generation plus dedicated reward component; run temporal audits weekly.
488
+ - **Synthetic Bias:** quarterly data refresh with human audits; ingest production edge cases.
489
+
490
+ ## 12. Deployment Considerations
491
+ - **Inference:** Export final LoRA checkpoint via `save_weights_for_sampler(name="prod_v1")`; serve via Tinker SamplingClient or export to preferred inference stack.
492
+ - **Performance Targets:** <200 ms p95 latency per routing decision; ≥100 decisions/sec on A100 (LoRA overhead ≈8 GB).
493
+ - **Monitoring:** Weekly dashboards for category distribution, `none` precision (>85%), avg categories (<2.5), temporal accuracy, reward drift.
494
+ - **Versioning:** Semantic versioning (major.minor.patch); record lineage (base → SFT → RL); keep last 3 versions for rollback.
495
+
496
+ ---
497
+ **Owner:** Technical Architecture Lead
498
+
docs/Synthetic-Data-Gen/synthetic_data.md ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Synthetic Data Generation Guide: Memory Routing System
2
+
3
+ ## Overview
4
+ This guide provides detailed specifications for generating synthetic training data for the intelligent memory routing system. The data will be used to train both the supervised (teacher labels) and RL (reward signals) stages.
5
+
6
+ ---
7
+
8
+ ## Output Format Specification
9
+
10
+ ### JSONL Schema
11
+ Each line in the output file should be a valid JSON object with the following structure:
12
+
13
+ ```json
14
+ {
15
+ "scenario_id": "string",
16
+ "conversation": [
17
+ {
18
+ "role": "user|assistant",
19
+ "content": "string"
20
+ }
21
+ ],
22
+ "labels": {
23
+ "categories": ["string"],
24
+ "persistence_horizon": "long|medium|short",
25
+ "memory_scope": "company|user|mixed|none",
26
+ "rationale": "string"
27
+ },
28
+ "metadata": {
29
+ "scenario_type": "string",
30
+ "primary_category": "string",
31
+ "distractor_present": boolean,
32
+ "turn_count": integer,
33
+ "signals_present": ["string"]
34
+ }
35
+ }
36
+ ```
37
+
38
+ ### Field Definitions
39
+
40
+ **scenario_id**: Unique identifier (format: `{category}_{type}_{counter}`, e.g., `brand_core_standard_001`)
41
+
42
+ **conversation**: Array of message objects representing the dialogue
43
+ - Must be 4-10 turns total
44
+ - Should alternate between user and assistant (can start with either)
45
+ - Content should be realistic marketing/strategy dialogue
46
+
47
+ **labels.categories**: Array of category strings from taxonomy
48
+ - Valid values: `company.brand_core`, `company.strategic_signatures`, `company.knowledge_artifacts`, `company.business_priorities`, `company.tools_config`, `company.performance_context`, `user.communication_style`, `user.strategic_approach`, `user.role_context`, `user.workflow_patterns`, `user.session_history`, `user.interaction_preferences`, `none`
49
+ - Can be multi-label (typically 1-3 categories)
50
+ - Use `["none"]` for transactional/vague content
51
+
52
+ **labels.persistence_horizon**: Expected lifetime of information
53
+ - `long`: >1 year (e.g., brand values, communication style)
54
+ - `medium`: 6-12 months (e.g., role context, tools config)
55
+ - `short`: <3 months (e.g., business priorities, session history)
56
+
57
+ **labels.memory_scope**: Who this information pertains to
58
+ - `company`: Company-level information (brand, processes, etc.)
59
+ - `user`: Individual user preferences/context
60
+ - `mixed`: Contains both company and user information
61
+ - `none`: No memorable information
62
+
63
+ **labels.rationale**: Brief explanation (1-2 sentences) of why these categories were chosen
64
+
65
+ **metadata**: Additional context for training/evaluation
66
+ - `scenario_type`: Descriptive label (e.g., "brand_discovery", "campaign_review", "preference_setting")
67
+ - `primary_category`: The main category this example focuses on
68
+ - `distractor_present`: Whether irrelevant information was intentionally included
69
+ - `turn_count`: Number of conversation turns
70
+ - `signals_present`: List of specific signals (e.g., ["brand_voice_example", "tone_preference", "transactional_question"])
71
+
72
+ ---
73
+
74
+ ## Data Generation Prompts
75
+
76
+ ### Stage 1: Scenario Generation Prompt
77
+
78
+ Use this prompt to generate diverse scenario specifications first, then use those to create conversations.
79
+
80
+ ```
81
+ You are designing training scenarios for an AI memory system in marketing context. Generate a scenario specification with the following requirements:
82
+
83
+ TARGET SPECIFICATIONS:
84
+ - Primary Category: {category}
85
+ - Distractor Category: {distractor_category if applicable}
86
+ - Persistence Level: {long/medium/short}
87
+ - Emotional Tone: {neutral/excited/frustrated/collaborative}
88
+ - Turn Count: {4-10}
89
+ - Special Requirements: {e.g., "include specific brand voice example", "multi-label with user preference"}
90
+
91
+ OUTPUT FORMAT:
92
+ Return a JSON object with:
93
+ {
94
+ "scenario_description": "Brief narrative setup (2-3 sentences)",
95
+ "user_profile": "User role and context",
96
+ "key_signals_to_include": ["List of 2-4 specific memory-worthy signals"],
97
+ "distractor_signals": ["Optional list of noise/irrelevant info"],
98
+ "suggested_turn_breakdown": "How the conversation should flow"
99
+ }
100
+
101
+ EXAMPLE OUTPUT:
102
+ {
103
+ "scenario_description": "Marketing director discussing their personal communication preferences while reviewing campaign performance. They reveal tone expectations and decision-making style.",
104
+ "user_profile": "Senior Marketing Director at B2B SaaS company, prefers data-driven discussions, values conciseness",
105
+ "key_signals_to_include": [
106
+ "Explicit statement about preferring bullet points over paragraphs",
107
+ "Request for 'bottom-line-up-front' approach",
108
+ "Mention of quarterly review cadence"
109
+ ],
110
+ "distractor_signals": [
111
+ "Transactional question about meeting time",
112
+ "Small talk about weather"
113
+ ],
114
+ "suggested_turn_breakdown": "Start with campaign review (business_priorities), transition to feedback on communication style (user.communication_style), end with scheduling (none)"
115
+ }
116
+
117
+ Generate a scenario for: {TARGET SPECIFICATIONS}
118
+ ```
119
+
120
+ ### Stage 2: Conversation Generation Prompt
121
+
122
+ Use this prompt with GPT-5 (or Claude) to generate the actual conversation based on a scenario spec.
123
+
124
+ ```
125
+ You are generating realistic marketing conversations between a user and an AI marketing assistant. Generate natural dialogue that contains specific information worth storing in long-term memory.
126
+
127
+ CONTEXT:
128
+ You will create a conversation that exemplifies certain memory categories while maintaining realism and natural flow.
129
+
130
+ SCENARIO SPECIFICATION:
131
+ {Insert scenario_spec from Stage 1}
132
+
133
+ MEMORY TAXONOMY (for reference):
134
+ COMPANY MEMORY:
135
+ - company.brand_core: Voice, values, positioning, identity anchors (Persistence: Long >1y)
136
+ - company.strategic_signatures: Decision frameworks, strategic heuristics (Persistence: Long >1y)
137
+ - company.knowledge_artifacts: Docs, style guides, playbooks (Persistence: Long >1y)
138
+ - company.business_priorities: Quarterly/seasonal goals, active campaigns (Persistence: Short <3m)
139
+ - company.tools_config: Integrations, API keys, workflow settings (Persistence: Medium ~6m)
140
+ - company.performance_context: Campaign metrics, retrospectives, learnings (Persistence: Rolling ~6m)
141
+
142
+ USER MEMORY:
143
+ - user.communication_style: Tone, verbosity, format expectations (Persistence: Long >1y)
144
+ - user.strategic_approach: Personal priorities, success definitions (Persistence: Long >1y)
145
+ - user.role_context: Title, scope, decision authority (Persistence: Medium ~1y)
146
+ - user.workflow_patterns: Review cadence, collaboration norms (Persistence: Medium ~1y)
147
+ - user.session_history: Immediate context, recent asks (Persistence: Short <2w)
148
+ - user.interaction_preferences: Coaching style, feedback expectations (Persistence: Evolving)
149
+
150
+ SPECIAL:
151
+ - none: Irrelevant, vague, or transactional content
152
+
153
+ GENERATION RULES:
154
+ 1. Make conversations feel natural - include some filler, transitions, acknowledgments
155
+ 2. Embed memory-worthy information organically (don't make it too obvious)
156
+ 3. Include 1-2 utterances that should map to "none" for realism
157
+ 4. If multi-label scenario, ensure signals for both categories are present
158
+ 5. Length: {turn_count} turns (alternating user/assistant)
159
+ 6. Include specific, concrete details (not generic statements)
160
+ 7. For company.* categories: use "we", "our company", "our brand"
161
+ 8. For user.* categories: use "I prefer", "my approach", "I typically"
162
+
163
+ OUTPUT FORMAT:
164
+ Return a JSON object with:
165
+ {
166
+ "scenario_id": "{primary_category}_{scenario_type}_{random_3_digit_number}",
167
+ "conversation": [
168
+ {"role": "user", "content": "..."},
169
+ {"role": "assistant", "content": "..."},
170
+ ...
171
+ ],
172
+ "labels": {
173
+ "categories": ["array of applicable categories"],
174
+ "persistence_horizon": "long|medium|short",
175
+ "memory_scope": "company|user|mixed|none",
176
+ "rationale": "1-2 sentence explanation of category choices"
177
+ },
178
+ "metadata": {
179
+ "scenario_type": "descriptive_label",
180
+ "primary_category": "main_category",
181
+ "distractor_present": true|false,
182
+ "turn_count": integer,
183
+ "signals_present": ["list of specific signals included"]
184
+ }
185
+ }
186
+
187
+ CRITICAL: Respond with ONLY the JSON object. No markdown formatting, no explanation, no preamble.
188
+
189
+ Generate the conversation now.
190
+ ```
191
+
192
+ ---
193
+
194
+ ## Example Synthetic Data Instances
195
+
196
+ ### Example 1: Single-Label, company.brand_core
197
+
198
+ ```json
199
+ {
200
+ "scenario_id": "brand_core_voice_definition_047",
201
+ "conversation": [
202
+ {
203
+ "role": "user",
204
+ "content": "I'm working on our new product launch messaging and want to make sure we're staying true to our brand voice. Can you remind me what our core voice attributes are?"
205
+ },
206
+ {
207
+ "role": "assistant",
208
+ "content": "I'd be happy to help! Let me check our brand guidelines. What specific aspect of the launch are you working on?"
209
+ },
210
+ {
211
+ "role": "user",
212
+ "content": "The hero section copy. We've always positioned ourselves as the approachable expert - professional but not corporate, if that makes sense. Our voice should be confident without being arrogant, and we use contractions to sound human."
213
+ },
214
+ {
215
+ "role": "assistant",
216
+ "content": "That's perfect context. So you're looking for that balance of expertise and warmth. Would you like me to review a draft with those voice principles in mind?"
217
+ },
218
+ {
219
+ "role": "user",
220
+ "content": "Yes, that would be great. Also, I want to make sure we avoid jargon - that's been a key part of our brand since day one. We explain complex marketing concepts in plain English."
221
+ },
222
+ {
223
+ "role": "assistant",
224
+ "content": "Absolutely, accessibility through clear language is core to your brand identity. Share the draft whenever you're ready and I'll evaluate it against those voice principles."
225
+ }
226
+ ],
227
+ "labels": {
228
+ "categories": ["company.brand_core"],
229
+ "persistence_horizon": "long",
230
+ "memory_scope": "company",
231
+ "rationale": "User explicitly defines multiple core brand voice attributes (approachable expert, contractions, jargon-free) that represent enduring identity anchors."
232
+ },
233
+ "metadata": {
234
+ "scenario_type": "brand_voice_definition",
235
+ "primary_category": "company.brand_core",
236
+ "distractor_present": false,
237
+ "turn_count": 6,
238
+ "signals_present": [
239
+ "voice_attribute_approachable_expert",
240
+ "voice_attribute_contractions",
241
+ "voice_principle_no_jargon",
242
+ "positioning_statement"
243
+ ]
244
+ }
245
+ }
246
+ ```
247
+
248
+ ### Example 2: Multi-Label, company.business_priorities + user.communication_style
249
+
250
+ ```json
251
+ {
252
+ "scenario_id": "mixed_priority_communication_183",
253
+ "conversation": [
254
+ {
255
+ "role": "user",
256
+ "content": "Quick update on Q4 - our north star metric is increasing trial-to-paid conversion by 15%. Everything else is secondary."
257
+ },
258
+ {
259
+ "role": "assistant",
260
+ "content": "Got it, trial-to-paid conversion is the priority. What initiatives are you planning to support that goal?"
261
+ },
262
+ {
263
+ "role": "user",
264
+ "content": "Three main campaigns: onboarding email sequence optimization, in-app value demonstration, and sales-assist for high-intent trials. When you give me updates on these, I need the data upfront - show me the numbers first, then the narrative. I don't have time for long explanations before I see performance."
265
+ },
266
+ {
267
+ "role": "assistant",
268
+ "content": "Understood - metrics first, context second. Should I track these three campaigns as your active Q4 priorities?"
269
+ },
270
+ {
271
+ "role": "user",
272
+ "content": "Yes, and one more thing - if any campaign is underperforming, just tell me straight. I value directness over softening bad news."
273
+ }
274
+ ],
275
+ "labels": {
276
+ "categories": ["company.business_priorities", "user.communication_style"],
277
+ "persistence_horizon": "mixed",
278
+ "memory_scope": "mixed",
279
+ "rationale": "Contains Q4 business goal and active campaigns (short-term, company-level) plus explicit communication preferences about data-first reporting and directness (long-term, user-level)."
280
+ },
281
+ "metadata": {
282
+ "scenario_type": "priority_setting_with_communication_preference",
283
+ "primary_category": "company.business_priorities",
284
+ "distractor_present": false,
285
+ "turn_count": 5,
286
+ "signals_present": [
287
+ "quarterly_goal_conversion",
288
+ "active_campaigns_list",
289
+ "communication_pref_data_first",
290
+ "communication_pref_directness"
291
+ ]
292
+ }
293
+ }
294
+ ```
295
+
296
+ ### Example 3: None Category (Transactional)
297
+
298
+ ```json
299
+ {
300
+ "scenario_id": "none_transactional_312",
301
+ "conversation": [
302
+ {
303
+ "role": "user",
304
+ "content": "What time is it in New York right now?"
305
+ },
306
+ {
307
+ "role": "assistant",
308
+ "content": "It's currently 2:34 PM EST in New York."
309
+ },
310
+ {
311
+ "role": "user",
312
+ "content": "Thanks. Can you find that report I asked for earlier?"
313
+ },
314
+ {
315
+ "role": "assistant",
316
+ "content": "I don't see a previous request for a report in our conversation. Could you clarify which report you're looking for?"
317
+ },
318
+ {
319
+ "role": "user",
320
+ "content": "Never mind, I think I was talking to someone else about it."
321
+ }
322
+ ],
323
+ "labels": {
324
+ "categories": ["none"],
325
+ "persistence_horizon": "short",
326
+ "memory_scope": "none",
327
+ "rationale": "Purely transactional queries with no durable information worth storing. Time query and confused reference have no strategic or personal value."
328
+ },
329
+ "metadata": {
330
+ "scenario_type": "transactional_noise",
331
+ "primary_category": "none",
332
+ "distractor_present": false,
333
+ "turn_count": 5,
334
+ "signals_present": ["time_query", "confused_context"]
335
+ }
336
+ }
337
+ ```
338
+
339
+ ### Example 4: Multi-Label with Distractor, user.strategic_approach + user.workflow_patterns + none
340
+
341
+ ```json
342
+ {
343
+ "scenario_id": "user_strategic_workflow_mixed_089",
344
+ "conversation": [
345
+ {
346
+ "role": "user",
347
+ "content": "Before we dive into campaign planning, I need to set some expectations about how I work. I'm a big believer in testing small before scaling - I'd rather run five $1k experiments than one $5k campaign until we find what works."
348
+ },
349
+ {
350
+ "role": "assistant",
351
+ "content": "That makes sense - iterative testing to de-risk before investment. How do you typically structure these experiments?"
352
+ },
353
+ {
354
+ "role": "user",
355
+ "content": "I run them for two weeks minimum, review results every Friday afternoon, and I need at least 3x ROAS to consider something scalable. Also, completely unrelated, but is it supposed to rain tomorrow?"
356
+ },
357
+ {
358
+ "role": "assistant",
359
+ "content": "Let me check the weather... Yes, there's a 70% chance of rain tomorrow afternoon. Going back to your testing framework - do you involve your team in the Friday reviews?"
360
+ },
361
+ {
362
+ "role": "user",
363
+ "content": "Always. I do weekly reviews with the full marketing team, and I expect everyone to come prepared with their numbers. Collaboration is key to my approach - I don't make scaling decisions alone."
364
+ },
365
+ {
366
+ "role": "assistant",
367
+ "content": "Got it - test-and-learn methodology with collaborative weekly reviews every Friday. I'll keep that workflow in mind when making recommendations."
368
+ }
369
+ ],
370
+ "labels": {
371
+ "categories": ["user.strategic_approach", "user.workflow_patterns"],
372
+ "persistence_horizon": "long",
373
+ "memory_scope": "user",
374
+ "rationale": "Defines personal strategic philosophy (test small before scaling, 3x ROAS threshold) and recurring workflow pattern (Friday reviews with team). Weather question is noise."
375
+ },
376
+ "metadata": {
377
+ "scenario_type": "strategic_workflow_definition_with_distractor",
378
+ "primary_category": "user.strategic_approach",
379
+ "distractor_present": true,
380
+ "turn_count": 6,
381
+ "signals_present": [
382
+ "strategic_approach_test_before_scale",
383
+ "success_metric_3x_roas",
384
+ "workflow_friday_reviews",
385
+ "workflow_collaborative_decisions",
386
+ "distractor_weather"
387
+ ]
388
+ }
389
+ }
390
+ ```
391
+
392
+ ### Example 5: Edge Case - Long-term company.knowledge_artifacts
393
+
394
+ ```json
395
+ {
396
+ "scenario_id": "knowledge_artifacts_style_guide_221",
397
+ "conversation": [
398
+ {
399
+ "role": "user",
400
+ "content": "I just updated our email marketing style guide - it's in the shared drive under Marketing/Guidelines/Email_Style_Guide_v3.pdf. This supersedes the old version from 2023."
401
+ },
402
+ {
403
+ "role": "assistant",
404
+ "content": "Thanks for letting me know. What are the major changes in version 3?"
405
+ },
406
+ {
407
+ "role": "user",
408
+ "content": "Main updates: we now use sentence case for subject lines instead of title case, maximum 50 characters. Email body should be 150 words max, and we've added a new section on accessibility requirements - minimum 14pt font, 1.5 line spacing, alt text mandatory for all images."
409
+ },
410
+ {
411
+ "role": "assistant",
412
+ "content": "Those are significant changes, especially the accessibility requirements. Should I reference this guide when reviewing email campaigns?"
413
+ },
414
+ {
415
+ "role": "user",
416
+ "content": "Yes, absolutely. This is now the canonical source for all email creative. If anyone asks about email standards, point them to this document."
417
+ }
418
+ ],
419
+ "labels": {
420
+ "categories": ["company.knowledge_artifacts"],
421
+ "persistence_horizon": "long",
422
+ "memory_scope": "company",
423
+ "rationale": "Introduction of updated canonical style guide with specific location and new standards. This is a durable knowledge artifact that will be referenced repeatedly."
424
+ },
425
+ "metadata": {
426
+ "scenario_type": "knowledge_artifact_update",
427
+ "primary_category": "company.knowledge_artifacts",
428
+ "distractor_present": false,
429
+ "turn_count": 5,
430
+ "signals_present": [
431
+ "document_location",
432
+ "canonical_source_declaration",
433
+ "specific_guidelines_subject_line",
434
+ "specific_guidelines_accessibility"
435
+ ]
436
+ }
437
+ }
438
+ ```
439
+
440
+ ---
441
+
442
+ ## Generation Strategy & Best Practices
443
+
444
+ ## Preparing Data for Tinker Training
445
+
446
+ Use the official Tinker renderer utilities to transform the JSONL data into `types.Datum` objects before SFT/RL runs. This ensures the tokenizer, stop sequences, and weight masks match what the trainer expects ([`renderers.build_supervised_example`](tinker_docs.md#file-renderingmdx) and [`types.Datum`](tinker_docs.md#part-2-type-definitions)).
447
+
448
+ ```python
449
+ import json
450
+ import tinker
451
+ from tinker import types
452
+ from tinker_cookbook import renderers, tokenizer_utils
453
+
454
+ tokenizer = tokenizer_utils.get_tokenizer("meta-llama/Llama-3.1-8B")
455
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
456
+
457
+ def conversation_to_datum(conversation_json: dict) -> types.Datum:
458
+ tokens, weights = renderer.build_supervised_example(conversation_json["conversation"])
459
+ model_input = types.ModelInput.from_ints(tokens[:-1])
460
+ datum = types.Datum(
461
+ model_input=model_input,
462
+ loss_fn_inputs=dict(
463
+ target_tokens=tokens[1:],
464
+ weights=weights[1:],
465
+ ),
466
+ )
467
+ if datum.model_input.length > 4096:
468
+ raise ValueError("Conversation exceeds model context window")
469
+ return datum
470
+ ```
471
+
472
+ **Checklist**
473
+ - [ ] Conversations tokenized with the same renderer used during training
474
+ - [ ] Resulting `ModelInput` length < the 4,096-token budget enforced in the snippet above (Llama-3.1 supports longer contexts, but this pipeline caps sequences at 4k)
475
+ - [ ] Non-zero loss weights present (otherwise drop example)
476
+ - [ ] Saved as pickled or JSONL `Datum` payloads ready for `forward_backward_async`
477
+
478
+ ### Coverage Matrix
479
+
480
+ Generate scenarios to ensure balanced coverage:
481
+
482
+ | Category | Target % | Min Examples | With Distractor | Multi-Label |
483
+ |----------|----------|--------------|-----------------|-------------|
484
+ | company.brand_core | 10% | 100 | 30 | 20 |
485
+ | company.strategic_signatures | 8% | 80 | 25 | 15 |
486
+ | company.knowledge_artifacts | 8% | 80 | 25 | 15 |
487
+ | company.business_priorities | 10% | 100 | 40 | 30 |
488
+ | company.tools_config | 7% | 70 | 20 | 10 |
489
+ | company.performance_context | 9% | 90 | 30 | 20 |
490
+ | user.communication_style | 10% | 100 | 30 | 25 |
491
+ | user.strategic_approach | 9% | 90 | 25 | 20 |
492
+ | user.role_context | 7% | 70 | 20 | 15 |
493
+ | user.workflow_patterns | 8% | 80 | 25 | 20 |
494
+ | user.session_history | 6% | 60 | 15 | 10 |
495
+ | user.interaction_preferences | 8% | 80 | 25 | 20 |
496
+ | none | 10% | 100 | 50 | 5 |
497
+
498
+ **Total Target:** 1,100-1,200 examples minimum for SFT
499
+
500
+ ### Quality Validation Checklist
501
+
502
+ For each generated example, validate:
503
+
504
+ **Structural:**
505
+ - [ ] Valid JSON format
506
+ - [ ] All required fields present
507
+ - [ ] 4-10 turns in conversation
508
+ - [ ] Alternating roles (mostly)
509
+ - [ ] Categories are from valid taxonomy
510
+
511
+ **Content:**
512
+ - [ ] Natural language flow (not robotic)
513
+ - [ ] Specific details present (not generic)
514
+ - [ ] Clear signal for each labeled category
515
+ - [ ] Distractor is truly off-topic (if present)
516
+ - [ ] Persistence horizon matches category definition
517
+
518
+ **Label Quality:**
519
+ - [ ] Rationale explains category choice
520
+ - [ ] Multi-label examples have signals for all categories
521
+ - [ ] "none" examples have no memorable information
522
+ - [ ] Memory scope matches categories (company.* → company)
523
+
524
+ ### Batch Generation Process
525
+
526
+ 1. **Define Coverage Plan**
527
+ - Decide on total dataset size (1,500-2,000 recommended)
528
+ - Allocate examples per category per coverage matrix
529
+ - Generate scenario specifications for each category
530
+
531
+ 2. **Generate Conversations**
532
+ - Process scenarios in batches of 50-100
533
+ - Use GPT-5 or Claude Opus for generation
534
+ - Temperature: 0.7 for diversity
535
+ - Validate each batch before proceeding
536
+
537
+ 3. **Quality Review**
538
+ - Sample 100 random examples for human review
539
+ - Check for common failure modes:
540
+ * Generic statements ("our brand is innovative")
541
+ * Unclear signals (ambiguous category)
542
+ * Unrealistic dialogue (too formal/robotic)
543
+ * Missing distractors where planned
544
+ - Iterate prompts if quality issues found
545
+
546
+ 4. **Teacher Labeling**
547
+ - Run ALL examples through teacher labeling prompt
548
+ - Compare teacher labels to synthetic labels
549
+ - Agreement threshold: >95%
550
+ - If disagreement, review and regenerate
551
+
552
+ 5. **Final Dataset Assembly**
553
+ - Split train/test (80/20)
554
+ - Stratify by category to ensure test coverage
555
+ - Save as `train.jsonl` and `test.jsonl`
556
+ - Document metadata (generation date, model used, prompts)
557
+
558
+ ---
559
+
560
+ ## Teacher Labeling Prompt
561
+
562
+ Use this prompt to generate gold labels for any conversation (including real production data later):
563
+
564
+ ```
565
+ You are a memory routing classifier for a marketing AI system. Your job is to analyze conversations and determine what information should be stored in long-term memory and in which categories.
566
+
567
+ MEMORY TAXONOMY:
568
+
569
+ COMPANY MEMORY (about the organization):
570
+ 1. company.brand_core - Voice, values, positioning, identity anchors [Long-term: >1y]
571
+ 2. company.strategic_signatures - Decision frameworks, strategic heuristics [Long-term: >1y]
572
+ 3. company.knowledge_artifacts - Documents, style guides, playbooks [Long-term: >1y]
573
+ 4. company.business_priorities - Quarterly/seasonal goals, active campaigns [Short-term: <3m]
574
+ 5. company.tools_config - Integrations, API keys, workflow settings [Medium-term: ~6m]
575
+ 6. company.performance_context - Campaign metrics, retrospectives, learnings [Rolling: ~6m]
576
+
577
+ USER MEMORY (about the individual):
578
+ 7. user.communication_style - Tone, verbosity, format expectations [Long-term: >1y]
579
+ 8. user.strategic_approach - Personal priorities, success definitions [Long-term: >1y]
580
+ 9. user.role_context - Title, scope, decision authority [Medium-term: ~1y]
581
+ 10. user.workflow_patterns - Review cadence, collaboration norms [Medium-term: ~1y]
582
+ 11. user.session_history - Immediate context, recent asks [Short-term: <2w]
583
+ 12. user.interaction_preferences - Coaching style, feedback expectations [Evolving]
584
+
585
+ SPECIAL:
586
+ 13. none - Irrelevant, vague, or transactional content (use when nothing is worth remembering)
587
+
588
+ ROUTING RULES:
589
+ 1. Distinguish company.* (organization-level) from user.* (individual-level)
590
+ 2. Match persistence horizon to information lifetime
591
+ 3. Predict ≤3 categories unless strictly necessary
592
+ 4. Prefer "none" unless there's CONCRETE, DURABLE information
593
+ 5. For company.* categories: look for "we", "our", organizational facts
594
+ 6. For user.* categories: look for "I", "my", personal preferences
595
+ 7. Vague statements like "we should be innovative" → none (too generic)
596
+ 8. Specific statements like "our brand voice uses contractions" → company.brand_core
597
+
598
+ CONVERSATION TO ANALYZE:
599
+ {conversation}
600
+
601
+ OUTPUT FORMAT (JSON):
602
+ {
603
+ "categories": ["category1", "category2"],
604
+ "persistence_horizon": "long|medium|short",
605
+ "memory_scope": "company|user|mixed|none",
606
+ "rationale": "Brief explanation of why these categories were selected",
607
+ "confidence": "high|medium|low",
608
+ "extractable_facts": [
609
+ "List 1-3 specific facts that would be stored in memory"
610
+ ]
611
+ }
612
+
613
+ CRITICAL RULES:
614
+ - Be conservative with labeling - when in doubt, use "none"
615
+ - Only label what's EXPLICITLY stated, not implied
616
+ - Multi-label only when multiple distinct types of information are present
617
+ - If conversation is small talk or transactional → ["none"]
618
+
619
+ Analyze the conversation and provide your classification:
620
+ ```
621
+
622
+ ---
623
+
624
+ ## Validation & Metrics
625
+
626
+ ### Data Quality Metrics to Track
627
+
628
+ **Coverage Metrics:**
629
+ - Category distribution (should match target ±5%)
630
+ - Persistence distribution (long: 35%, medium: 30%, short: 25%, mixed: 10%)
631
+ - Memory scope distribution (company: 45%, user: 45%, mixed: 5%, none: 5%)
632
+ - Multi-label frequency (target: 20-25% of non-none examples)
633
+
634
+ **Quality Metrics:**
635
+ - Teacher agreement rate (target: >95%)
636
+ - Average turn length (target: 20-150 tokens)
637
+ - Conversation length distribution (target: mean 6.5 ± 1.5 turns)
638
+ - "none" precision via human review (target: >90%)
639
+
640
+ **Signal Metrics:**
641
+ - Average signals per conversation (target: 2-3)
642
+ - Signal diversity (unique signal types / total signals: target >0.7)
643
+ - Distractor effectiveness (human annotators can identify: target >85%)
644
+
645
+ ### Automated Validation Script
646
+
647
+ ```python
648
+ import json
649
+ from collections import Counter
650
+ from typing import Dict, List
651
+
652
+ def validate_synthetic_data(filepath: str) -> Dict[str, any]:
653
+ """Validate synthetic data quality"""
654
+
655
+ with open(filepath, 'r') as f:
656
+ data = [json.loads(line) for line in f]
657
+
658
+ # Category distribution
659
+ all_categories = []
660
+ for item in data:
661
+ all_categories.extend(item['labels']['categories'])
662
+ category_dist = Counter(all_categories)
663
+
664
+ # Multi-label frequency
665
+ multi_label_count = sum(1 for item in data if len(item['labels']['categories']) > 1)
666
+ multi_label_freq = multi_label_count / len(data)
667
+
668
+ # Turn count distribution
669
+ turn_counts = [item['metadata']['turn_count'] for item in data]
670
+ avg_turns = sum(turn_counts) / len(turn_counts)
671
+
672
+ # Persistence distribution
673
+ persistence_dist = Counter(item['labels']['persistence_horizon'] for item in data)
674
+
675
+ # Memory scope distribution
676
+ scope_dist = Counter(item['labels']['memory_scope'] for item in data)
677
+
678
+ return {
679
+ 'total_examples': len(data),
680
+ 'category_distribution': dict(category_dist),
681
+ 'multi_label_frequency': multi_label_freq,
682
+ 'avg_turns_per_conversation': avg_turns,
683
+ 'persistence_distribution': dict(persistence_dist),
684
+ 'scope_distribution': dict(scope_dist),
685
+ 'warnings': _generate_warnings(category_dist, multi_label_freq, avg_turns)
686
+ }
687
+
688
+ def _generate_warnings(cat_dist, ml_freq, avg_turns):
689
+ warnings = []
690
+
691
+ # Check for imbalanced categories
692
+ total = sum(cat_dist.values())
693
+ for cat, count in cat_dist.items():
694
+ if count / total < 0.05:
695
+ warnings.append(f"Category '{cat}' underrepresented: {count/total:.1%}")
696
+
697
+ # Check multi-label frequency
698
+ if ml_freq < 0.15:
699
+ warnings.append(f"Low multi-label frequency: {ml_freq:.1%} (target: 20-25%)")
700
+
701
+ # Check turn length
702
+ if avg_turns < 5 or avg_turns > 8:
703
+ warnings.append(f"Average turns out of range: {avg_turns:.1f} (target: 6.5±1.5)")
704
+
705
+ return warnings
706
+
707
+ # Usage
708
+ metrics = validate_synthetic_data('train.jsonl')
709
+ print(json.dumps(metrics, indent=2))
710
+ ```
711
+
712
+ ---
713
+
714
+ ## Prompt Engineering Tips
715
+
716
+ ### For Better Diversity:
717
+ 1. Raise the `temperature` parameter (e.g., to 1.0) to encourage more varied outputs
718
+ 2. Add "Generate a DIFFERENT conversation that..." to avoid repetition
719
+ 3. Provide counter-examples: "Don't make it like this: [generic example]"
720
+ 4. Use different starting phrases: "Create", "Generate", "Produce", "Design"
721
+
722
+ ### For Better Quality:
723
+ 1. Include GOOD and BAD examples in prompt
724
+ 2. Specify "Be specific, not generic" multiple times
725
+ 3. Add negative instructions: "Avoid phrases like 'let's think about', 'going forward'"
726
+ 4. Request concrete numbers, names, specific tools/platforms
727
+
728
+ ### For Better Label Accuracy:
729
+ 1. Show the model 3-5 labeled examples before asking for labels
730
+ 2. Use chain-of-thought: "First, identify the durable facts. Then, assign categories."
731
+ 3. Add calibration: "Be conservative - most conversations should have 1-2 categories, not 4+"
732
+ 4. Include edge cases in few-shot examples (generic vs specific, multi-label)
733
+
734
+ ---
735
+
736
+ ## Common Pitfalls & How to Avoid Them
737
+
738
+ ### Pitfall 1: Generic, Vague Content
739
+ **Bad Example:** "We value innovation and customer focus."
740
+ **Good Example:** "We use contractions in all customer-facing copy to sound conversational, and we never say 'utilize' when 'use' works fine."
741
+
742
+ **Fix:** Add to prompt: "Include SPECIFIC details like exact phrases, numbers, tool names, or concrete examples."
743
+
744
+ ### Pitfall 2: Over-labeling
745
+ **Bad:** Every conversation gets 3-4 categories
746
+ **Good:** Most conversations get 1-2 categories, some get 0 (none)
747
+
748
+ **Fix:** Emphasize in teacher prompt: "Be conservative. Most conversations are noise or context-specific."
749
+
750
+ ### Pitfall 3: Unrealistic Dialogue
751
+ **Bad:** "Hello, I would like to discuss our brand positioning strategy and establish core value propositions."
752
+ **Good:** "Hey, quick question about our brand voice - are we still doing the contractions thing in emails?"
753
+
754
+ **Fix:** Add natural language examples and specify: "Make it sound like a real conversation, including filler words, casual language, and natural transitions."
755
+
756
+ ### Pitfall 4: Missing Edge Cases
757
+ **Bad:** Every example is clean and obvious
758
+ **Good:** 20% have ambiguity, distractors, or edge cases
759
+
760
+ **Fix:** Explicitly generate "hard negative" scenarios: near-misses, multi-label, heavy distractors
761
+
762
+ ### Pitfall 5: Persistence Mismatch
763
+ **Bad:** "Our Q1 campaign goal" labeled as long-term
764
+ **Good:** "Our Q1 campaign goal" labeled as short-term (company.business_priorities)
765
+
766
+ **Fix:** Include persistence definitions in EVERY prompt and validate programmatically
767
+
768
+ ---
769
+
770
+ ## Summary Checklist
771
+
772
+ Before finalizing your synthetic dataset:
773
+
774
+ - [ ] 1,500-2,000 total examples generated
775
+ - [ ] All 13 categories represented, each meeting its coverage-matrix minimum (60-100 examples per category)
776
+ - [ ] 20-25% multi-label examples
777
+ - [ ] 10-15% "none" examples
778
+ - [ ] 30% with intentional distractors
779
+ - [ ] Teacher labeling agreement >95%
780
+ - [ ] Average 6.5 ± 1.5 turns per conversation
781
+ - [ ] Persistence distribution: ~35% long, ~30% medium, ~25% short, ~10% mixed
782
+ - [ ] Human review of 100 random samples (quality check)
783
+ - [ ] Train/test split (80/20, stratified by category)
784
+ - [ ] Documentation of generation process and prompts saved
docs/tinker_docs.md ADDED
The diff for this file is too large to render. See raw diff
 
huggingface/README.md ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ tags:
6
+ - memory-routing
7
+ - marketing
8
+ - classification
9
+ - llama
10
+ - lora
11
+ - tinker
12
+ base_model: meta-llama/Llama-3.1-8B
13
+ datasets:
14
+ - muratcankoylan/memory-routing-marketing
15
+ metrics:
16
+ - f1
17
+ - accuracy
18
+ pipeline_tag: text-classification
19
+ ---
20
+
21
+ # Memory Routing Agent (Llama-8B + LoRA)
22
+
23
+ A specialized 8B parameter model that **outperforms 104B models** on marketing conversation classification.
24
+
25
+ ## Key Results
26
+
27
+ | Model | Size | Avg F1 | Exact Match |
28
+ |-------|------|--------|-------------|
29
+ | **This Model** | 8B | **0.68** | **60%** |
30
+ | Cohere Command-R-Plus | 104B | 0.61 | 26% |
31
+
32
+ **11.1% higher F1** than the 104B teacher model that generated its training data.
33
+
34
+ ## Model Description
35
+
36
+ The Memory Routing Agent classifies marketing conversations into 13 memory categories:
37
+
38
+ ### Company Categories
39
+ - `company.brand_core` - Voice, values, positioning
40
+ - `company.strategic_signatures` - Decision frameworks
41
+ - `company.knowledge_artifacts` - Docs, style guides
42
+ - `company.business_priorities` - Quarterly goals
43
+ - `company.tools_config` - Integrations, APIs
44
+ - `company.performance_context` - Campaign metrics
45
+
46
+ ### User Categories
47
+ - `user.communication_style` - Tone, format preferences
48
+ - `user.strategic_approach` - Personal priorities
49
+ - `user.role_context` - Title, scope
50
+ - `user.workflow_patterns` - Review cadence
51
+ - `user.session_history` - Immediate context
52
+ - `user.interaction_preferences` - Coaching style
53
+
54
+ ### Special
55
+ - `none` - Transactional or irrelevant content
56
+
57
+ ## Training
58
+
59
+ - **Base Model**: meta-llama/Llama-3.1-8B
60
+ - **Method**: LoRA (rank 32) + SFT + RL
61
+ - **Platform**: Tinker (Thinking Machines)
62
+ - **Dataset**: 2,001 synthetic marketing conversations
63
+ - **Teacher**: Cohere Command-R-Plus (104B)
64
+
65
+ ### Training Pipeline
66
+
67
+ 1. **SFT Phase**: 100 steps, batch size 128, cross-entropy loss
68
+ 2. **RL Phase**: 12 iterations, importance sampling policy gradient
69
+ 3. **Reward**: 0.6×F1 + 0.2×temporal + 0.1×parity + 0.1×efficiency
70
+
71
+ ## Usage
72
+
73
+ ```python
74
+ # Note: This model was trained on Tinker platform
75
+ # The checkpoint is: tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012
76
+
77
+ import tinker
78
+ from tinker import types
79
+ from tinker_cookbook import renderers
80
+ from tinker_cookbook.tokenizer_utils import get_tokenizer
81
+
82
+ service_client = tinker.ServiceClient()
83
+ checkpoint = "tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012"
84
+ sampling_client = service_client.create_sampling_client(model_path=checkpoint)
85
+
86
+ tokenizer = get_tokenizer("meta-llama/Llama-3.1-8B")
87
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
88
+
89
+ conversation = """
90
+ USER: Our brand voice is professional but approachable.
91
+ ASSISTANT: So authoritative content with a conversational tone?
92
+ USER: Exactly. We never use jargon without explaining it first.
93
+ """
94
+
95
+ messages = [
96
+ {"role": "system", "content": "You route marketing conversations into structured memory categories..."},
97
+ {"role": "user", "content": f"Analyze this conversation:\n\n{conversation}"}
98
+ ]
99
+
100
+ prompt = renderer.build_generation_prompt(messages)
101
+ params = types.SamplingParams(max_tokens=100, temperature=0.1, stop=renderer.get_stop_sequences())
102
+ result = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1).result()
103
+
104
+ response, _ = renderer.parse_response(result.sequences[0].tokens)
105
+ print(f"Categories: {response['content']}")
106
+ # Output: company.brand_core
107
+ ```
108
+
109
+ ## Benchmark
110
+
111
+ 50 challenging marketing scenarios across 7 domains:
112
+
113
+ | Difficulty | Our Model | Cohere (104B) |
114
+ |------------|-----------|---------------|
115
+ | Easy | 0.86 F1 | 0.48 F1 |
116
+ | Medium | 0.65 F1 | 0.64 F1 |
117
+ | Hard | 0.50 F1 | 0.72 F1 |
118
+
119
+ ## Limitations
120
+
121
+ - Under-predicts multi-label scenarios
122
+ - Sometimes confuses company vs user categories
123
+ - Marketing-specific; not tested on other domains
124
+
125
+ ## Citation
126
+
127
+ ```bibtex
128
+ @misc{memory-routing-agent-2024,
129
+ title={Memory Routing Agent: Prompt Distillation for Marketing AI},
130
+ author={Muratcan Koylan},
131
+ year={2024},
132
+ howpublished={\url{https://github.com/muratcankoylan/memory-routing-agent}},
133
+ }
134
+ ```
135
+
136
+ ## Links
137
+
138
+ - **GitHub**: [muratcankoylan/memory-routing-agent](https://github.com/muratcankoylan/memory-routing-agent)
139
+ - **Training Platform**: [Tinker by Thinking Machines](https://thinkingmachines.ai/)
140
+
huggingface/upload_to_hf.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload Memory Routing Agent to HuggingFace Hub
3
+
4
+ This script uploads:
5
+ 1. Model card (README.md)
6
+ 2. Training dataset
7
+ 3. Benchmark dataset
8
+ 4. Training configuration
9
+ """
10
+
11
+ import os
12
+ import json
13
+ from dotenv import load_dotenv
14
+ from huggingface_hub import HfApi, create_repo, upload_file, upload_folder, login
15
+
16
load_dotenv()

# Configuration
HF_TOKEN = os.getenv("HF_TOKEN")
REPO_ID = "muratcankoylan/memory-routing-agent"
DATASET_REPO_ID = "muratcankoylan/memory-routing-marketing"

# Login first
if HF_TOKEN:
    # Only a short prefix is echoed so the secret never lands in logs in full.
    print(f"Logging in with token (first 10 chars): {HF_TOKEN[:10]}...")
    login(token=HF_TOKEN)
else:
    print("ERROR: HF_TOKEN not found in .env file")
    # exit() is an interactive/site-module helper and may be absent under
    # `python -S` or in frozen builds; SystemExit is the reliable script exit.
    raise SystemExit(1)
30
+
31
def upload_model():
    """Upload the model card, benchmark file, and training config to the HF model repo.

    Relies on the module-level HF_TOKEN / REPO_ID globals and expects to be run
    from the repository root (all file paths below are relative to it).
    Prints progress as each artifact is uploaded.
    """
    # Create model repo; exist_ok makes this idempotent across reruns.
    try:
        create_repo(repo_id=REPO_ID, token=HF_TOKEN, exist_ok=True)
        print(f"Created/verified repo: {REPO_ID}")
    except Exception as e:
        print(f"Repo creation note: {e}")

    # Upload README (model card)
    upload_file(
        path_or_fileobj="huggingface/README.md",
        path_in_repo="README.md",
        repo_id=REPO_ID,
        token=HF_TOKEN,
    )
    print("Uploaded model card")

    # Upload benchmark
    upload_file(
        path_or_fileobj="training/benchmarks/marketing_routing_benchmark.json",
        path_in_repo="benchmark/marketing_routing_benchmark.json",
        repo_id=REPO_ID,
        token=HF_TOKEN,
    )
    print("Uploaded benchmark")

    # Training configuration snapshot; kept in sync with the actual run settings.
    config = {
        "base_model": "meta-llama/Llama-3.1-8B",
        "lora_rank": 32,
        "sft_steps": 100,
        "sft_batch_size": 128,
        "sft_learning_rate": 2.86e-4,
        "rl_iterations": 12,
        "rl_groups_per_batch": 64,
        "rl_group_size": 32,
        "rl_learning_rate": 2e-5,
        "tinker_checkpoint": "tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012",
        "reward_weights": {
            "f1": 0.6,
            "temporal": 0.2,
            "parity": 0.1,
            "efficiency": 0.1
        }
    }

    # Materialize the config locally before upload so it is versioned on disk too.
    with open("huggingface/config.json", "w") as f:
        json.dump(config, f, indent=2)

    upload_file(
        path_or_fileobj="huggingface/config.json",
        path_in_repo="config.json",
        repo_id=REPO_ID,
        token=HF_TOKEN,
    )
    print("Uploaded config")

    print(f"\nModel uploaded to: https://huggingface.co/{REPO_ID}")
92
+
93
+
94
def upload_dataset():
    """Upload the training dataset and its README to a HF dataset repo.

    Relies on the module-level HF_TOKEN / DATASET_REPO_ID globals. Writes the
    dataset card to huggingface/dataset_readme.md locally, then pushes it and
    the merged training JSONL. Prints progress as each artifact is uploaded.
    """
    # Create dataset repo; exist_ok makes this idempotent across reruns.
    try:
        create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        print(f"Created/verified dataset repo: {DATASET_REPO_ID}")
    except Exception as e:
        print(f"Dataset repo creation note: {e}")

    # Dataset card (YAML front matter + markdown) written verbatim to the repo.
    dataset_readme = """---
license: apache-2.0
language:
- en
tags:
- memory-routing
- marketing
- classification
- synthetic
size_categories:
- 1K<n<10K
---

# Memory Routing Marketing Dataset

2,001 synthetic marketing conversations for training memory routing classifiers.

## Dataset Description

This dataset contains marketing conversations labeled with memory categories. Each conversation includes:
- Multi-turn dialogue between a user and AI assistant
- Category labels (13 possible categories)
- Persistence horizon (long/medium/short)
- Memory scope (company/user/none)

## Categories

### Company Categories
- `company.brand_core` - Voice, values, positioning
- `company.strategic_signatures` - Decision frameworks
- `company.knowledge_artifacts` - Docs, style guides
- `company.business_priorities` - Quarterly goals
- `company.tools_config` - Integrations, APIs
- `company.performance_context` - Campaign metrics

### User Categories
- `user.communication_style` - Tone, format preferences
- `user.strategic_approach` - Personal priorities
- `user.role_context` - Title, scope
- `user.workflow_patterns` - Review cadence
- `user.session_history` - Immediate context
- `user.interaction_preferences` - Coaching style

### Special
- `none` - Transactional or irrelevant content

## Generation

Generated using Cohere Command-R-Plus (104B) as teacher model with diverse prompts covering:
- Multiple industries (tech, retail, healthcare, finance, etc.)
- Various user roles (CMO, VP Marketing, Growth Lead, etc.)
- Different conversation styles and complexities

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("muratcankoylan/memory-routing-marketing")
```

## Citation

```bibtex
@misc{memory-routing-agent-2024,
  title={Memory Routing Agent: Prompt Distillation for Marketing AI},
  author={Muratcan Koylan},
  year={2024},
  howpublished={\\url{https://github.com/muratcankoylan/memory-routing-agent}},
}
```
"""

    with open("huggingface/dataset_readme.md", "w") as f:
        f.write(dataset_readme)

    upload_file(
        path_or_fileobj="huggingface/dataset_readme.md",
        path_in_repo="README.md",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print("Uploaded dataset README")

    # Upload training data
    upload_file(
        path_or_fileobj="synthetic_data/merged_training_dataset_2001.jsonl",
        path_in_repo="data/train.jsonl",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print("Uploaded training data")

    print(f"\nDataset uploaded to: https://huggingface.co/datasets/{DATASET_REPO_ID}")
202
+
203
+
204
if __name__ == "__main__":
    # Script entry point: push model artifacts first, then the dataset.
    banner = "=" * 60
    print(banner)
    print("Uploading Memory Routing Agent to HuggingFace")
    print(banner)

    print("\n1. Uploading model...")
    upload_model()

    print("\n2. Uploading dataset...")
    upload_dataset()

    print("\n" + banner)
    print("UPLOAD COMPLETE")
    print(banner)
    print(f"Model: https://huggingface.co/{REPO_ID}")
    print(f"Dataset: https://huggingface.co/datasets/{DATASET_REPO_ID}")
220
+
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ tinker-toolkit>=0.1.0
3
+ python-dotenv>=1.0.0
4
+ cohere>=5.0.0
5
+
6
+ # ML/Data processing
7
+ torch>=2.0.0
8
+ numpy>=1.24.0
9
+ transformers>=4.35.0
10
+ huggingface-hub>=0.19.0
11
+
12
+ # Utilities
13
+ tqdm>=4.65.0
14
+
synthetic_data/README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Synthetic Data Generation Pipeline
2
+
3
+ This directory contains the tools for generating and validating synthetic training data using Cohere's `command-a-reasoning-08-2025` model.
4
+
5
+ ## Setup
6
+
7
+ 1. **Install Dependencies**:
8
+ ```bash
9
+ python3 -m venv venv
10
+ source venv/bin/activate
11
+ pip install cohere python-dotenv tinker tinker-cookbook
12
+ ```
13
+
14
+ 2. **Environment Variables**:
15
+ Ensure your `.env` file contains your Cohere API key:
16
+ ```
17
+ COHERE_API_KEY=your_api_key_here
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ ### 1. Generate Data
23
+ Use the `SyntheticDataPipeline` class to generate data batches.
24
+
25
+ ```python
26
+ from synthetic_data.pipeline import SyntheticDataPipeline
27
+
28
+ pipeline = SyntheticDataPipeline()
29
+ # Generate 10 examples for a specific category
30
+ results = pipeline.run_batch(count=10, category="company.brand_core")
31
+ ```
32
+
33
+ You can also run the sample generator script:
34
+ ```bash
35
+ python3 synthetic_data/generate_sample.py
36
+ ```
37
+
38
+ ### 2. Validate Data
39
+ Run the validation script on any generated JSON or JSONL file to check compliance with the schema and distribution targets.
40
+
41
+ ```bash
42
+ python3 synthetic_data/validate.py synthetic_data/sample_batch.json
43
+ ```
44
+
45
+ The validator checks:
46
+ * JSON structure and required fields
47
+ * Category distribution
48
+ * Multi-label frequency
49
+ * Conversation length
50
+ * Persistence and scope consistency
51
+
52
+ ## Pipeline Components
53
+
54
+ * `pipeline.py`: Core logic for 2-stage generation (Scenario -> Conversation) using Cohere.
55
+ * `validate.py`: Quality assurance script implementing checks from `docs/synthetic_data.md`.
56
+ * `test_pipeline.py`: Unit tests for the pipeline structure.
57
+ * `generate_sample.py`: Helper script to produce a quick sample batch.
58
+
synthetic_data/all_generated_data_1000.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/balanced_async_log.txt ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ======================================================================
2
+ BALANCED CONCURRENT DATASET GENERATION
3
+ ======================================================================
4
+ Target per category: 77
5
+ Total categories: 13
6
+ Expected total: 1001
7
+ Batch size: 10 concurrent requests
8
+
9
+
10
+ [Batch 1] Generating 10 items...
11
+ Success: 10/10 | Total: 10/1001
12
+
13
+ [Batch 2] Generating 10 items...
14
+ Success: 10/10 | Total: 20/1001
15
+
16
+ [Batch 3] Generating 10 items...
17
+ Success: 10/10 | Total: 30/1001
18
+
19
+ [Batch 4] Generating 10 items...
20
+ Success: 10/10 | Total: 40/1001
21
+
22
+ [Batch 5] Generating 10 items...
23
+ Success: 10/10 | Total: 50/1001
24
+
25
+ [Batch 6] Generating 10 items...
26
+ Success: 10/10 | Total: 60/1001
27
+
28
+ [Batch 7] Generating 10 items...
29
+ Success: 10/10 | Total: 70/1001
30
+
31
+ [Batch 8] Generating 10 items...
32
+ Success: 10/10 | Total: 80/1001
33
+
34
+ [Batch 9] Generating 10 items...
35
+ Success: 10/10 | Total: 90/1001
36
+
37
+ [Batch 10] Generating 10 items...
38
+ Success: 10/10 | Total: 100/1001
39
+
40
+ Category Progress:
41
+ company.brand_core [█████░░░░░░░░░░░░░░░] 20/77
42
+ company.business_priorities [█████░░░░░░░░░░░░░░░] 20/77
43
+ company.knowledge_artifacts [█████░░░░░░░░░░░░░░░] 20/77
44
+ company.performance_context [░░░░░░░░░░░░░░░░░░░░] 0/77
45
+ company.strategic_signatures [█████░░░░░░░░░░░░░░░] 20/77
46
+ company.tools_config [█████░░░░░░░░░░░░░░░] 20/77
47
+ none [░░░░░░░░░░░░░░░░░░░░] 0/77
48
+ user.communication_style [░░░░░░░░░░░░░░░░░░░░] 0/77
49
+ user.interaction_preferences [░░░░░░░░░░░░░░░░░░░░] 0/77
50
+ user.role_context [░░░░░░░░░░░░░░░░░░░░] 0/77
51
+ user.session_history [░░░░░░░░░░░░░░░░░░░░] 0/77
52
+ user.strategic_approach [░░░░░░░░░░░░░░░░░░░░] 0/77
53
+ user.workflow_patterns [░░░░░░░░░░░░░░░░░░░░] 0/77
54
+
55
+ [Batch 11] Generating 10 items...
56
+ Success: 10/10 | Total: 110/1001
57
+
58
+ [Batch 12] Generating 10 items...
59
+ Success: 10/10 | Total: 120/1001
60
+
61
+ [Batch 13] Generating 10 items...
62
+ Success: 10/10 | Total: 130/1001
63
+
64
+ [Batch 14] Generating 10 items...
65
+ Success: 10/10 | Total: 140/1001
66
+
67
+ [Batch 15] Generating 10 items...
synthetic_data/balanced_generation_log.txt ADDED
File without changes
synthetic_data/balanced_generation_log_20251124_184530.txt ADDED
File without changes
synthetic_data/clean_batch.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+
4
def clean_batch(filepath):
    """Remove contradictory 'none' labels from a JSONL batch file.

    Reads `filepath` line by line, drops the 'none' category from any item
    that also carries real categories, and writes every item (fixed or not)
    to a sibling `*_cleaned.jsonl` file. Fixed items are flagged via
    metadata['cleaned_none_mix'] so downstream tooling can audit the change.
    """
    print(f"Cleaning {filepath}...")
    cleaned_data = []
    fixed_count = 0

    with open(filepath, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            cats = item['labels']['categories']

            # 'none' alongside real categories is contradictory; keep the real ones.
            if 'none' in cats and len(cats) > 1:
                print(f"Fixing mixed 'none' in {item['scenario_id']}: {cats}")
                cats.remove('none')
                item['labels']['categories'] = cats
                # setdefault: not every generated item is guaranteed to carry
                # a metadata dict; the plain index would KeyError on those.
                item.setdefault('metadata', {})['cleaned_none_mix'] = True
                fixed_count += 1

            cleaned_data.append(item)

    # str.replace is a no-op when '.jsonl' is absent, which previously made
    # output_path == filepath and silently overwrote the input file.
    if filepath.endswith('.jsonl'):
        output_path = filepath[:-len('.jsonl')] + '_cleaned.jsonl'
    else:
        output_path = filepath + '_cleaned.jsonl'
    with open(output_path, 'w') as f:
        for item in cleaned_data:
            f.write(json.dumps(item) + '\n')

    print(f"Cleaned {len(cleaned_data)} items. Fixed {fixed_count} issues.")
    print(f"Saved to {output_path}")
32
+
33
if __name__ == "__main__":
    # CLI entry point: a single JSONL path is required.
    args = sys.argv[1:]
    if not args:
        print("Usage: python3 clean_batch.py <jsonl_file>")
        sys.exit(1)
    clean_batch(args[0])
38
+
synthetic_data/clean_data.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+ from typing import List, Dict, Any
4
+
5
def clean_datum(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Clean a single data item (mutated in place and also returned):
    1. Deduplicate categories, preserving first-seen order.
    2. Remove 'none' if other categories are present.

    Items without a labels.categories structure are returned unchanged.
    """
    if "labels" not in item or "categories" not in item["labels"]:
        return item

    cats = item["labels"]["categories"]
    # Deduplicate with a stable order. list(set(...)) would make the label
    # ordering non-deterministic across runs, which breaks reproducibility
    # of the generated training files.
    cats = list(dict.fromkeys(cats))

    # Remove 'none' if other categories exist
    if len(cats) > 1 and "none" in cats:
        cats.remove("none")

    # Update the item
    item["labels"]["categories"] = cats
    return item
26
+
27
def clean_file(input_path: str, output_path: str):
    """Clean every item in a JSON or JSONL file and write the result as JSONL.

    Accepts either a single JSON array or JSON-lines input, runs each item
    through clean_datum, and counts how many items actually changed
    (deduplicated categories or a dropped 'none').
    """
    print(f"Cleaning {input_path} -> {output_path}")
    cleaned_count = 0
    data = []

    # Read input
    with open(input_path, 'r') as f:
        content = f.read().strip()
        if not content:
            print("Empty file")
            return

    # Handle JSONL or list of JSON
    if content.startswith('[') and content.endswith(']'):
        raw_data = json.loads(content)
    else:
        raw_data = [json.loads(line) for line in content.split('\n') if line.strip()]

    # Process
    for item in raw_data:
        original_cats = item.get("labels", {}).get("categories", [])
        cleaned_item = clean_datum(item)
        # clean_datum passes malformed items (no labels/categories) through
        # untouched, so read defensively instead of indexing — the plain
        # cleaned_item["labels"]["categories"] access raised KeyError here.
        new_cats = cleaned_item.get("labels", {}).get("categories", [])

        if set(original_cats) != set(new_cats):
            cleaned_count += 1

        data.append(cleaned_item)

    # Write output (always as JSONL for training)
    with open(output_path, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

    print(f"Processed {len(data)} items. Cleaned {cleaned_count} items (removed 'none' or duplicates).")
62
+
63
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python clean_data.py input_file [output_file]")
        sys.exit(1)

    input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    else:
        # Derive '<stem>_cleaned.jsonl' from the input name. The previous
        # chained str.replace was buggy: '.json' is a substring of '.jsonl',
        # so 'x.jsonl' became 'x_cleaned_cleaned.jsonll'.
        stem = input_file
        for ext in ('.jsonl', '.json'):
            if stem.endswith(ext):
                stem = stem[:-len(ext)]
                break
        output_file = stem + '_cleaned.jsonl'

    clean_file(input_file, output_file)
72
+
synthetic_data/debug_key.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv
import cohere

load_dotenv()

key = os.getenv("CO_API_KEY")

# Smoke-test the Cohere API key: report presence, then fire one minimal chat
# request to confirm the key is actually accepted by the API.
if not key:
    print("Key NOT found in environment.")
else:
    print(f"Key found. Length: {len(key)}")
    print(f"Prefix: {key[:4]}...")

    try:
        client = cohere.ClientV2(api_key=key)
        print("Client initialized.")
        print("Testing simple chat...")
        hello_turn = [{"role": "user", "content": "Hello"}]
        response = client.chat(
            model="command-r-plus",  # Use a cheaper/standard model for quick test
            messages=hello_turn,
        )
        print("Response received!")
        print(response)
    except Exception as e:
        print(f"Error: {e}")
28
+
synthetic_data/debug_key_raw.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv

load_dotenv()

key = os.getenv("CO_API_KEY")

# Debug helper: dump the raw key so whitespace/quoting problems in .env are
# visible. NOTE: this prints the secret verbatim — never commit its output.
if not key:
    print("Key NOT found.")
else:
    print(f"Key raw: '{key}'")
    print(f"Length: {len(key)}")
13
+
synthetic_data/diverse_dataset_20251124_192207.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/diverse_generation_log.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ======================================================================
2
+ WORLD-CLASS DATASET GENERATION
3
+ ======================================================================
4
+ Temperature: 0.95 | Max creative freedom
5
+ Target: 77 per category x 13 = 1001 total
6
+ Output: synthetic_data/diverse_dataset_20251124_190248.jsonl
7
+ ======================================================================
8
+
9
+ [Batch 1] 10 items | Categories: {'none', 'company.business_priorities', 'company.performance_context', 'user.session_history', 'company.brand_core', 'user.interaction_preferences', 'company.strategic_signatures', 'user.strategic_approach'}
synthetic_data/diverse_log.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/generate_sample.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from synthetic_data.pipeline import SyntheticDataPipeline
3
+
4
def generate_sample():
    """Generate a tiny 2-item batch for company.brand_core and save it as JSON."""
    data_pipeline = SyntheticDataPipeline()
    print("Generating sample batch...")
    batch = data_pipeline.run_batch(count=2, category="company.brand_core")

    with open("synthetic_data/sample_batch.json", "w") as out:
        json.dump(batch, out, indent=2)
    print(f"Saved {len(batch)} examples to synthetic_data/sample_batch.json")
12
+
13
if __name__ == "__main__":
    # Script entry point: run one quick sample-batch generation.
    generate_sample()
15
+
synthetic_data/generation_log_100.txt ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Starting generation of 100 items in 10 batches (Size: 10)...
2
+
3
+ === Processing Batch 1/10 ===
4
+ Generating item 1/10 (Category: user.strategic_approach)...
5
+ Generated: company.strategic_signatures_compliance_curriculum_142
6
+ Sleeping for 10s to avoid rate limits...
7
+ Generating item 2/10 (Category: company.business_priorities)...
8
+ Generated: company.business_priorities_descriptive_456
9
+ Sleeping for 10s to avoid rate limits...
10
+ Generating item 3/10 (Category: user.workflow_patterns)...
11
+ Generated: company.strategic_signatures_descriptive_742
12
+ Sleeping for 10s to avoid rate limits...
13
+ Generating item 4/10 (Category: user.session_history)...
14
+ Generated: company.tools_config_workflow_157
15
+ Sleeping for 10s to avoid rate limits...
16
+ Generating item 5/10 (Category: none)...
17
+ Generated: company.business_priorities_descriptive_456
18
+ Sleeping for 10s to avoid rate limits...
19
+ Generating item 6/10 (Category: company.knowledge_artifacts)...
20
+ Scenario generation failed (attempt 1/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': 'f32c28401e57e5a48a02968561190d5c', 'date': 'Fri, 21 Nov 2025 21:02:43 GMT', 'x-envoy-upstream-service-time': '9', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': '16af4d33-1859-4a1e-b2c5-bab9a414a063', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
21
+ Retrying in 5s...
22
+ Generated: company.business_priorities_CPG_seasonal_launch_742
23
+ Sleeping for 10s to avoid rate limits...
24
+ Generating item 7/10 (Category: user.strategic_approach)...
25
+ Generated: company.strategic_signatures_retention_742
26
+ Sleeping for 10s to avoid rate limits...
27
+ Generating item 8/10 (Category: user.workflow_patterns)...
28
+ Generated: company_business_priorities_456
29
+ Sleeping for 10s to avoid rate limits...
30
+ Generating item 9/10 (Category: company.tools_config)...
31
+ Generated: company.tools_config_fundraising_123
32
+ Sleeping for 10s to avoid rate limits...
33
+ Generating item 10/10 (Category: user.role_context)...
34
+ Generated: company.business_priorities_retention_strategy_123
35
+ Sleeping for 10s to avoid rate limits...
36
+ Saved batch to synthetic_data/batch_01.json
37
+ Validating batch...
38
+ {
39
+ "total_examples": 10,
40
+ "category_distribution": {
41
+ "company.strategic_signatures": 5,
42
+ "company.business_priorities": 9,
43
+ "user.strategic_approach": 5,
44
+ "user.workflow_patterns": 3,
45
+ "company.performance_context": 5,
46
+ "user.role_context": 7,
47
+ "company.knowledge_artifacts": 2,
48
+ "company.tools_config": 2,
49
+ "user.session_history": 1,
50
+ "user.interaction_preferences": 1,
51
+ "none": 1
52
+ },
53
+ "multi_label_frequency": 1.0,
54
+ "avg_turns_per_conversation": 6.4,
55
+ "persistence_distribution": {
56
+ "mixed": 7,
57
+ "medium": 2,
58
+ "short": 1
59
+ },
60
+ "scope_distribution": {
61
+ "mixed": 10
62
+ },
63
+ "warnings": []
64
+ }
65
+
66
+ === Processing Batch 2/10 ===
67
+ Generating item 1/10 (Category: company.business_priorities)...
68
+ Generated: company.business_priorities_sponsorship_742
69
+ Sleeping for 10s to avoid rate limits...
70
+ Generating item 2/10 (Category: company.tools_config)...
71
+ Generated: company.tools_config_collaborative_configuration_142
72
+ Sleeping for 10s to avoid rate limits...
73
+ Generating item 3/10 (Category: user.session_history)...
74
+ Generated: user_session_history_456
75
+ Sleeping for 10s to avoid rate limits...
76
+ Generating item 4/10 (Category: user.interaction_preferences)...
77
+ Generated: company.knowledge_artifacts_EdTech_strategy_refinement_451
78
+ Sleeping for 10s to avoid rate limits...
79
+ Generating item 5/10 (Category: user.strategic_approach)...
80
+ Generated: company_strategic_signatures_123
81
+ Sleeping for 10s to avoid rate limits...
82
+ Generating item 6/10 (Category: user.communication_style)...
83
+ Generated: company.brand_core_collaborative_messaging_142
84
+ Sleeping for 10s to avoid rate limits...
85
+ Generating item 7/10 (Category: user.communication_style)...
86
+ Generated: company.strategic_signatures_donor_messaging_742
87
+ Sleeping for 10s to avoid rate limits...
88
+ Generating item 8/10 (Category: company.knowledge_artifacts)...
89
+ Generated: company.knowledge_artifacts_onboarding_742
90
+ Sleeping for 10s to avoid rate limits...
91
+ Generating item 9/10 (Category: user.workflow_patterns)...
92
+ Generated: company_tools_config_742
93
+ Sleeping for 10s to avoid rate limits...
94
+ Generating item 10/10 (Category: company.knowledge_artifacts)...
95
+ Generated: company.knowledge_artifacts_retail_452
96
+ Sleeping for 10s to avoid rate limits...
97
+ Saved batch to synthetic_data/batch_02.json
98
+ Validating batch...
99
+ {
100
+ "total_examples": 10,
101
+ "category_distribution": {
102
+ "company.business_priorities": 5,
103
+ "company.knowledge_artifacts": 7,
104
+ "company.strategic_signatures": 4,
105
+ "user.strategic_approach": 6,
106
+ "company.tools_config": 3,
107
+ "company.performance_context": 2,
108
+ "user.role_context": 3,
109
+ "user.session_history": 2,
110
+ "company.brand_core": 3,
111
+ "none": 4,
112
+ "user.interaction_preferences": 1,
113
+ "user.workflow_patterns": 3,
114
+ "user.communication_style": 2
115
+ },
116
+ "multi_label_frequency": 1.0,
117
+ "avg_turns_per_conversation": 7.6,
118
+ "persistence_distribution": {
119
+ "mixed": 8,
120
+ "medium": 1,
121
+ "long": 1
122
+ },
123
+ "scope_distribution": {
124
+ "mixed": 9,
125
+ "company": 1
126
+ },
127
+ "warnings": []
128
+ }
129
+
130
+ === Processing Batch 3/10 ===
131
+ Generating item 1/10 (Category: user.session_history)...
132
+ Generated: company.business_priorities_execution_729
133
+ Sleeping for 10s to avoid rate limits...
134
+ Generating item 2/10 (Category: company.business_priorities)...
135
+ Generated: company.business_priorities_planning_753
136
+ Sleeping for 10s to avoid rate limits...
137
+ Generating item 3/10 (Category: user.role_context)...
138
+ Generated: company.business_priorities_compliance_update_142
139
+ Sleeping for 10s to avoid rate limits...
140
+ Generating item 4/10 (Category: user.communication_style)...
141
+ Generated: user.strategic_approach_negotiation_742
142
+ Sleeping for 10s to avoid rate limits...
143
+ Generating item 5/10 (Category: none)...
144
+ Generated: company.business_priorities_coordination_742
145
+ Sleeping for 10s to avoid rate limits...
146
+ Generating item 6/10 (Category: user.workflow_patterns)...
147
+ Generated: company.performance_context_b2b_saas_456
148
+ Sleeping for 10s to avoid rate limits...
149
+ Generating item 7/10 (Category: user.session_history)...
150
+ Generated: company_business_priorities_campaign_adjustment_789
151
+ Sleeping for 10s to avoid rate limits...
152
+ Generating item 8/10 (Category: user.interaction_preferences)...
153
+ Generated: user_interaction_preferences_AI_Collaboration_456
154
+ Sleeping for 10s to avoid rate limits...
155
+ Generating item 9/10 (Category: company.performance_context)...
156
+ Generated: company.performance_context_quarterly_review_742
157
+ Sleeping for 10s to avoid rate limits...
158
+ Generating item 10/10 (Category: company.business_priorities)...
159
+ Generated: company.business_priorities_healthcare_123
160
+ Sleeping for 10s to avoid rate limits...
161
+ Saved batch to synthetic_data/batch_03.json
162
+ Validating batch...
163
+ {
164
+ "total_examples": 10,
165
+ "category_distribution": {
166
+ "company.business_priorities": 9,
167
+ "user.role_context": 7,
168
+ "company.performance_context": 4,
169
+ "none": 3,
170
+ "company.knowledge_artifacts": 5,
171
+ "user.session_history": 1,
172
+ "user.strategic_approach": 4,
173
+ "user.communication_style": 1,
174
+ "company.strategic_signatures": 2,
175
+ "user.workflow_patterns": 2,
176
+ "company.tools_config": 3,
177
+ "user.interaction_preferences": 1
178
+ },
179
+ "multi_label_frequency": 1.0,
180
+ "avg_turns_per_conversation": 7.6,
181
+ "persistence_distribution": {
182
+ "short": 3,
183
+ "mixed": 4,
184
+ "medium": 3
185
+ },
186
+ "scope_distribution": {
187
+ "mixed": 10
188
+ },
189
+ "warnings": []
190
+ }
191
+
192
+ === Processing Batch 4/10 ===
193
+ Generating item 1/10 (Category: user.strategic_approach)...
194
+ Generated: company.strategic_signatures_personalization_147
195
+ Sleeping for 10s to avoid rate limits...
196
+ Generating item 2/10 (Category: user.session_history)...
197
+ Generated: company.performance_context_procurement_742
198
+ Sleeping for 10s to avoid rate limits...
199
+ Generating item 3/10 (Category: company.business_priorities)...
200
+ Generated: company_business_priorities_123
201
+ Sleeping for 10s to avoid rate limits...
202
+ Generating item 4/10 (Category: none)...
203
+ Generated: company_performance_context_742
204
+ Sleeping for 10s to avoid rate limits...
205
+ Generating item 5/10 (Category: company.business_priorities)...
206
+ Generated: company.strategic_signatures_personalization_742
207
+ Sleeping for 10s to avoid rate limits...
208
+ Generating item 6/10 (Category: user.session_history)...
209
+ Scenario generation failed (attempt 1/3): Server disconnected without sending a response.
210
+ Retrying in 5s...
211
+ Generated: company.business_priorities_healthcare_campaign_742
212
+ Sleeping for 10s to avoid rate limits...
213
+ Generating item 7/10 (Category: user.communication_style)...
214
+ Generated: user.workflow_patterns_collaboration_142
215
+ Sleeping for 10s to avoid rate limits...
216
+ Generating item 8/10 (Category: user.interaction_preferences)...
217
+ Generated: company.tools_config_workflow_adjustment_742
218
+ Sleeping for 10s to avoid rate limits...
219
+ Generating item 9/10 (Category: company.brand_core)...
220
+ Generated: company_strategic_signatures_742
221
+ Sleeping for 10s to avoid rate limits...
222
+ Generating item 10/10 (Category: company.strategic_signatures)...
223
+ Generated: company.knowledge_artifacts_compliance_142
224
+ Sleeping for 10s to avoid rate limits...
225
+ Saved batch to synthetic_data/batch_04.json
226
+ Validating batch...
227
+ {
228
+ "total_examples": 10,
229
+ "category_distribution": {
230
+ "company.strategic_signatures": 5,
231
+ "company.performance_context": 5,
232
+ "company.tools_config": 4,
233
+ "user.strategic_approach": 5,
234
+ "user.role_context": 5,
235
+ "company.business_priorities": 7,
236
+ "none": 2,
237
+ "user.session_history": 2,
238
+ "company.brand_core": 2,
239
+ "user.communication_style": 2,
240
+ "user.workflow_patterns": 3,
241
+ "company.knowledge_artifacts": 2
242
+ },
243
+ "multi_label_frequency": 1.0,
244
+ "avg_turns_per_conversation": 7.8,
245
+ "persistence_distribution": {
246
+ "mixed": 8,
247
+ "medium": 1,
248
+ "long": 1
249
+ },
250
+ "scope_distribution": {
251
+ "mixed": 10
252
+ },
253
+ "warnings": []
254
+ }
255
+
256
+ === Processing Batch 5/10 ===
257
+ Generating item 1/10 (Category: company.strategic_signatures)...
258
+ Generated: company.strategic_signatures_pilot_142
259
+ Sleeping for 10s to avoid rate limits...
260
+ Generating item 2/10 (Category: user.communication_style)...
261
+ Generated: company_knowledge_artifacts_742
262
+ Sleeping for 10s to avoid rate limits...
263
+ Generating item 3/10 (Category: user.role_context)...
264
+ Generated: company.business_priorities_campaign_142
265
+ Sleeping for 10s to avoid rate limits...
266
+ Generating item 4/10 (Category: user.communication_style)...
267
+ Generated: user.communication_style_retention_strategy_147
268
+ Sleeping for 10s to avoid rate limits...
269
+ Generating item 5/10 (Category: user.strategic_approach)...
270
+ Generated: company.business_priorities_compliance_training_123
271
+ Sleeping for 10s to avoid rate limits...
272
+ Generating item 6/10 (Category: company.brand_core)...
273
+ Generated: company.business_priorities_demand_response_123
274
+ Sleeping for 10s to avoid rate limits...
275
+ Generating item 7/10 (Category: company.strategic_signatures)...
276
+ Generated: company.business_priorities_negotiation_147
277
+ Sleeping for 10s to avoid rate limits...
278
+ Generating item 8/10 (Category: company.brand_core)...
279
+ Generated: company.brand_core_proposal_standardization_742
280
+ Sleeping for 10s to avoid rate limits...
281
+ Generating item 9/10 (Category: company.business_priorities)...
282
+ Generated: company.business_priorities_PLG_789
283
+ Sleeping for 10s to avoid rate limits...
284
+ Generating item 10/10 (Category: user.strategic_approach)...
285
+ Generated: company.strategic_signatures_planning_123
286
+ Sleeping for 10s to avoid rate limits...
287
+ Saved batch to synthetic_data/batch_05.json
288
+ Validating batch...
289
+ {
290
+ "total_examples": 10,
291
+ "category_distribution": {
292
+ "company.strategic_signatures": 5,
293
+ "company.brand_core": 3,
294
+ "company.performance_context": 7,
295
+ "company.business_priorities": 8,
296
+ "user.role_context": 4,
297
+ "user.strategic_approach": 7,
298
+ "company.knowledge_artifacts": 4,
299
+ "user.communication_style": 2,
300
+ "user.workflow_patterns": 2,
301
+ "company.tools_config": 2,
302
+ "user.session_history": 2,
303
+ "none": 1
304
+ },
305
+ "multi_label_frequency": 1.0,
306
+ "avg_turns_per_conversation": 8.9,
307
+ "persistence_distribution": {
308
+ "mixed": 8,
309
+ "short": 1,
310
+ "long": 1
311
+ },
312
+ "scope_distribution": {
313
+ "mixed": 10
314
+ },
315
+ "warnings": []
316
+ }
317
+
318
+ === Processing Batch 6/10 ===
319
+ Generating item 1/10 (Category: company.knowledge_artifacts)...
320
+ Generated: company_performance_context_742
321
+ Sleeping for 10s to avoid rate limits...
322
+ Generating item 2/10 (Category: user.role_context)...
323
+ Generated: company.business_priorities_event_launch_coordination_452
324
+ Sleeping for 10s to avoid rate limits...
325
+ Generating item 3/10 (Category: user.role_context)...
326
+ Generated: company.business_priorities_developer_tools_142
327
+ Sleeping for 10s to avoid rate limits...
328
+ Generating item 4/10 (Category: company.brand_core)...
329
+ Generated: company_brand_core_742
330
+ Sleeping for 10s to avoid rate limits...
331
+ Generating item 5/10 (Category: company.business_priorities)...
332
+ Generated: company.strategic_signatures_fintech_147
333
+ Sleeping for 10s to avoid rate limits...
334
+ Generating item 6/10 (Category: company.strategic_signatures)...
335
+ Generated: company.business_priorities_partner_strategy_review_457
336
+ Sleeping for 10s to avoid rate limits...
337
+ Generating item 7/10 (Category: company.business_priorities)...
338
+ Generated: company.business_priorities_healthcare_compliance_123
339
+ Sleeping for 10s to avoid rate limits...
340
+ Generating item 8/10 (Category: none)...
341
+ Generated: company.strategic_signatures_hospitality_724
342
+ Sleeping for 10s to avoid rate limits...
343
+ Generating item 9/10 (Category: user.workflow_patterns)...
344
+ Conversation generation failed (attempt 1/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': '6b121e3dd2897f71e3b41c24deb7f246', 'date': 'Fri, 21 Nov 2025 21:34:38 GMT', 'x-envoy-upstream-service-time': '3', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': 'd6f09c3c-7117-4fde-81ae-070998fc069a', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
345
+ Retrying in 5s...
346
+ Conversation generation failed (attempt 2/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': '0347bae8c3cc9541daddc86a581ba123', 'date': 'Fri, 21 Nov 2025 21:34:43 GMT', 'x-envoy-upstream-service-time': '5', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': '76cde387-275f-4942-9c12-46096a26ef59', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
347
+ Retrying in 10s...
348
+ Conversation generation failed (attempt 3/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': '06d09c4265780b1648480b4022c7b208', 'date': 'Fri, 21 Nov 2025 21:34:53 GMT', 'x-envoy-upstream-service-time': '6', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': 'a8cbb61a-75f3-4faa-a11d-86640fe742cc', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
349
+ Failed to generate conversation for user.workflow_patterns
350
+ Sleeping for 10s to avoid rate limits...
351
+ Generating item 10/10 (Category: company.business_priorities)...
352
+ Generated: company.business_priorities_pilot_execution_142
353
+ Sleeping for 10s to avoid rate limits...
354
+ Saved batch to synthetic_data/batch_06.json
355
+ Validating batch...
356
+ {
357
+ "total_examples": 9,
358
+ "category_distribution": {
359
+ "company.performance_context": 3,
360
+ "company.knowledge_artifacts": 2,
361
+ "user.role_context": 9,
362
+ "user.strategic_approach": 2,
363
+ "company.business_priorities": 7,
364
+ "user.workflow_patterns": 4,
365
+ "company.brand_core": 1,
366
+ "company.strategic_signatures": 3,
367
+ "company.tools_config": 2
368
+ },
369
+ "multi_label_frequency": 1.0,
370
+ "avg_turns_per_conversation": 6.111111111111111,
371
+ "persistence_distribution": {
372
+ "mixed": 6,
373
+ "short": 3
374
+ },
375
+ "scope_distribution": {
376
+ "mixed": 9
377
+ },
378
+ "warnings": []
379
+ }
380
+
381
+ === Processing Batch 7/10 ===
382
+ Generating item 1/10 (Category: company.strategic_signatures)...
383
+ Generated: company_strategic_signatures_123
384
+ Sleeping for 10s to avoid rate limits...
385
+ Generating item 2/10 (Category: user.workflow_patterns)...
386
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
387
+ Retrying in 5s...
388
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
389
+ Retrying in 10s...
390
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
391
+ Failed to generate scenario for user.workflow_patterns
392
+ Generating item 3/10 (Category: none)...
393
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
394
+ Retrying in 5s...
395
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
396
+ Retrying in 10s...
397
+ Scenario generation failed (attempt 3/3): The read operation timed out
398
+ Failed to generate scenario for none
399
+ Generating item 4/10 (Category: none)...
400
+ Conversation generation failed (attempt 1/3): The read operation timed out
401
+ Retrying in 5s...
402
+ Conversation generation failed (attempt 2/3): The read operation timed out
403
+ Retrying in 10s...
404
+ Conversation generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
405
+ Failed to generate conversation for none
406
+ Sleeping for 10s to avoid rate limits...
407
+ Generating item 5/10 (Category: user.workflow_patterns)...
408
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
409
+ Retrying in 5s...
410
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
411
+ Retrying in 10s...
412
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
413
+ Failed to generate scenario for user.workflow_patterns
414
+ Generating item 6/10 (Category: company.strategic_signatures)...
415
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
416
+ Retrying in 5s...
417
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
418
+ Retrying in 10s...
419
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
420
+ Failed to generate scenario for company.strategic_signatures
421
+ Generating item 7/10 (Category: none)...
422
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
423
+ Retrying in 5s...
424
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
425
+ Retrying in 10s...
426
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
427
+ Failed to generate scenario for none
428
+ Generating item 8/10 (Category: user.strategic_approach)...
429
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
430
+ Retrying in 5s...
431
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
432
+ Retrying in 10s...
433
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
434
+ Failed to generate scenario for user.strategic_approach
435
+ Generating item 9/10 (Category: user.workflow_patterns)...
436
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
437
+ Retrying in 5s...
438
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
439
+ Retrying in 10s...
440
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
441
+ Failed to generate scenario for user.workflow_patterns
442
+ Generating item 10/10 (Category: user.strategic_approach)...
443
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
444
+ Retrying in 5s...
445
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
446
+ Retrying in 10s...
447
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
448
+ Failed to generate scenario for user.strategic_approach
449
+ Saved batch to synthetic_data/batch_07.json
450
+ Validating batch...
451
+ {
452
+ "total_examples": 1,
453
+ "category_distribution": {
454
+ "company.strategic_signatures": 2,
455
+ "company.business_priorities": 3,
456
+ "none": 1,
457
+ "user.workflow_patterns": 1
458
+ },
459
+ "multi_label_frequency": 1.0,
460
+ "avg_turns_per_conversation": 7.0,
461
+ "persistence_distribution": {
462
+ "mixed": 1
463
+ },
464
+ "scope_distribution": {
465
+ "mixed": 1
466
+ },
467
+ "warnings": []
468
+ }
469
+
470
+ === Processing Batch 8/10 ===
471
+ Generating item 1/10 (Category: none)...
472
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
473
+ Retrying in 5s...
474
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
475
+ Retrying in 10s...
476
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
477
+ Failed to generate scenario for none
478
+ Generating item 2/10 (Category: company.knowledge_artifacts)...
479
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
480
+ Retrying in 5s...
481
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
482
+ Retrying in 10s...
483
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
484
+ Failed to generate scenario for company.knowledge_artifacts
485
+ Generating item 3/10 (Category: user.session_history)...
486
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
487
+ Retrying in 5s...
488
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
489
+ Retrying in 10s...
490
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
491
+ Failed to generate scenario for user.session_history
492
+ Generating item 4/10 (Category: user.workflow_patterns)...
493
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
494
+ Retrying in 5s...
495
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
496
+ Retrying in 10s...
497
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
498
+ Failed to generate scenario for user.workflow_patterns
499
+ Generating item 5/10 (Category: company.knowledge_artifacts)...
500
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
501
+ Retrying in 5s...
502
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
503
+ Retrying in 10s...
504
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
505
+ Failed to generate scenario for company.knowledge_artifacts
506
+ Generating item 6/10 (Category: none)...
507
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
508
+ Retrying in 5s...
509
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
510
+ Retrying in 10s...
511
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
512
+ Failed to generate scenario for none
513
+ Generating item 7/10 (Category: company.knowledge_artifacts)...
514
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
515
+ Retrying in 5s...
516
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
517
+ Retrying in 10s...
518
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
519
+ Failed to generate scenario for company.knowledge_artifacts
520
+ Generating item 8/10 (Category: user.role_context)...
521
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
522
+ Retrying in 5s...
523
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
524
+ Retrying in 10s...
525
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
526
+ Failed to generate scenario for user.role_context
527
+ Generating item 9/10 (Category: user.session_history)...
528
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
529
+ Retrying in 5s...
530
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
531
+ Retrying in 10s...
532
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
533
+ Failed to generate scenario for user.session_history
534
+ Generating item 10/10 (Category: user.role_context)...
535
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
536
+ Retrying in 5s...
537
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
538
+ Retrying in 10s...
539
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
540
+ Failed to generate scenario for user.role_context
541
+ Saved batch to synthetic_data/batch_08.json
542
+ Validating batch...
543
+ {
544
+ "error": "Empty dataset"
545
+ }
546
+
547
+ === Processing Batch 9/10 ===
548
+ Generating item 1/10 (Category: company.business_priorities)...
549
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
550
+ Retrying in 5s...
551
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
552
+ Retrying in 10s...
553
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
554
+ Failed to generate scenario for company.business_priorities
555
+ Generating item 2/10 (Category: company.brand_core)...
556
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
557
+ Retrying in 5s...
558
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
559
+ Retrying in 10s...
560
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
561
+ Failed to generate scenario for company.brand_core
562
+ Generating item 3/10 (Category: user.communication_style)...
563
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
564
+ Retrying in 5s...
565
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
566
+ Retrying in 10s...
567
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
568
+ Failed to generate scenario for user.communication_style
569
+ Generating item 4/10 (Category: company.knowledge_artifacts)...
570
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
571
+ Retrying in 5s...
572
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
573
+ Retrying in 10s...
574
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
575
+ Failed to generate scenario for company.knowledge_artifacts
576
+ Generating item 5/10 (Category: user.communication_style)...
577
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
578
+ Retrying in 5s...
579
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
580
+ Retrying in 10s...
581
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
582
+ Failed to generate scenario for user.communication_style
583
+ Generating item 6/10 (Category: company.tools_config)...
584
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
585
+ Retrying in 5s...
586
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
587
+ Retrying in 10s...
588
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
589
+ Failed to generate scenario for company.tools_config
590
+ Generating item 7/10 (Category: none)...
591
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
592
+ Retrying in 5s...
593
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
594
+ Retrying in 10s...
595
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
596
+ Failed to generate scenario for none
597
+ Generating item 8/10 (Category: user.interaction_preferences)...
598
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
599
+ Retrying in 5s...
600
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
601
+ Retrying in 10s...
602
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
603
+ Failed to generate scenario for user.interaction_preferences
604
+ Generating item 9/10 (Category: none)...
605
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
606
+ Retrying in 5s...
607
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
608
+ Retrying in 10s...
609
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
610
+ Failed to generate scenario for none
611
+ Generating item 10/10 (Category: none)...
612
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
613
+ Retrying in 5s...
614
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
615
+ Retrying in 10s...
616
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
617
+ Failed to generate scenario for none
618
+ Saved batch to synthetic_data/batch_09.json
619
+ Validating batch...
620
+ {
621
+ "error": "Empty dataset"
622
+ }
623
+
624
+ === Processing Batch 10/10 ===
625
+ Generating item 1/10 (Category: none)...
626
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
627
+ Retrying in 5s...
628
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
629
+ Retrying in 10s...
630
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
631
+ Failed to generate scenario for none
632
+ Generating item 2/10 (Category: company.business_priorities)...
633
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
634
+ Retrying in 5s...
635
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
636
+ Retrying in 10s...
637
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
638
+ Failed to generate scenario for company.business_priorities
639
+ Generating item 3/10 (Category: user.strategic_approach)...
640
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
641
+ Retrying in 5s...
642
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
643
+ Retrying in 10s...
644
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
645
+ Failed to generate scenario for user.strategic_approach
646
+ Generating item 4/10 (Category: company.performance_context)...
647
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
648
+ Retrying in 5s...
649
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
650
+ Retrying in 10s...
651
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
652
+ Failed to generate scenario for company.performance_context
653
+ Generating item 5/10 (Category: company.strategic_signatures)...
654
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
655
+ Retrying in 5s...
656
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
657
+ Retrying in 10s...
658
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
659
+ Failed to generate scenario for company.strategic_signatures
660
+ Generating item 6/10 (Category: company.brand_core)...
661
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
662
+ Retrying in 5s...
663
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
664
+ Retrying in 10s...
665
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
666
+ Failed to generate scenario for company.brand_core
667
+ Generating item 7/10 (Category: company.strategic_signatures)...
668
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
669
+ Retrying in 5s...
670
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
671
+ Retrying in 10s...
672
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
673
+ Failed to generate scenario for company.strategic_signatures
674
+ Generating item 8/10 (Category: user.strategic_approach)...
675
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
676
+ Retrying in 5s...
677
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
678
+ Retrying in 10s...
679
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
680
+ Failed to generate scenario for user.strategic_approach
681
+ Generating item 9/10 (Category: company.brand_core)...
682
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
683
+ Retrying in 5s...
684
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
685
+ Retrying in 10s...
686
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
687
+ Failed to generate scenario for company.brand_core
688
+ Generating item 10/10 (Category: none)...
689
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
690
+ Retrying in 5s...
691
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
692
+ Retrying in 10s...
693
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
694
+ Failed to generate scenario for none
695
+ Saved batch to synthetic_data/batch_10.json
696
+ Validating batch...
697
+ {
698
+ "error": "Empty dataset"
699
+ }
700
+
701
+ Completed. Total items generated: 60
702
+ Full dataset saved to synthetic_data/all_generated_data_100.json
synthetic_data/generation_log_100_v2.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/generation_log_async.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/generation_log_final.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/merged_training_dataset_2001.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/pipeline.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import asyncio
5
+ import time
6
+ import cohere
7
+ from typing import List, Dict, Any, Optional
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+
12
+ DOMAIN_CONTEXTS = [
13
+ "B2B SaaS workflow automation for enterprise teams",
14
+ "Consumer fintech budgeting assistant rolling out in LATAM",
15
+ "Healthcare patient engagement platform coordinating compliance content",
16
+ "Retail omnichannel loyalty program for a fashion brand",
17
+ "EdTech company designing AI tutoring playbooks",
18
+ "Hospitality chain redefining guest personalization across regions",
19
+ "Developer tools startup improving product-led growth motions",
20
+ "Sports media network negotiating sponsorship activations",
21
+ "Gaming studio planning live-ops launches",
22
+ "Non-profit fundraising platform balancing donor messaging",
23
+ "Enterprise cybersecurity firm running incident response playbooks",
24
+ "Supply-chain analytics platform optimizing vendor collaboration",
25
+ "CPG beverage brand planning seasonal launches with agencies",
26
+ "Real-estate marketplace coordinating broker enablement",
27
+ "Mobility/ride-hailing service planning driver communications",
28
+ "Streaming media company managing international content drops",
29
+ "Insurance carrier modernizing agent training workflows",
30
+ "Energy provider coordinating demand-response campaigns",
31
+ "Professional services firm standardizing proposal playbooks",
32
+ "AI infrastructure startup refining go-to-market with partners",
33
+ "Luxury beauty brand orchestrating influencer activations",
34
+ "Food delivery platform improving courier retention messaging",
35
+ "Corporate learning company updating compliance curricula",
36
+ "Outdoor gear company rolling out omnichannel retail pilots"
37
+ ]
38
+
39
+ class SyntheticDataPipeline:
40
+ def __init__(self, api_key: Optional[str] = None, max_retries: int = 5):
41
+ self.api_key = api_key or os.getenv("COHERE_API_KEY")
42
+ if not self.api_key:
43
+ raise ValueError("COHERE_API_KEY not found in environment variables")
44
+ self.client = cohere.ClientV2(api_key=self.api_key)
45
+ # Switched to command-r-plus-08-2024 due to rate limits on reasoning model
46
+ self.model = "command-r-plus-08-2024"
47
+ self.max_retries = max_retries
48
+
49
+ def _sample_domain_context(self) -> str:
50
+ return random.choice(DOMAIN_CONTEXTS)
51
+
52
+ @staticmethod
53
+ def _extract_text(response) -> Optional[str]:
54
+ """Extract the first text block from a Cohere response."""
55
+ if not response or not getattr(response, "message", None):
56
+ return None
57
+ blocks = getattr(response.message, "content", []) or []
58
+ for block in blocks:
59
+ text = getattr(block, "text", None)
60
+ if isinstance(text, str) and text.strip():
61
+ return text
62
+ return None
63
+
64
+ def generate_scenario_spec(self, category: str, distractor: Optional[str] = None,
65
+ persistence: str = "long", tone: str = "neutral",
66
+ turns: int = 6, special_reqs: str = "") -> Dict[str, Any]:
67
+ """Stage 1: Generate a scenario specification."""
68
+ domain_context = self._sample_domain_context()
69
+ midstream_note = "Conversation should start mid-thread (no greetings) and refer back to earlier collaboration."
70
+ diversity_note = "Keep subject matter aligned with the given domain context; avoid repeating eco/climate themes unless category demands it."
71
+ combined_reqs = " | ".join(filter(None, [special_reqs, midstream_note, diversity_note]))
72
+
73
+ if category == "none":
74
+ prompt = f"""Generate a JSON scenario specification for a conversation that has NO long-term memory value (Category: none).
75
+ The conversation should be strictly transactional, vague, or temporary.
76
+ Examples: checking status, scheduling a meeting, asking a clarification, greeting, small talk, or discussing weather/lunch.
77
+
78
+ CONTEXT: General professional setting. Do NOT include any strategic projects, specific brand details, or user preferences that would trigger memory storage.
79
+
80
+ Requirements:
81
+ - Primary Category: none
82
+ - Distractor Category: {distractor if distractor else "None"}
83
+ - Persistence Level: short
84
+ - Turn Count: {turns}
85
+ - Special Requirements: {combined_reqs}
86
+
87
+ Return a JSON object with:
88
+ {{
89
+ "scenario_description": "Brief narrative setup (2-3 sentences) - MUST BE NON-MEMORABLE",
90
+ "user_profile": "User role",
91
+ "key_signals_to_include": ["List of 2-4 signals that are specifically IRRELEVANT or TEMPORARY"],
92
+ "distractor_signals": ["Optional list of signals"],
93
+ "suggested_turn_breakdown": "Flow of conversation"
94
+ }}
95
+ """
96
+ else:
97
+ prompt = f"""You are designing training scenarios for an AI memory system in marketing context. Generate a scenario specification tailored to this business setting: {domain_context}.
98
+
99
+ Requirements:
100
+ - Primary Category: {category}
101
+ - Distractor Category: {distractor if distractor else "None"}
102
+ - Persistence Level: {persistence}
103
+ - Emotional Tone: {tone}
104
+ - Turn Count: {turns}
105
+ - Special Requirements: {combined_reqs}
106
+
107
+ Return a JSON object with:
108
+ {{
109
+ "scenario_description": "Brief narrative setup (2-3 sentences)",
110
+ "user_profile": "User role and context",
111
+ "key_signals_to_include": ["List of 2-4 specific memory-worthy signals"],
112
+ "distractor_signals": ["Optional list of noise/irrelevant info"],
113
+ "suggested_turn_breakdown": "How the conversation should flow"
114
+ }}
115
+ """
116
+
117
+ for attempt in range(self.max_retries + 1):
118
+ try:
119
+ response = self.client.chat(
120
+ messages=[{"role": "user", "content": prompt}],
121
+ temperature=0.7,
122
+ model=self.model,
123
+ response_format={"type": "json_object"}
124
+ )
125
+ content = self._extract_text(response)
126
+ if not content:
127
+ raise ValueError("No text content found in scenario response")
128
+
129
+ if content.startswith("```json"):
130
+ content = content[7:]
131
+ if content.endswith("```"):
132
+ content = content[:-3]
133
+ return json.loads(content.strip())
134
+ except Exception as e:
135
+ print(f"Scenario generation failed (attempt {attempt+1}/{self.max_retries+1}): {e}")
136
+ if attempt < self.max_retries:
137
+ sleep_time = 10 * (2 ** attempt)
138
+ print(f"Retrying in {sleep_time}s...")
139
+ time.sleep(sleep_time)
140
+ return {}
141
+
142
+ def generate_conversation(self, scenario_spec: Dict[str, Any], turn_count: int = 6, category: Optional[str] = None) -> Dict[str, Any]:
143
+ """Stage 2: Generate conversation based on scenario spec."""
144
+
145
+ domain_context = self._sample_domain_context()
146
+
147
+ # Detect if this is a NONE category scenario
148
+ is_none = category == "none" or (category is None and "none" in str(scenario_spec).lower())
149
+
150
+ if is_none:
151
+ prompt = f"""You are generating a realistic conversation between a user and an AI assistant.
152
+ The conversation should be transactional, casual, or vague. IT SHOULD NOT contain any significant long-term memory value for a marketing context.
153
+
154
+ CONTEXT: General professional setting.
155
+ SCENARIO SPECIFICATION:
156
+ {json.dumps(scenario_spec, indent=2)}
157
+
158
+ GENERATION RULES:
159
+ 1. Make it natural and fluid.
160
+ 2. DO NOT include detailed strategic plans, brand values, or user preferences.
161
+ 3. Focus on immediate tasks (scheduling, clarifications, small talk).
162
+ 4. Length: {turn_count} turns.
163
+ 5. Avoid opening pleasantries like "Hi" - start mid-thread if appropriate, or just dive in.
164
+
165
+ OUTPUT FORMAT:
166
+ Return a JSON object with:
167
+ {{
168
+ "scenario_id": "none_transactional_{{random_3_digit_number}}",
169
+ "conversation": [
170
+ {{"role": "user", "content": "..."}},
171
+ {{"role": "assistant", "content": "..."}}
172
+ ],
173
+ "labels": {{
174
+ "categories": ["none"],
175
+ "persistence_horizon": "short",
176
+ "memory_scope": "none",
177
+ "rationale": "Explanation why this is not memory-worthy"
178
+ }},
179
+ "metadata": {{
180
+ "scenario_type": "negative_example",
181
+ "primary_category": "none",
182
+ "distractor_present": false,
183
+ "turn_count": {turn_count},
184
+ "signals_present": []
185
+ }}
186
+ }}
187
+
188
+ CRITICAL: Respond with ONLY the JSON object.
189
+ """
190
+ else:
191
+ prompt = f"""You are generating realistic marketing conversations between a user and an AI marketing assistant. Generate natural dialogue that contains specific information worth storing in long-term memory. The conversation should start mid-thread (no greetings) and reference the ongoing initiative described below.
192
+
193
+ CONTEXT:
194
+ You will create a conversation that exemplifies certain memory categories while maintaining realism and natural flow. Assume this is part of {domain_context}.
195
+
196
+ SCENARIO SPECIFICATION:
197
+ {json.dumps(scenario_spec, indent=2)}
198
+
199
+ MEMORY TAXONOMY (for reference):
200
+ COMPANY MEMORY:
201
+ - company.brand_core: Voice, values, positioning, identity anchors (Persistence: Long >1y)
202
+ - company.strategic_signatures: Decision frameworks, strategic heuristics (Persistence: Long >1y)
203
+ - company.knowledge_artifacts: Docs, style guides, playbooks (Persistence: Long >1y)
204
+ - company.business_priorities: Quarterly/seasonal goals, active campaigns (Persistence: Short <3m)
205
+ - company.tools_config: Integrations, API keys, workflow settings (Persistence: Medium ~6m)
206
+ - company.performance_context: Campaign metrics, retrospectives, learnings (Persistence: Rolling ~6m)
207
+
208
+ USER MEMORY:
209
+ - user.communication_style: Tone, verbosity, format expectations (Persistence: Long >1y)
210
+ - user.strategic_approach: Personal priorities, success definitions (Persistence: Long >1y)
211
+ - user.role_context: Title, scope, decision authority (Persistence: Medium ~1y)
212
+ - user.workflow_patterns: Review cadence, collaboration norms (Persistence: Medium ~1y)
213
+ - user.session_history: Immediate context, recent asks (Persistence: Short <2w)
214
+ - user.interaction_preferences: Coaching style, feedback expectations (Persistence: Evolving)
215
+
216
+ SPECIAL:
217
+ - none: Irrelevant, vague, or transactional content
218
+
219
+ GENERATION RULES:
220
+ 1. Make conversations feel natural - include some filler, transitions, acknowledgments
221
+ 2. Embed memory-worthy information organically (don't make it too obvious)
222
+ 3. Include 1-2 utterances that should map to "none" for realism
223
+ 4. If multi-label scenario, ensure signals for both categories are present
224
+ 5. Length: {turn_count} turns (alternating user/assistant)
225
+ 6. Include specific, concrete details (not generic statements)
226
+ 7. For company.* categories: use "we", "our company", "our brand"
227
+ 8. For user.* categories: use "I prefer", "my approach", "I typically"
228
+ 9. Avoid opening pleasantries like "Hi" or "Hello"—jump straight into the ongoing topic.
229
+ 10. **CRITICAL CONSTRAINT**: Limit output to 1-3 categories maximum.
230
+ 11. **EXCLUSIVE NONE**: If "none" is in the categories list, it MUST be the ONLY category. NEVER mix "none" with other categories. If valid signals exist, do NOT include "none".
231
+
232
+ OUTPUT FORMAT:
233
+ Return a JSON object with:
234
+ {{
235
+ "scenario_id": "{{primary_category}}_{{scenario_type}}_{{random_3_digit_number}}",
236
+ "conversation": [
237
+ {{"role": "user", "content": "..."}},
238
+ {{"role": "assistant", "content": "..."}},
239
+ ...
240
+ ],
241
+ "labels": {{
242
+ "categories": ["array of applicable categories"],
243
+ "persistence_horizon": "long|medium|short",
244
+ "memory_scope": "company|user|mixed|none",
245
+ "rationale": "1-2 sentence explanation of category choices"
246
+ }},
247
+ "metadata": {{
248
+ "scenario_type": "descriptive_label",
249
+ "primary_category": "main_category",
250
+ "distractor_present": true|false,
251
+ "turn_count": integer,
252
+ "signals_present": ["list of specific signals included"]
253
+ }}
254
+ }}
255
+
256
+ CRITICAL: Respond with ONLY the JSON object. No markdown formatting, no explanation, no preamble.
257
+
258
+ Generate the conversation now."""
259
+
260
+ for attempt in range(self.max_retries + 1):
261
+ try:
262
+ response = self.client.chat(
263
+ messages=[{"role": "user", "content": prompt}],
264
+ temperature=0.7,
265
+ model=self.model,
266
+ response_format={"type": "json_object"}
267
+ )
268
+ content = self._extract_text(response)
269
+ if not content:
270
+ raise ValueError("No text content found in conversation response")
271
+
272
+ if content.startswith("```json"):
273
+ content = content[7:]
274
+ if content.endswith("```"):
275
+ content = content[:-3]
276
+ return json.loads(content.strip())
277
+ except Exception as e:
278
+ print(f"Conversation generation failed (attempt {attempt+1}/{self.max_retries+1}): {e}")
279
+ if attempt < self.max_retries:
280
+ sleep_time = 10 * (2 ** attempt)
281
+ print(f"Retrying in {sleep_time}s...")
282
+ time.sleep(sleep_time)
283
+ return {}
284
+
285
+ def run_batch(self, count: int = 1, category: str = "company.brand_core") -> List[Dict[str, Any]]:
286
+ """Run a batch generation."""
287
+ results = []
288
+ print(f"Starting batch generation for {count} examples of {category}...")
289
+
290
+ for i in range(count):
291
+ print(f"Generating example {i+1}/{count}...")
292
+ scenario = self.generate_scenario_spec(category=category)
293
+ if not scenario:
294
+ print("Skipping due to scenario generation failure")
295
+ continue
296
+
297
+ conversation = self.generate_conversation(scenario)
298
+ if conversation:
299
+ results.append(conversation)
300
+ print(f"Successfully generated conversation: {conversation.get('scenario_id', 'unknown')}")
301
+ else:
302
+ print("Failed to generate conversation")
303
+
304
+ return results
305
+
306
+ if __name__ == "__main__":
307
+ # Simple test run
308
+ pipeline = SyntheticDataPipeline()
309
+ results = pipeline.run_batch(count=1)
310
+ print(json.dumps(results, indent=2))
311
+
synthetic_data/quick_test_diverse.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quick test of diverse generation."""
2
+ import json
3
+ import random
4
+ import os
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+
8
+ import cohere
9
+
10
+ client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))
11
+
12
+ # Test one generation
13
+ category = "company.tools_config"
14
+ industry = "Series A fintech building a neobank"
15
+ persona = "a growth lead obsessed with metrics"
16
+ situation = "debugging why a campaign tanked"
17
+ tone = "frustrated"
18
+
19
+ prompt = f"""You are a world-class creative writer generating training data for an AI memory routing system.
20
+
21
+ Create a completely unique, realistic conversation between {persona} at a {industry} and their AI marketing assistant.
22
+
23
+ Context: They are {situation}. The tone is {tone}.
24
+
25
+ CATEGORY TO DEMONSTRATE: {category}
26
+ The conversation should involve tool setup, integrations, APIs, or workflow automation.
27
+
28
+ CREATIVE FREEDOM:
29
+ - Invent specific, realistic details (names, numbers, dates, products)
30
+ - The conversation can start anywhere - mid-thought, mid-project, mid-crisis
31
+ - Vary structure dramatically
32
+ - Include natural speech patterns
33
+ - Make it feel like eavesdropping on a real conversation
34
+
35
+ The ONLY hard requirement: the conversation must clearly demonstrate {category}.
36
+
37
+ Output as JSON:
38
+ {{"scenario_id": "unique_id", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["{category}"]}}, "metadata": {{"primary_category": "{category}", "industry": "{industry}"}}}}"""
39
+
40
+ print("Sending request...")
41
+ response = client.chat(
42
+ messages=[{"role": "user", "content": prompt}],
43
+ temperature=0.95,
44
+ model="command-r-plus-08-2024",
45
+ response_format={"type": "json_object"}
46
+ )
47
+
48
+ content = response.message.content[0].text
49
+ print("\n=== RAW RESPONSE ===")
50
+ print(content[:500])
51
+
52
+ data = json.loads(content)
53
+ print("\n=== PARSED ===")
54
+ print(f"Categories: {data.get('labels', {}).get('categories', [])}")
55
+ conv = data.get("conversation", [])
56
+ if conv:
57
+ for i, turn in enumerate(conv[:4]):
58
+ if isinstance(turn, dict):
59
+ print(f"\n[{turn.get('role', 'unknown')}]: {turn.get('content', '')[:150]}...")
60
+ else:
61
+ print(f"\n[turn {i}]: {str(turn)[:150]}...")
62
+
synthetic_data/run_balanced_async.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Balanced Dataset Generation with Concurrent API Calls
3
+
4
+ Generates 10 items simultaneously per batch for faster generation.
5
+ """
6
+
7
+ import json
8
+ import random
9
+ import time
10
+ import sys
11
+ import asyncio
12
+ import os
13
+ from typing import List, Dict, Any, Optional
14
+ from datetime import datetime
15
+ from concurrent.futures import ThreadPoolExecutor
16
+ import cohere
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
# Target counts per category (balanced)
# 13 categories x 77 examples each = 1001 examples overall.
CATEGORY_TARGETS = {
    "company.brand_core": 77,
    "company.strategic_signatures": 77,
    "company.knowledge_artifacts": 77,
    "company.business_priorities": 77,
    "company.tools_config": 77,
    "company.performance_context": 77,
    "user.communication_style": 77,
    "user.strategic_approach": 77,
    "user.role_context": 77,
    "user.workflow_patterns": 77,
    "user.session_history": 77,
    "user.interaction_preferences": 77,
    "none": 77,
}

# Short signal phrases per category; the generator embeds up to four of these
# in each prompt to steer the model toward the target label.
CATEGORY_EXAMPLES = {
    "company.brand_core": {
        "signals": ["brand voice is warm", "primary color is #2563EB", "never use jargon", "tagline is..."],
    },
    "company.strategic_signatures": {
        "signals": ["always prioritize retention", "80/20 rule", "never launch without testing"],
    },
    "company.knowledge_artifacts": {
        "signals": ["style guide says", "playbook recommends", "SOP for launches", "template includes"],
    },
    "company.business_priorities": {
        "signals": ["Q4 focus is", "this quarter's target", "holiday campaign", "prioritizing APAC"],
    },
    "company.tools_config": {
        "signals": ["Slack webhook URL", "HubSpot sync", "API key is", "Zapier integration"],
    },
    "company.performance_context": {
        "signals": ["24% open rate", "CTR improved by", "retrospective showed", "conversion dropped"],
    },
    "user.communication_style": {
        "signals": ["prefer bullet points", "keep it under 200 words", "casual tone", "data-driven"],
    },
    "user.strategic_approach": {
        "signals": ["prioritize speed over perfection", "test fast fail fast", "customer feedback"],
    },
    "user.role_context": {
        "signals": ["As VP of Marketing", "report to CMO", "budget authority up to", "manage team of"],
    },
    "user.workflow_patterns": {
        "signals": ["review drafts Monday", "don't send Friday", "async via Slack", "weekly sync Tuesday"],
    },
    "user.session_history": {
        "signals": ["as we discussed yesterday", "continuing from last", "proposal we started"],
    },
    "user.interaction_preferences": {
        "signals": ["push back on my ideas", "give me options", "be direct", "ask clarifying questions"],
    },
    "none": {
        "signals": ["what time is meeting", "checking status", "confirming receipt", "quick question"],
    },
}
79
+
80
+
81
class BalancedAsyncGenerator:
    """Generates balanced synthetic conversations via concurrent Cohere calls.

    Blocking SDK calls are fanned out onto a thread pool; ``generate_batch``
    awaits them from the event loop so up to 10 requests run in flight.
    """

    def __init__(self):
        # API key comes from the environment (loaded via dotenv at import time).
        self.api_key = os.getenv("COHERE_API_KEY")
        if not self.api_key:
            raise ValueError("COHERE_API_KEY not found")
        self.client = cohere.ClientV2(api_key=self.api_key)
        self.model = "command-r-plus-08-2024"
        # NOTE(review): the executor is never shut down explicitly — fine for
        # this one-shot script, but call .shutdown() if the class is reused.
        self.executor = ThreadPoolExecutor(max_workers=10)

    def _extract_text(self, response) -> Optional[str]:
        """Return the first non-blank text block of a chat response, or None."""
        if not response or not getattr(response, "message", None):
            return None
        blocks = getattr(response.message, "content", []) or []
        for block in blocks:
            text = getattr(block, "text", None)
            if isinstance(text, str) and text.strip():
                return text
        return None

    def _generate_sync(self, category: str) -> Optional[Dict]:
        """Synchronous generation for a single category.

        Returns the parsed example dict, or None on any failure: API error,
        unparsable JSON, or the target category missing from the labels.
        """
        signals = CATEGORY_EXAMPLES.get(category, {}).get("signals", [])
        signals_text = "\n".join(f"- {s}" for s in signals[:4])

        if category == "none":
            prompt = f"""Generate a marketing conversation with NO long-term memory value.
Transactional, vague, or temporary only. Examples: status check, scheduling, confirming.
4-6 turns, no greetings, start mid-conversation.

OUTPUT (JSON only):
{{"scenario_id": "none_{random.randint(100,999)}", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["none"], "persistence_horizon": "short", "memory_scope": "none", "rationale": "..."}}, "metadata": {{"primary_category": "none", "turn_count": 4}}}}"""
        else:
            prompt = f"""Generate a marketing conversation demonstrating: {category}

SIGNALS FOR THIS CATEGORY:
{signals_text}

REQUIREMENTS:
1. MUST contain clear signals for {category}
2. 4-6 turns, no greetings, start mid-conversation
3. Include specific details (names, numbers, dates)

CRITICAL: categories array MUST include "{category}"

OUTPUT (JSON only):
{{"scenario_id": "{category.replace('.', '_')}_{random.randint(100,999)}", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["{category}"], "persistence_horizon": "long", "memory_scope": "company", "rationale": "..."}}, "metadata": {{"primary_category": "{category}", "turn_count": 4}}}}"""

        try:
            response = self.client.chat(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                model=self.model,
                response_format={"type": "json_object"}
            )

            content = self._extract_text(response)
            if not content:
                return None

            # Strip markdown code fences the model sometimes adds.
            if content.startswith("```json"):
                content = content[7:]
            if content.endswith("```"):
                content = content[:-3]

            data = json.loads(content.strip())

            # Validate target category is present (case-insensitive).
            categories = data.get("labels", {}).get("categories", [])
            if category.lower() not in [c.lower() for c in categories]:
                return None

            # Clean: Remove "none" if other categories exist ("none" must be
            # exclusive per the labeling rules).
            if len(categories) > 1 and "none" in [c.lower() for c in categories]:
                data["labels"]["categories"] = [c for c in categories if c.lower() != "none"]

            return data

        except Exception:
            # Best-effort worker: any failure is reported as None and the
            # category is retried in a later batch. (The previously bound
            # exception variable was unused.)
            return None

    async def generate_batch(self, categories: List[str]) -> List[Dict]:
        """Generate a batch of items concurrently; failures are dropped."""
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()
        tasks = [
            loop.run_in_executor(self.executor, self._generate_sync, cat)
            for cat in categories
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Keep only successful dict results; exceptions and Nones are dropped.
        return [r for r in results if isinstance(r, dict)]
170
+
171
+
172
async def run_balanced_generation_async():
    """Run balanced generation with concurrent batches.

    Loops until every category in CATEGORY_TARGETS reaches its target count,
    generating up to 10 items per batch, appending accepted examples to a
    timestamped JSONL file as they arrive. Returns the output file path.
    """

    generator = BalancedAsyncGenerator()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"synthetic_data/balanced_dataset_{timestamp}.jsonl"

    # Track progress per category
    category_counts = {cat: 0 for cat in CATEGORY_TARGETS}
    all_data = []

    print("=" * 70, flush=True)
    print("BALANCED CONCURRENT DATASET GENERATION", flush=True)
    print("=" * 70, flush=True)
    # NOTE(review): the "77" in this banner is hard-coded; it should stay in
    # sync with the values in CATEGORY_TARGETS.
    print(f"Target per category: 77", flush=True)
    print(f"Total categories: {len(CATEGORY_TARGETS)}", flush=True)
    print(f"Expected total: {77 * len(CATEGORY_TARGETS)}", flush=True)
    print(f"Batch size: 10 concurrent requests", flush=True)
    print(flush=True)

    batch_num = 0

    while True:
        # Find categories that still need examples
        needed = []
        for cat, target in CATEGORY_TARGETS.items():
            remaining = target - category_counts[cat]
            needed.extend([cat] * min(remaining, 2))  # Up to 2 per category per batch

        if not needed:
            # Every category hit its target — generation is complete.
            break

        # Take up to 10 for this batch
        batch_categories = needed[:10]
        batch_num += 1

        print(f"\n[Batch {batch_num}] Generating {len(batch_categories)} items...", flush=True)

        results = await generator.generate_batch(batch_categories)

        # Process results
        for result in results:
            if result:
                # Prefer metadata's primary_category; fall back to the first label.
                primary = result.get("metadata", {}).get("primary_category") or \
                          result.get("labels", {}).get("categories", ["unknown"])[0]

                # NOTE(review): results whose primary category is not a known
                # target are silently discarded (not saved or counted).
                if primary in category_counts:
                    category_counts[primary] += 1
                    all_data.append(result)

                    # Save incrementally
                    with open(output_file, "a") as f:
                        f.write(json.dumps(result) + "\n")

        # Progress report
        total_done = sum(category_counts.values())
        total_target = sum(CATEGORY_TARGETS.values())
        print(f" Success: {len(results)}/{len(batch_categories)} | Total: {total_done}/{total_target}", flush=True)

        # Show category progress every 10 batches
        if batch_num % 10 == 0:
            print("\n Category Progress:", flush=True)
            for cat, count in sorted(category_counts.items()):
                target = CATEGORY_TARGETS[cat]
                # 20-character progress bar, filled proportionally to count/target.
                bar = "█" * (count * 20 // target) + "░" * (20 - count * 20 // target)
                print(f" {cat:<35} [{bar}] {count}/{target}", flush=True)

        # Rate limit: wait 3 seconds between batches
        await asyncio.sleep(3)

    # Final summary
    print("\n" + "=" * 70, flush=True)
    print("GENERATION COMPLETE", flush=True)
    print("=" * 70, flush=True)
    print(f"\nFinal Distribution:", flush=True)
    # Sort categories by descending count for the summary table.
    for cat, count in sorted(category_counts.items(), key=lambda x: -x[1]):
        pct = count / len(all_data) * 100 if all_data else 0
        print(f" {cat:<40} {count:>4} ({pct:.1f}%)", flush=True)

    print(f"\nTotal examples: {len(all_data)}", flush=True)
    print(f"Output file: {output_file}", flush=True)

    return output_file
255
+
256
+
257
+ if __name__ == "__main__":
258
+ asyncio.run(run_balanced_generation_async())
259
+
synthetic_data/run_balanced_generation.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Balanced Dataset Generation Script
3
+
4
+ This script generates a balanced training dataset with:
5
+ 1. STRICT category enforcement - the model MUST output the target category
6
+ 2. Equal distribution across all categories
7
+ 3. Improved prompts for underrepresented categories
8
+ """
9
+
10
+ import json
11
+ import random
12
+ import time
13
+ import sys
14
+ import asyncio
15
+ import os
16
+ from typing import List, Dict, Any, Optional
17
+ from datetime import datetime
18
+ import cohere
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+
23
# BALANCED DISTRIBUTION - Equal weight for all categories
# 13 categories x 80 examples each = 1040 target examples.
BALANCED_DISTRIBUTION = {
    "company.brand_core": 80,
    "company.strategic_signatures": 80,
    "company.knowledge_artifacts": 80,
    "company.business_priorities": 80,
    "company.tools_config": 80,
    "company.performance_context": 80,
    "user.communication_style": 80,
    "user.strategic_approach": 80,
    "user.role_context": 80,
    "user.workflow_patterns": 80,
    "user.session_history": 80,
    "user.interaction_preferences": 80,
    "none": 80,
}

# Category-specific examples and signals for better generation.
# Each entry carries a human-readable description, signal phrases to embed in
# prompts, and one example user utterance illustrating the category.
CATEGORY_EXAMPLES = {
    "company.brand_core": {
        "description": "Brand voice, values, positioning, visual identity, tone guidelines",
        "example_signals": [
            "Our brand voice is warm and conversational",
            "We always use sentence case for headlines",
            "Our primary color is #2563EB",
            "We never use corporate jargon",
            "Our tagline is 'Simplify Everything'"
        ],
        "example_conversation": "USER: Remember, our brand personality is 'friendly expert' - knowledgeable but approachable."
    },
    "company.strategic_signatures": {
        "description": "Decision frameworks, strategic heuristics, recurring patterns in how the company operates",
        "example_signals": [
            "We always prioritize retention over acquisition",
            "Our 80/20 rule: 80% proven tactics, 20% experiments",
            "We never launch without A/B testing",
            "Customer lifetime value drives all decisions"
        ],
        "example_conversation": "USER: Our strategic principle is 'land and expand' - start small with enterprises then grow."
    },
    "company.knowledge_artifacts": {
        "description": "Style guides, playbooks, SOPs, documented processes, templates",
        "example_signals": [
            "Here's our content style guide",
            "The campaign playbook says...",
            "According to our SOP for launches",
            "Our template for proposals includes..."
        ],
        "example_conversation": "USER: I'm attaching our updated brand guidelines PDF. Make sure all content follows section 3.2."
    },
    "company.business_priorities": {
        "description": "Quarterly goals, seasonal campaigns, current OKRs, active initiatives",
        "example_signals": [
            "Q4 focus is enterprise expansion",
            "This quarter's target is 500 MQLs",
            "Holiday campaign launches December 1st",
            "We're prioritizing APAC market this quarter"
        ],
        "example_conversation": "USER: For Q1, we're shifting focus entirely to the SMB segment. All campaigns should target companies under 100 employees."
    },
    "company.tools_config": {
        "description": "Integrations, API keys, workflow settings, tool configurations",
        "example_signals": [
            "The Slack webhook URL is...",
            "Configure HubSpot to sync with...",
            "The API key for analytics is...",
            "Set up the Zapier integration to..."
        ],
        "example_conversation": "USER: Here's the API key for our analytics dashboard: sk-xxx-123. Make sure it syncs every 6 hours."
    },
    "company.performance_context": {
        "description": "Campaign metrics, retrospectives, learnings, performance data",
        "example_signals": [
            "Last campaign had 24% open rate",
            "CTR improved by 15% after the redesign",
            "The retrospective showed we need more testing",
            "Conversion rate dropped after the price change"
        ],
        "example_conversation": "USER: The email campaign results are in: 28% open rate, 4.2% CTR. That's our best performance this year."
    },
    "user.communication_style": {
        "description": "Preferred tone, verbosity, format expectations, writing style",
        "example_signals": [
            "I prefer bullet points over paragraphs",
            "Keep responses under 200 words",
            "Use casual, friendly tone with me",
            "I like data-driven explanations"
        ],
        "example_conversation": "USER: Just so you know, I prefer concise bullet points. No need for lengthy explanations with me."
    },
    "user.strategic_approach": {
        "description": "Personal priorities, success definitions, decision-making style",
        "example_signals": [
            "I always prioritize speed over perfection",
            "My philosophy is test fast, fail fast",
            "I measure success by customer feedback",
            "I believe in data-driven decisions only"
        ],
        "example_conversation": "USER: My approach is always 'done is better than perfect'. I'd rather ship and iterate."
    },
    "user.role_context": {
        "description": "Title, scope, decision authority, reporting structure",
        "example_signals": [
            "As VP of Marketing, I approve all campaigns",
            "I report directly to the CMO",
            "My budget authority is up to $50k",
            "I manage a team of 12 marketers"
        ],
        "example_conversation": "USER: Just for context, I'm the Director of Growth and I have final say on all acquisition campaigns."
    },
    "user.workflow_patterns": {
        "description": "Review cadence, collaboration norms, meeting schedules",
        "example_signals": [
            "I review drafts every Monday morning",
            "Don't send me anything on Fridays",
            "I prefer async communication via Slack",
            "Weekly sync is Tuesdays at 2pm"
        ],
        "example_conversation": "USER: My review schedule is Monday mornings only. Anything sent Friday won't be seen until next week."
    },
    "user.session_history": {
        "description": "Immediate context, recent asks, current working session",
        "example_signals": [
            "As we discussed yesterday...",
            "Continuing from our last conversation",
            "The proposal we started earlier",
            "Following up on the draft you sent"
        ],
        "example_conversation": "USER: Let's pick up where we left off yesterday on the Johnson account proposal."
    },
    "user.interaction_preferences": {
        "description": "Coaching style, feedback expectations, collaboration preferences",
        "example_signals": [
            "I want you to push back on my ideas",
            "Give me options, not just one answer",
            "Be direct with feedback, don't sugarcoat",
            "I prefer you ask clarifying questions"
        ],
        "example_conversation": "USER: I want you to challenge my assumptions. If you think I'm wrong, tell me directly."
    },
    "none": {
        "description": "Transactional, vague, or temporary content with no memory value",
        "example_signals": [
            "What time is the meeting?",
            "Can you check the status?",
            "Just confirming receipt",
            "Quick question about the attachment"
        ],
        "example_conversation": "USER: Hey, what's the status on that thing we discussed? Just checking in."
    }
}
174
+
175
class BalancedDataGenerator:
    """Generates labeled synthetic conversations via the Cohere chat API.

    Each call to ``generate_for_category`` produces one conversation that is
    validated to actually contain the requested memory category before it is
    returned.
    """

    def __init__(self, api_key: Optional[str] = None):
        # Fall back to the environment so the script works unattended.
        self.api_key = api_key or os.getenv("COHERE_API_KEY")
        if not self.api_key:
            raise ValueError("COHERE_API_KEY not found")
        self.client = cohere.ClientV2(api_key=self.api_key)
        self.model = "command-r-plus-08-2024"

    def _extract_text(self, response) -> Optional[str]:
        """Return the first non-empty text block from a chat response, else None."""
        if not response or not getattr(response, "message", None):
            return None
        blocks = getattr(response.message, "content", []) or []
        for block in blocks:
            text = getattr(block, "text", None)
            if isinstance(text, str) and text.strip():
                return text
        return None

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json) if present.

        Fix: the previous inline cleanup only removed a literal "```json"
        prefix, so a bare "```" opening fence leaked into json.loads and
        caused a parse failure (wasting a retry).
        """
        content = content.strip()
        if content.startswith("```"):
            # Drop the opening fence line, with or without a language tag.
            content = content.split("\n", 1)[1] if "\n" in content else ""
        if content.endswith("```"):
            content = content[:-3]
        return content.strip()

    def generate_for_category(self, category: str, max_retries: int = 3) -> Optional[Dict]:
        """Generate a conversation that MUST contain the specified category.

        Retries up to ``max_retries`` times when the API call fails, the
        output is not valid JSON, or the model's labels omit the target
        category. Returns the parsed dict on success, else None.
        """
        cat_info = CATEGORY_EXAMPLES.get(category, {})
        description = cat_info.get("description", category)
        example_signals = cat_info.get("example_signals", [])
        example_conv = cat_info.get("example_conversation", "")

        # Build a very specific prompt; "none" gets its own template because
        # it must demonstrate the *absence* of memory-worthy content.
        if category == "none":
            prompt = f"""Generate a realistic marketing conversation that has NO long-term memory value.

The conversation should be:
- Transactional (checking status, scheduling, confirming)
- Vague or generic (no specific details worth remembering)
- Temporary (only relevant for this moment)

Examples of "none" conversations:
- "What time is the meeting tomorrow?"
- "Just confirming you received the file"
- "Quick status check on the project"
- "Can you resend that link?"

Generate a 4-6 turn conversation between USER and ASSISTANT.
Start mid-conversation (no greetings).

OUTPUT FORMAT (JSON only):
{{
"scenario_id": "none_{random.randint(100,999)}",
"conversation": [
{{"role": "user", "content": "..."}},
{{"role": "assistant", "content": "..."}}
],
"labels": {{
"categories": ["none"],
"persistence_horizon": "short",
"memory_scope": "none",
"rationale": "This conversation is transactional/temporary with no memory value"
}},
"metadata": {{
"primary_category": "none",
"turn_count": 4
}}
}}"""
        else:
            prompt = f"""Generate a marketing conversation that clearly demonstrates the category: {category}

CATEGORY DEFINITION:
{description}

SIGNALS THAT INDICATE THIS CATEGORY:
{chr(10).join(f"- {s}" for s in example_signals[:4])}

EXAMPLE UTTERANCE:
{example_conv}

REQUIREMENTS:
1. The conversation MUST contain clear signals for {category}
2. The USER should explicitly state information that maps to this category
3. Make it natural and realistic - embed the signals organically
4. 4-6 turns, start mid-conversation (no greetings)
5. Include specific, concrete details (names, numbers, dates)

CRITICAL: The output categories array MUST include "{category}" as the primary category.
You may include 1 additional category if naturally present, but {category} MUST be there.

OUTPUT FORMAT (JSON only):
{{
"scenario_id": "{category.replace('.', '_')}_{random.randint(100,999)}",
"conversation": [
{{"role": "user", "content": "..."}},
{{"role": "assistant", "content": "..."}}
],
"labels": {{
"categories": ["{category}"],
"persistence_horizon": "long|medium|short",
"memory_scope": "company|user",
"rationale": "Explanation of why {category} applies"
}},
"metadata": {{
"primary_category": "{category}",
"turn_count": 4
}}
}}"""

        for attempt in range(max_retries):
            try:
                response = self.client.chat(
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.7,
                    model=self.model,
                    response_format={"type": "json_object"}
                )

                content = self._extract_text(response)
                if not content:
                    continue

                # Clean any surrounding Markdown fence before parsing.
                data = json.loads(self._strip_code_fence(content))

                # VALIDATE: ensure the target category is actually present.
                categories = data.get("labels", {}).get("categories", [])
                if category.lower() not in [c.lower() for c in categories]:
                    print(f" Warning: Target {category} not in output {categories}. Retrying...")
                    continue

                # Clean: "none" is mutually exclusive with real categories.
                if len(categories) > 1 and "none" in [c.lower() for c in categories]:
                    data["labels"]["categories"] = [c for c in categories if c.lower() != "none"]

                return data

            except Exception as e:
                # Linear backoff between retries to stay under rate limits.
                print(f" Attempt {attempt+1} failed: {e}")
                time.sleep(5 * (attempt + 1))

        return None
316
+
317
+
318
async def generate_balanced_dataset(output_dir: str = "synthetic_data", target_per_category: int = 80):
    """Generate a balanced dataset with an equal number of examples per category.

    Accepted examples are appended to a timestamped JSONL file as they arrive,
    and progress is mirrored into a plain-text log. Returns the output path.
    """
    os.makedirs(output_dir, exist_ok=True)
    gen = BalancedDataGenerator()

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"{output_dir}/balanced_dataset_{stamp}.jsonl"
    log_file = f"{output_dir}/balanced_generation_log_{stamp}.txt"

    records = []
    counts = dict.fromkeys(BALANCED_DISTRIBUTION, 0)

    banner = "=" * 70
    print(banner, flush=True)
    print("BALANCED DATASET GENERATION", flush=True)
    print(banner, flush=True)
    print(f"Target per category: {target_per_category}", flush=True)
    print(f"Total categories: {len(BALANCED_DISTRIBUTION)}", flush=True)
    print(f"Expected total: {target_per_category * len(BALANCED_DISTRIBUTION)}", flush=True)
    print(flush=True)

    with open(log_file, "w") as log:
        log.write(f"Balanced Generation Started: {stamp}\n")
        log.write(f"Target per category: {target_per_category}\n\n")

        for category in BALANCED_DISTRIBUTION:
            print(f"\n--- Generating {target_per_category} examples for: {category} ---", flush=True)
            log.write(f"\n=== {category} ===\n")
            log.flush()

            for idx in range(1, target_per_category + 1):
                item = gen.generate_for_category(category)

                if item is not None:
                    records.append(item)
                    counts[category] += 1

                    # Append immediately so a crash never loses finished work.
                    with open(output_file, "a") as out:
                        out.write(json.dumps(item) + "\n")

                    if idx % 10 == 0:
                        print(f" Progress: {idx}/{target_per_category}", flush=True)
                        log.write(f" {idx}/{target_per_category} complete\n")
                        log.flush()
                else:
                    print(f" Failed: {idx}", flush=True)
                    log.write(f" Failed to generate example {idx}\n")
                    log.flush()

                # Gentle pacing between API calls.
                await asyncio.sleep(0.5)

            print(f" Completed: {counts[category]}/{target_per_category}", flush=True)

    # Final summary, most-populated categories first.
    print("\n" + banner)
    print("GENERATION COMPLETE")
    print(banner)
    print(f"\nCategory Distribution:")
    for cat, n in sorted(counts.items(), key=lambda kv: -kv[1]):
        share = n / len(records) * 100 if records else 0
        print(f" {cat:<40} {n:>4} ({share:.1f}%)")

    print(f"\nTotal examples: {len(records)}")
    print(f"Output file: {output_file}")

    return output_file
386
+
387
+
388
if __name__ == "__main__":
    # Optional CLI argument: examples per category (default 80).
    args = sys.argv[1:]
    per_category = int(args[0]) if args else 80
    asyncio.run(generate_balanced_dataset(target_per_category=per_category))
391
+
synthetic_data/run_batch.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import time
4
+ import sys
5
+ from typing import List, Dict, Any
6
+ from synthetic_data.pipeline import SyntheticDataPipeline
7
+ from synthetic_data.validate import validate_synthetic_data
8
+
9
# Sampling weights used with random.choices when picking a category per item.
# NOTE(review): these values sum to 1.10, not 1.00 — random.choices only uses
# them relatively, so sampling still works, but the numbers read as
# percentages; confirm the intended distribution.
CATEGORY_DISTRIBUTION = {
    "company.brand_core": 0.10,
    "company.strategic_signatures": 0.08,
    "company.knowledge_artifacts": 0.08,
    "company.business_priorities": 0.10,
    "company.tools_config": 0.07,
    "company.performance_context": 0.09,
    "user.communication_style": 0.10,
    "user.strategic_approach": 0.09,
    "user.role_context": 0.07,
    "user.workflow_patterns": 0.08,
    "user.session_history": 0.06,
    "user.interaction_preferences": 0.08,
    "none": 0.10
}
24
+
25
def run_pipeline_batches(total_items: int = 100, batch_size: int = 10):
    """Generate ``total_items`` conversations in validated, checkpointed batches.

    Categories are sampled from CATEGORY_DISTRIBUTION; each batch is saved to
    its own JSON file and validated before moving on.

    Fix over the previous version: ``total_items`` is now honored exactly.
    Floor division used to drop the remainder (105/10 produced only 100
    items) and over-generate when total_items < batch_size (5/10 produced 10).
    """
    pipeline = SyntheticDataPipeline()
    categories = list(CATEGORY_DISTRIBUTION.keys())
    weights = list(CATEGORY_DISTRIBUTION.values())

    all_data = []
    # Ceiling division so the remainder gets its own (smaller) final batch.
    num_batches = max(1, -(-total_items // batch_size))

    print(f"Starting generation of {total_items} items in {num_batches} batches (Size: {batch_size})...")

    for batch_num in range(1, num_batches + 1):
        print(f"\n=== Processing Batch {batch_num}/{num_batches} ===")
        batch_data = []
        # The last batch may be short when total_items % batch_size != 0.
        current_batch_size = min(batch_size, total_items - len(all_data))

        while len(batch_data) < current_batch_size:
            category = random.choices(categories, weights=weights, k=1)[0]
            current_count = len(batch_data) + 1
            print(f" Generating item {current_count}/{current_batch_size} (Category: {category})...")

            # Determine if we should add a distractor (30% chance).
            distractor = None
            if random.random() < 0.30 and category != "none":
                possible_distractors = [c for c in categories if c != category and c != "none"]
                if possible_distractors:
                    distractor = random.choice(possible_distractors)

            persistence = _get_persistence_for_category(category)
            turns = random.randint(4, 10)

            scenario = pipeline.generate_scenario_spec(
                category=category,
                distractor=distractor,
                persistence=persistence,
                turns=turns
            )

            if not scenario:
                # Back off before retrying this slot (rate-limit friendly).
                print(f" Failed to generate scenario for {category}. Retrying...")
                time.sleep(20)
                continue

            conversation = pipeline.generate_conversation(scenario, turn_count=turns)

            if conversation:
                batch_data.append(conversation)
                print(f" Generated: {conversation.get('scenario_id', 'Unknown ID')}")
            else:
                print(f" Failed to generate conversation for {category}. Retrying...")
                time.sleep(20)
                continue

            print(" Sleeping for 15s to avoid rate limits...")
            time.sleep(15)

        # Save batch checkpoint.
        batch_filename = f"synthetic_data/batch_{batch_num:02d}.json"
        with open(batch_filename, "w") as f:
            json.dump(batch_data, f, indent=2)
        print(f" Saved batch to {batch_filename}")

        # Validate batch before accepting it into the full dataset.
        print(" Validating batch...")
        metrics = validate_synthetic_data(batch_filename)
        print(json.dumps(metrics, indent=2))

        all_data.extend(batch_data)

    # Save the full dataset.
    with open("synthetic_data/all_generated_data_100.json", "w") as f:
        json.dump(all_data, f, indent=2)
    print(f"\nCompleted. Total items generated: {len(all_data)}")
    print("Full dataset saved to synthetic_data/all_generated_data_100.json")
97
+
98
+ def _get_persistence_for_category(category: str) -> str:
99
+ if "brand_core" in category or "strategic_signatures" in category or "knowledge_artifacts" in category or "communication_style" in category or "strategic_approach" in category:
100
+ return "long"
101
+ elif "tools_config" in category or "role_context" in category or "workflow_patterns" in category:
102
+ return "medium"
103
+ elif "business_priorities" in category or "session_history" in category:
104
+ return "short"
105
+ elif "performance_context" in category:
106
+ return "rolling"
107
+ elif "interaction_preferences" in category:
108
+ return "evolving"
109
+ elif "none" in category:
110
+ return "short"
111
+ return "medium"
112
+
113
if __name__ == "__main__":
    # CLI: [total_items] [batch_size], defaulting to 100 and 10.
    cli = sys.argv[1:]
    total = int(cli[0]) if len(cli) > 0 else 100
    batch = int(cli[1]) if len(cli) > 1 else 10
    run_pipeline_batches(total, batch)
synthetic_data/run_batch_async.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import json
import os
import random
import sys
import time
from typing import Any, Dict, List, Optional

from synthetic_data.pipeline import SyntheticDataPipeline
from synthetic_data.validate import validate_synthetic_data
from synthetic_data.clean_data import clean_datum
11
+
12
# Sampling weights for category selection (sum to 1.00). Grouped by tier:
# the most frequent categories first, then mid-tier, then the rarest.
CATEGORY_DISTRIBUTION = {
    "none": 0.15,
    "user.interaction_preferences": 0.12,
    "user.session_history": 0.10,
    "company.brand_core": 0.10,

    "company.strategic_signatures": 0.07,
    "company.knowledge_artifacts": 0.07,
    "user.communication_style": 0.07,
    "user.strategic_approach": 0.07,
    "user.workflow_patterns": 0.07,

    "company.tools_config": 0.05,
    "company.performance_context": 0.05,
    "company.business_priorities": 0.04,
    "user.role_context": 0.04
}
29
+
30
async def generate_single_item(pipeline: SyntheticDataPipeline, category: str, item_num: int) -> Optional[Dict[str, Any]]:
    """Generate one conversation item for ``category`` without blocking the loop.

    The pipeline's blocking API calls are dispatched to the default executor
    so several items can be generated concurrently. Returns the cleaned
    conversation dict, or None when scenario/conversation generation fails
    (hence the Optional return annotation — the old one claimed it always
    returned a dict).
    """
    print(f" Starting item {item_num} (Target: {category})...")

    # 30% of non-"none" items get a distractor category to make labels harder.
    categories = list(CATEGORY_DISTRIBUTION.keys())
    distractor = None
    if random.random() < 0.30 and category != "none":
        possible_distractors = [c for c in categories if c != category and c != "none"]
        if possible_distractors:
            distractor = random.choice(possible_distractors)

    persistence = _get_persistence_for_category(category)
    turns = random.randint(4, 10)

    # Fix: get_running_loop() is the correct call inside a coroutine;
    # get_event_loop() is deprecated in this context since Python 3.10.
    loop = asyncio.get_running_loop()
    scenario = await loop.run_in_executor(
        None,
        pipeline.generate_scenario_spec,
        category,
        distractor,
        persistence,
        "neutral",
        turns,
        ""
    )

    if not scenario:
        print(f" Failed item {item_num}: scenario generation failed")
        return None

    conversation = await loop.run_in_executor(
        None,
        pipeline.generate_conversation,
        scenario,
        turns,
        category
    )

    if not conversation:
        print(f" Failed item {item_num}: conversation generation failed")
        return None

    # Clean immediately so every saved item is already in its final form.
    cleaned_conversation = clean_datum(conversation)
    print(f" Completed item {item_num}: {cleaned_conversation.get('scenario_id', 'Unknown')}")
    return cleaned_conversation
79
+
80
async def generate_batch_concurrent(pipeline: SyntheticDataPipeline, batch_size: int, batch_num: int) -> List[Dict[str, Any]]:
    """Fill one batch by fanning out concurrent generation tasks.

    Keeps relaunching tasks for the shortfall until exactly ``batch_size``
    items have been produced, then returns them.
    """
    print(f"\n=== Processing Batch {batch_num} (Concurrent) ===")

    names = list(CATEGORY_DISTRIBUTION.keys())
    probs = list(CATEGORY_DISTRIBUTION.values())

    collected: List[Dict[str, Any]] = []
    shortfall = batch_size

    while shortfall > 0:
        # Draw a weighted category for every missing slot.
        picks = random.choices(names, weights=probs, k=shortfall)

        jobs = [
            generate_single_item(pipeline, cat, len(collected) + offset + 1)
            for offset, cat in enumerate(picks)
        ]

        print(f" Launch {shortfall} concurrent tasks...")
        outcomes = await asyncio.gather(*jobs, return_exceptions=True)

        # Keep successful items; report exceptions and drop failed (None) slots.
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                print(f" Task exception: {outcome}")
            elif outcome is not None:
                collected.append(outcome)

        shortfall = batch_size - len(collected)
        if shortfall > 0:
            print(f" Batch incomplete ({len(collected)}/{batch_size}). Retrying {shortfall} items in 5s...")
            await asyncio.sleep(5)

    print(f"Batch {batch_num} complete: {len(collected)}/{batch_size} items generated")
    return collected
119
+
120
async def run_pipeline_batches_async(total_items: int = 100, batch_size: int = 10):
    """Run the full pipeline with concurrent batch processing.

    Resumable: batches already present on disk are loaded instead of
    regenerated.

    Fix over the previous version: ``total_items`` is now honored exactly.
    Floor division used to drop the remainder (105/10 produced only 100
    items) and over-generate when total_items < batch_size (5/10 produced 10).
    """
    pipeline = SyntheticDataPipeline(max_retries=5)

    all_data = []
    # Ceiling division so the final partial batch is still generated.
    num_batches = max(1, -(-total_items // batch_size))

    print(f"Starting CONCURRENT generation of {total_items} items in {num_batches} batches...")
    print(f"Batch size: {batch_size} items (generated in parallel)")

    for batch_num in range(1, num_batches + 1):
        # Resume support: skip batches that were already generated.
        batch_filename = f"synthetic_data/batch_{batch_num:02d}.jsonl"
        if os.path.exists(batch_filename):
            print(f"Batch {batch_num} already exists ({batch_filename}). Skipping generation...")
            try:
                with open(batch_filename, 'r') as f:
                    for line in f:
                        if line.strip():
                            all_data.append(json.loads(line))
                print(f"Loaded {len(all_data)} items so far.")
                continue
            except Exception as e:
                # A corrupt checkpoint falls through to regeneration.
                print(f"Error reading existing batch {batch_num}: {e}. Regenerating...")

        # The last batch may be smaller than batch_size.
        current_batch_size = min(batch_size, total_items - (batch_num - 1) * batch_size)

        # Generate the entire batch concurrently.
        batch_data = await generate_batch_concurrent(pipeline, current_batch_size, batch_num)

        # Save batch as JSONL.
        with open(batch_filename, "w") as f:
            for item in batch_data:
                f.write(json.dumps(item) + "\n")
        print(f"Saved batch to {batch_filename}")

        # Validate batch before accepting it into the full dataset.
        print("Validating batch...")
        metrics = validate_synthetic_data(batch_filename)
        print(json.dumps(metrics, indent=2))

        all_data.extend(batch_data)

        # Brief pause between batches to stay under rate limits.
        if batch_num < num_batches:
            print("Waiting 5 seconds before next batch...")
            await asyncio.sleep(5)

    # Save the full dataset.
    output_file = f"synthetic_data/all_generated_data_{total_items}.jsonl"
    with open(output_file, "w") as f:
        for item in all_data:
            f.write(json.dumps(item) + "\n")

    print(f"\n{'='*60}")
    print(f"COMPLETED: {len(all_data)} items generated")
    print(f"Full dataset saved to {output_file}")
    print(f"{'='*60}")
177
+
178
+ def _get_persistence_for_category(category: str) -> str:
179
+ """Map category to its expected persistence level."""
180
+ if "brand_core" in category or "strategic_signatures" in category or "knowledge_artifacts" in category or "communication_style" in category or "strategic_approach" in category:
181
+ return "long"
182
+ elif "tools_config" in category or "role_context" in category or "workflow_patterns" in category:
183
+ return "medium"
184
+ elif "business_priorities" in category or "session_history" in category:
185
+ return "short"
186
+ elif "performance_context" in category:
187
+ return "rolling"
188
+ elif "interaction_preferences" in category:
189
+ return "evolving"
190
+ elif "none" in category:
191
+ return "short"
192
+ return "medium"
193
+
194
if __name__ == "__main__":
    # CLI: [total_items] [batch_size]; defaults are 100 and 10.
    argv = sys.argv[1:]
    total = int(argv[0]) if len(argv) > 0 else 100
    batch = int(argv[1]) if len(argv) > 1 else 10
    asyncio.run(run_pipeline_batches_async(total, batch))
synthetic_data/run_diverse_generation.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ World-Class Diverse Dataset Generation - 20 Concurrent API Calls per Batch
3
+
4
+ Key features:
5
+ - 20 API calls simultaneously per batch
6
+ - Wait for batch to complete, then next batch
7
+ - Temperature 0.95 for maximum diversity
8
+ - No templates, maximum creative freedom
9
+ """
10
+
11
+ import json
12
+ import random
13
+ import os
14
+ import asyncio
15
+ from typing import List, Dict, Optional
16
+ from datetime import datetime
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ import cohere
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+
23
# Per-category generation quotas: 13 categories x 77 examples = 1001 total.
CATEGORY_TARGETS = {
    "company.brand_core": 77,
    "company.strategic_signatures": 77,
    "company.knowledge_artifacts": 77,
    "company.business_priorities": 77,
    "company.tools_config": 77,
    "company.performance_context": 77,
    "user.communication_style": 77,
    "user.strategic_approach": 77,
    "user.role_context": 77,
    "user.workflow_patterns": 77,
    "user.session_history": 77,
    "user.interaction_preferences": 77,
    "none": 77,
}

# Business-context seeds: one is sampled per prompt to force scenario variety.
INDUSTRIES = [
    "Series A fintech building a neobank", "hospital network digitizing patient intake",
    "DTC sneaker brand scaling to retail", "industrial valve manufacturer going digital",
    "K-12 tutoring platform expanding to Asia", "commercial real estate analytics startup",
    "ghost kitchen aggregator in NYC", "enterprise zero-trust security vendor",
    "luxury cruise line post-pandemic", "connected fitness hardware company",
    "immigration law firm automating visas", "recruiting platform for nurses",
    "pet insurance disruptor", "last-mile drone delivery startup",
    "indie game studio with a viral hit", "podcast network monetizing premium content",
    "EV charging network operator", "solar panel installer franchise",
    "modular home construction startup", "veterinary telehealth platform",
    "wine subscription service", "corporate wellness SaaS",
    "NFT marketplace pivoting to digital art", "AI code review tool for enterprises",
    "climate risk analytics for insurers", "restaurant POS system provider",
    "online therapy platform", "B2B payments infrastructure",
    "influencer marketing agency", "smart home security company"
]

# Speaker archetypes: one is sampled per prompt as the "user" voice.
PERSONAS = [
    "a stressed CMO preparing for board review",
    "a junior marketing coordinator on their first campaign",
    "a VP who just joined from a competitor",
    "a founder wearing multiple hats",
    "a seasoned brand director with 20 years experience",
    "a growth lead obsessed with metrics",
    "a creative director frustrated with process",
    "a demand gen manager under pressure to hit pipeline",
    "a content strategist building a new team",
    "a marketing ops person drowning in tools",
    "a product marketer launching next week",
    "an email specialist optimizing deliverability",
    "a social media manager handling a PR crisis",
    "a field marketer planning regional events",
    "a partner marketing lead negotiating co-marketing",
    "an analyst presenting attribution findings"
]

# Situational framing: gives each conversation a concrete moment in time.
SITUATIONS = [
    "in the middle of a heated planning session",
    "wrapping up a long day before vacation",
    "preparing for a last-minute executive ask",
    "debugging why a campaign tanked",
    "celebrating a successful launch",
    "onboarding after joining last week",
    "dealing with budget cuts",
    "scaling something that unexpectedly worked",
    "cleaning up a predecessor's mess",
    "trying to align with a difficult stakeholder"
]

# Emotional register sampled per conversation.
TONES = ["urgent", "casual", "frustrated", "excited", "methodical", "skeptical", "collaborative", "directive"]

# Per-category steering text appended to the generation prompt; tells the
# model what signal the conversation must surface for that label.
CATEGORY_HINTS = {
    "company.brand_core": "The conversation should naturally surface brand identity elements - could be voice, visuals, values, positioning, or personality.",
    "company.strategic_signatures": "The conversation should reveal how this company makes decisions - their frameworks, principles, or recurring patterns.",
    "company.knowledge_artifacts": "The conversation should reference internal documentation - guides, playbooks, templates, or SOPs.",
    "company.business_priorities": "The conversation should touch on current goals, quarterly targets, or active initiatives.",
    "company.tools_config": "The conversation should involve tool setup, integrations, APIs, or workflow automation.",
    "company.performance_context": "The conversation should discuss metrics, campaign results, or performance learnings.",
    "user.communication_style": "The user should express how they prefer to receive information - format, length, tone, or style.",
    "user.strategic_approach": "The user should reveal their personal philosophy, priorities, or decision-making style.",
    "user.role_context": "The user should mention their role, responsibilities, authority, or team structure.",
    "user.workflow_patterns": "The user should describe their schedule, review process, or collaboration preferences.",
    "user.session_history": "The conversation should reference recent context, ongoing work, or previous discussions.",
    "user.interaction_preferences": "The user should express how they want the AI to behave - proactivity, feedback style, or coaching level.",
    "none": "The conversation should be purely transactional with nothing worth remembering long-term."
}
106
+
107
+
108
class ConcurrentGenerator:
    """Generates diverse synthetic conversations with up to 20 concurrent calls.

    Blocking Cohere API calls are dispatched onto a thread pool so a whole
    batch of prompts runs in parallel from a single asyncio coroutine.
    """

    def __init__(self):
        self.api_key = os.getenv("COHERE_API_KEY")
        if not self.api_key:
            raise ValueError("COHERE_API_KEY not found")
        self.client = cohere.ClientV2(api_key=self.api_key)
        self.model = "command-r-plus-08-2024"
        # One worker per concurrent request in a batch.
        self.executor = ThreadPoolExecutor(max_workers=20)

    def _extract_text(self, response) -> Optional[str]:
        """Return the first non-empty text block from a chat response, else None."""
        if not response or not getattr(response, "message", None):
            return None
        blocks = getattr(response.message, "content", []) or []
        for block in blocks:
            text = getattr(block, "text", None)
            if isinstance(text, str) and text.strip():
                return text
        return None

    def _generate_one(self, category: str) -> Optional[Dict]:
        """Generate a single example with maximum creativity.

        Runs synchronously (called from the thread pool). Returns the parsed
        example dict, or None when the API call fails, the output is not
        valid JSON, or the target category is missing from the labels.
        """
        # Random scenario seeds keep the dataset diverse across calls.
        industry = random.choice(INDUSTRIES)
        persona = random.choice(PERSONAS)
        situation = random.choice(SITUATIONS)
        tone = random.choice(TONES)
        turns = random.randint(3, 10)
        hint = CATEGORY_HINTS.get(category, "")

        if category == "none":
            prompt = f"""You are a creative writer generating training data for an AI memory system.

Create a completely realistic conversation between {persona} at a {industry} and their AI marketing assistant.

Context: They are {situation}. The tone is {tone}.

THIS CONVERSATION MUST BE FORGETTABLE - nothing worth storing in long-term memory:
- Quick status checks, scheduling, or confirmations
- Vague questions without actionable details
- Chitchat or temporary context that expires immediately

Be creative. Make it feel real. No templates. Surprise me.

Output as JSON with this structure:
{{"scenario_id": "unique_id", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["none"], "persistence_horizon": "short", "memory_scope": "none", "rationale": "why this is unmemorable"}}, "metadata": {{"primary_category": "none", "turn_count": {turns}, "industry": "{industry}"}}}}"""

        else:
            prompt = f"""You are a world-class creative writer generating training data for an AI memory routing system.

Create a completely unique, realistic conversation between {persona} at a {industry} and their AI marketing assistant.

Context: They are {situation}. The tone is {tone}.

CATEGORY TO DEMONSTRATE: {category}
{hint}

CREATIVE FREEDOM:
- Invent specific, realistic details (names, numbers, dates, products)
- The conversation can start anywhere - mid-thought, mid-project, mid-crisis
- Vary structure dramatically - could be rapid-fire, could be detailed
- Include natural speech patterns, interruptions, tangents
- Make it feel like eavesdropping on a real conversation
- {turns} turns, but quality over quantity

The ONLY hard requirement: the conversation must clearly demonstrate {category}.

Output as JSON:
{{"scenario_id": "unique_id", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["{category}"], "persistence_horizon": "long/medium/short", "memory_scope": "{category.split('.')[0]}", "rationale": "why this fits {category}"}}, "metadata": {{"primary_category": "{category}", "turn_count": {turns}, "industry": "{industry}"}}}}"""

        try:
            response = self.client.chat(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.95,
                model=self.model,
                response_format={"type": "json_object"}
            )

            content = self._extract_text(response)
            if not content:
                return None

            # Strip a surrounding Markdown code fence if the model added one.
            if content.startswith("```"):
                content = content.split("\n", 1)[1] if "\n" in content else content[3:]
            if content.endswith("```"):
                content = content[:-3]

            data = json.loads(content.strip())

            # Reject outputs that dropped the requested category.
            categories = data.get("labels", {}).get("categories", [])
            if category.lower() not in [c.lower() for c in categories]:
                return None

            # "none" is mutually exclusive with real categories.
            if len(categories) > 1 and "none" in [c.lower() for c in categories]:
                data["labels"]["categories"] = [c for c in categories if c.lower() != "none"]

            return data

        except Exception as e:
            # Fix: this previously swallowed every error silently, making API
            # failures indistinguishable from rejected outputs. Log briefly;
            # the caller treats None as a retryable miss either way.
            print(f" [{category}] generation error: {e}", flush=True)
            return None

    async def generate_batch_concurrent(self, categories: List[str]) -> List[Dict]:
        """Run one _generate_one call per category on the thread pool."""
        # Fix: get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()
        tasks = [
            loop.run_in_executor(self.executor, self._generate_one, cat)
            for cat in categories
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Drop failures (None) and raised exceptions alike.
        return [r for r in results if isinstance(r, dict)]
217
+
218
+
219
async def run_generation():
    """Drive batched concurrent generation until every category hits its target.

    Each accepted example is appended to a timestamped JSONL file as soon as
    it arrives, so progress survives interruption. Returns the output path.
    """
    generator = ConcurrentGenerator()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"synthetic_data/diverse_dataset_{timestamp}.jsonl"

    category_counts = {cat: 0 for cat in CATEGORY_TARGETS}
    all_data = []

    print("=" * 70, flush=True)
    print("WORLD-CLASS DATASET GENERATION (20 Concurrent)", flush=True)
    print("=" * 70, flush=True)
    print(f"Batch size: 20 concurrent API calls", flush=True)
    print(f"Temperature: 0.95", flush=True)
    print(f"Target: 77 per category x 13 = 1001 total", flush=True)
    print(f"Output: {output_file}", flush=True)
    print("=" * 70, flush=True)

    batch_num = 0
    start_time = datetime.now()

    while True:
        # Build list of needed categories, capped at 3 slots per category per
        # batch so no single category monopolizes a batch.
        needed = []
        for cat, target in CATEGORY_TARGETS.items():
            remaining = target - category_counts[cat]
            if remaining > 0:
                needed.extend([cat] * min(remaining, 3))  # Up to 3 per category per batch

        # All quotas filled -> done.
        if not needed:
            break

        random.shuffle(needed)
        batch_categories = needed[:20]  # 20 concurrent
        batch_num += 1

        print(f"\n[Batch {batch_num}] Launching 20 concurrent requests...", flush=True)
        batch_start = datetime.now()

        results = await generator.generate_batch_concurrent(batch_categories)

        batch_time = (datetime.now() - batch_start).seconds

        for result in results:
            if result:
                # Prefer the declared primary category; fall back to the
                # first label if the metadata field is missing/empty.
                primary = result.get("metadata", {}).get("primary_category") or \
                          result.get("labels", {}).get("categories", ["unknown"])[0]

                # Only count and persist results with a recognized category.
                if primary in category_counts:
                    category_counts[primary] += 1
                    all_data.append(result)

                    # Append immediately so a crash never loses accepted work.
                    with open(output_file, "a") as f:
                        f.write(json.dumps(result) + "\n")

                    conv = result.get("conversation", [])
                    if conv and len(conv) > 0:
                        first_msg = conv[0].get("content", "") if isinstance(conv[0], dict) else str(conv[0])
                        print(f" [{primary}] {first_msg[:60]}...", flush=True)

        total_done = sum(category_counts.values())
        total_target = sum(CATEGORY_TARGETS.values())
        elapsed = (datetime.now() - start_time).seconds
        # rate is examples per minute, so eta below is in minutes; the
        # max() guards avoid division by zero early in the run.
        rate = total_done / max(elapsed, 1) * 60
        eta = (total_target - total_done) / max(rate, 0.1)

        print(f" Batch: {len(results)}/20 success in {batch_time}s | Total: {total_done}/{total_target} | Rate: {rate:.1f}/min | ETA: {eta:.0f}min", flush=True)

        # Progress breakdown every 10 batches.
        if batch_num % 10 == 0:
            print("\n === Category Breakdown ===", flush=True)
            for cat in sorted(category_counts.keys()):
                count = category_counts[cat]
                target = CATEGORY_TARGETS[cat]
                # 20-character progress bar filled proportionally to completion.
                bar = "█" * (count * 20 // target) + "░" * (20 - count * 20 // target)
                print(f" {cat:<35} [{bar}] {count:>3}/{target}", flush=True)
            print()

        # Wait 3 seconds between batches (rate-limit pacing).
        await asyncio.sleep(3)

    print("\n" + "=" * 70, flush=True)
    print("GENERATION COMPLETE", flush=True)
    print("=" * 70, flush=True)
    elapsed_total = (datetime.now() - start_time).seconds / 60
    print(f"Total: {len(all_data)} examples in {elapsed_total:.1f} minutes", flush=True)
    print(f"Output: {output_file}", flush=True)

    return output_file
307
+
308
+
309
if __name__ == "__main__":
    # Script entry point: run the full generation loop to completion.
    asyncio.run(run_generation())
synthetic_data/sample_batch.json ADDED
@@ -0,0 +1,841 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "scenario_id": "company.brand_core_strategy_789",
4
+ "conversation": [
5
+ {
6
+ "role": "user",
7
+ "content": "Building on our Q3 discussion about brand consistency\u2014how does the loyalty program reinforce 'Timeless Elegance' in digital touchpoints?"
8
+ },
9
+ {
10
+ "role": "assistant",
11
+ "content": "Previously, we aligned on using minimalist UI and exclusive content to reflect that. For loyalty, this could mean members-only digital lookbooks showcasing archival pieces. Absolutely, that's a key consideration when maintaining our brand's heritage digitally."
12
+ },
13
+ {
14
+ "role": "user",
15
+ "content": "How should we balance exclusivity with accessibility in rewards? Our urban professionals expect privilege but also practicality."
16
+ },
17
+ {
18
+ "role": "assistant",
19
+ "content": "A tiered rewards system would work here. Base tier offers practical perks like complimentary alterations and priority customer service. Meanwhile, top-tier members get exclusive experiences\u2014think private styling sessions or early access to limited-edition collaborations. That makes sense\u2014let me structure this."
20
+ },
21
+ {
22
+ "role": "user",
23
+ "content": "Will the AI style profiling integrate here? How do we avoid diluting the 'Modern Edge' promise?"
24
+ },
25
+ {
26
+ "role": "assistant",
27
+ "content": "Profile data could fuel hyper-personalized rewards\u2014for instance, offering avant-garde capsule collection access to members with edgier profiles. We'd track redemption metrics against brand alignment to ensure we're staying true to our identity. Also, we could include a feedback loop where members rate reward 'edginess' to refine future offerings."
28
+ }
29
+ ],
30
+ "labels": {
31
+ "categories": [
32
+ "company.brand_core",
33
+ "company.strategic_signatures",
34
+ "company.business_priorities",
35
+ "user.role_context",
36
+ "user.strategic_approach"
37
+ ],
38
+ "persistence_horizon": "mixed",
39
+ "memory_scope": "mixed",
40
+ "rationale": "Conversation centers on brand identity (company.brand_core) and strategic frameworks (company.strategic_signatures) for loyalty program design, while referencing active initiatives (business_priorities) and the user's strategic priorities (user.strategic_approach) and role scope (user.role_context)."
41
+ },
42
+ "metadata": {
43
+ "scenario_type": "loyalty_program_alignment",
44
+ "primary_category": "company.brand_core",
45
+ "distractor_present": false,
46
+ "turn_count": 6,
47
+ "signals_present": [
48
+ "Timeless Elegance with Modern Edge",
49
+ "Urban professionals (25-45) valuing curated exclusivity",
50
+ "Accessible Luxury Without Compromise",
51
+ "Personalization through AI-driven style profiling"
52
+ ]
53
+ }
54
+ },
55
+ {
56
+ "scenario_id": "company.brand_core_protocol_update_142",
57
+ "conversation": [
58
+ {
59
+ "role": "user",
60
+ "content": "Need to finalize the safety messaging section before the summit \u2013 should we integrate the new accident response protocols here? Legal\u2019s been pushing for clearer driver guidelines."
61
+ },
62
+ {
63
+ "role": "assistant",
64
+ "content": "Absolutely, building on our Project Compass foundation. The 'Safety First' pillar should absorb those protocols. Remember the crisis comms framework we locked in March? We can mirror that structure here for consistency. Just making sure we're not reinventing the wheel."
65
+ },
66
+ {
67
+ "role": "user",
68
+ "content": "Yes, but how does 'Community Trust' tie into driver conduct without stepping on HR\u2019s toes? Need a clear boundary there \u2013 don\u2019t want confusion between brand expectations and official policies."
69
+ },
70
+ {
71
+ "role": "assistant",
72
+ "content": "Understood. Per our May 12th brand guidelines review, 'Community Trust' specifically addresses passenger interaction norms \u2013 smiles, greetings, conflict de-escalation. I\u2019ll pull the approved driver conduct verbiage from that document. HR policies remain separate but we\u2019ll cross-reference them in footnotes."
73
+ },
74
+ {
75
+ "role": "user",
76
+ "content": "Good. And the newsletter template \u2013 legal wants all three pillars in every header. Confirm we\u2019re not just stuffing them in the footer anymore? Drivers need to see these front and center."
77
+ },
78
+ {
79
+ "role": "assistant",
80
+ "content": "Confirmed. Template v3.1, approved June 8th, embeds all pillars in the header with the 'Seamless Journeys' tagline. Footer contains compliance links. Bi-weekly revision cycle stays on track for July 15th summit distribution. Let me double-check that version for you."
81
+ }
82
+ ],
83
+ "labels": {
84
+ "categories": [
85
+ "company.brand_core",
86
+ "company.business_priorities",
87
+ "company.knowledge_artifacts",
88
+ "user.workflow_patterns"
89
+ ],
90
+ "persistence_horizon": "mixed",
91
+ "memory_scope": "mixed",
92
+ "rationale": "Brand pillars and Project Compass reference require long-term retention (brand_core). Summit deadline and template revisions are short-term priorities (business_priorities). Template versions and guidelines are knowledge artifacts (knowledge_artifacts). Bi-weekly cycles reflect user workflow patterns (user.workflow_patterns)."
93
+ },
94
+ "metadata": {
95
+ "scenario_type": "brand_protocol_update",
96
+ "primary_category": "company.brand_core",
97
+ "distractor_present": false,
98
+ "turn_count": 6,
99
+ "signals_present": [
100
+ "Three brand messaging pillars",
101
+ "Deadline for handbook update: Q3 2025 driver summit",
102
+ "Bi-weekly driver newsletter template revision",
103
+ "Project Compass branding initiative reference"
104
+ ]
105
+ }
106
+ },
107
+ {
108
+ "scenario_id": "company.brand_core_live_ops_strategy_142",
109
+ "conversation": [
110
+ {
111
+ "role": "user",
112
+ "content": "Building on what we discussed earlier about the teaser cadence\u2014I need to ensure the in-game dimensional rift narrative aligns with our brand voice when we drop the lore updates next week. Can't have the tone shifting between mysterious and... whatever the devs wrote last night."
113
+ },
114
+ {
115
+ "role": "assistant",
116
+ "content": "Absolutely. Remember how our 'community co-creation' principle during the Q3 campaign boosted engagement by 22%? That approachable authority tone with witty lore hooks worked perfectly. For this event, should we lean into the *playful challenge* aspect of our brand values while maintaining that witty, authoritative voice?"
117
+ },
118
+ {
119
+ "role": "user",
120
+ "content": "Yes, but the 'dimensional rift' concept needs to feel innovative\u2014our core value\u2014without alienating players. The narrative should invite exploration, not confusion. And the global audience means inclusivity is non-negotiable. Avoid cultural missteps in the lore metaphors."
121
+ },
122
+ {
123
+ "role": "assistant",
124
+ "content": "Understood. Since innovation is our north star here, we could frame the rift as a *collaborative discovery*\u2014using our 'player storytelling' mission\u2014rather than a threat. Tone-wise, think 'curious guide' instead of 'ominous warning'. I\u2019ll reference the brand voice guidelines from our knowledge artifacts for specific phrasing examples."
125
+ },
126
+ {
127
+ "role": "user",
128
+ "content": "Perfect. And make sure the tooltips use gender-neutral terms and avoid region-specific idioms. Oh, and the devs mentioned something about a 'rift meter' UI element\u2014need to ensure that terminology stays consistent with our style guide. Don\u2019t want another 'elixir of fortitude' fiasco."
129
+ },
130
+ {
131
+ "role": "assistant",
132
+ "content": "Noted. Here\u2019s your launch checklist synthesis: 1) Narrative framing as collaborative discovery (innovation + inclusivity), 2) Tone audit against brand voice guidelines, 3) UI terminology cross-check with style guide, 4) Player communication templates with witty/lore-rich hooks. Shall I route these to the content team via our playbook workflow?"
133
+ }
134
+ ],
135
+ "labels": {
136
+ "categories": [
137
+ "company.brand_core",
138
+ "company.performance_context",
139
+ "company.knowledge_artifacts",
140
+ "user.strategic_approach"
141
+ ],
142
+ "persistence_horizon": "mixed",
143
+ "memory_scope": "mixed",
144
+ "rationale": "Brand mission/values (company.brand_core) and past campaign metrics (company.performance_context) require long-term retention. Style guide references (company.knowledge_artifacts) are foundational. User's emphasis on inclusivity and global audience (user.strategic_approach) reflects personal priorities needing long persistence."
145
+ },
146
+ "metadata": {
147
+ "scenario_type": "brand_consistency_live_ops",
148
+ "primary_category": "company.brand_core",
149
+ "distractor_present": false,
150
+ "turn_count": 6,
151
+ "signals_present": [
152
+ "Studio mission",
153
+ "Core brand values",
154
+ "Brand voice guidelines",
155
+ "Q3 2025 campaign reference"
156
+ ]
157
+ }
158
+ },
159
+ {
160
+ "scenario_id": "company.strategic_signatures_ongoing_142",
161
+ "conversation": [
162
+ {
163
+ "role": "user",
164
+ "content": "Let\u2019s revisit the segmentation thresholds we discussed last quarter. The new data shows 20% of Gold members haven\u2019t engaged in 3 months. Should we adjust the recency threshold or introduce re-engagement campaigns?"
165
+ },
166
+ {
167
+ "role": "assistant",
168
+ "content": "Previously, we set Gold status at $500 annual spend with quarterly engagement. Given our 'accessible luxury' positioning, I\u2019d suggest targeted re-engagement first\u2014perhaps exclusive early access to capsule collections via app notifications. That reinforces premium value without diluting the tier\u2019s exclusivity."
169
+ },
170
+ {
171
+ "role": "user",
172
+ "content": "Right, but the retail team also wants to promote in-store pickups for online orders. Can we tie bonus points to that behavior? Maybe 100 points per pickup?"
173
+ },
174
+ {
175
+ "role": "assistant",
176
+ "content": "Strategically, that aligns with our omnichannel goals. We could frame it as 'Gold Concierge Pickup'\u2014offering bonus points *and* priority packaging. This maintains the premium experience while driving the behavior. Need to ensure our POS systems log these redemptions seamlessly alongside app tracking."
177
+ },
178
+ {
179
+ "role": "user",
180
+ "content": "Wait, how does this affect the points expiration policy we finalized in January? We can\u2019t have conflicting rules."
181
+ },
182
+ {
183
+ "role": "assistant",
184
+ "content": "The core 12-month expiration for base points remains, but we can exempt pickup-bonus points from expiration entirely. This incentivizes omnichannel engagement without penalizing active members. Remember, our segmentation model already weights recency heavily\u2014this keeps the system coherent."
185
+ }
186
+ ],
187
+ "labels": {
188
+ "categories": [
189
+ "company.brand_core",
190
+ "company.strategic_signatures",
191
+ "company.business_priorities",
192
+ "user.role_context",
193
+ "user.session_history"
194
+ ],
195
+ "persistence_horizon": "mixed",
196
+ "memory_scope": "mixed",
197
+ "rationale": "Brand positioning ('accessible luxury') and strategic frameworks (tiered thresholds) require long-term persistence, while expiration policies and campaign tactics are shorter-term priorities. User's role context (omnichannel integration) and reference to prior discussions reflect medium/long persistence."
198
+ },
199
+ "metadata": {
200
+ "scenario_type": "loyalty_program_optimization",
201
+ "primary_category": "company.strategic_signatures",
202
+ "distractor_present": false,
203
+ "turn_count": 6,
204
+ "signals_present": [
205
+ "Existing loyalty program's tiered rewards structure",
206
+ "Customer segmentation criteria",
207
+ "Omnichannel integration points",
208
+ "Brand-core value: 'Accessible luxury'"
209
+ ]
210
+ }
211
+ },
212
+ {
213
+ "scenario_id": "company.business_priorities_vendor_collaboration_742",
214
+ "conversation": [
215
+ {
216
+ "role": "user",
217
+ "content": "Building on the EcoVend pilot success\u2014how quickly can we scale real-time inventory APIs to top-tier partners? We need to lock in Q3 timelines."
218
+ },
219
+ {
220
+ "role": "assistant",
221
+ "content": "Absolutely, though we need to balance that with the Q3 deadline extension requests from partners like GreenLogistics. Remember, we pushed their integration to September 15th to address their ERP compatibility concerns. EcoVend's pilot showed promising uptime, but their feedback highlighted API latency spikes during peak hours."
222
+ },
223
+ {
224
+ "role": "user",
225
+ "content": "Right, what other feedback trends are we seeing from vendors? Any recurring pain points beyond latency?"
226
+ },
227
+ {
228
+ "role": "assistant",
229
+ "content": "EcoVend specifically flagged inconsistent data schema mappings causing 15-20% error rates in their warehouse module. Smaller vendors also mentioned documentation gaps in our developer portal. We've got the shared analytics dashboard prototype ready, but finance is still reviewing the $180K budget request for full development."
230
+ },
231
+ {
232
+ "role": "user",
233
+ "content": "Budget status is critical. Any movement there? And remind me\u2014what were the top action items from last quarter's summit that we need to address before scaling?"
234
+ },
235
+ {
236
+ "role": "assistant",
237
+ "content": "Budget remains pending, but we've got preliminary approval to use Q2 rollover funds for vendor testing. From the summit: 1) Establish vendor onboarding SLAs by July, 2) Implement quarterly API performance reviews, and 3) Finalize data-sharing compliance frameworks. Oh, and legal wants all new contracts to reference the dashboard proposal's IP terms."
238
+ },
239
+ {
240
+ "role": "user",
241
+ "content": "Got it. Does the dashboard proposal align with the contract terms we negotiated with Prime Logistics? Just making sure we're not missing any fine print here."
242
+ },
243
+ {
244
+ "role": "assistant",
245
+ "content": "Yes\u2014the proposal's data residency clauses match Prime's requirements, and the 99.5% uptime SLA is codified. We cross-referenced their contract during the prototype phase. Their CTO also signed off on the API rate limits last week."
246
+ },
247
+ {
248
+ "role": "user",
249
+ "content": "Perfect. Let's confirm adjusted timelines then. Push non-critical vendors to Q4 if needed, but GreenLogistics and EcoVend stay on track for September go-live?"
250
+ },
251
+ {
252
+ "role": "assistant",
253
+ "content": "Agreed. GreenLogistics' extended deadline is firm at 9/15, and EcoVend's full integration is set for 9/22. We'll need to prioritize their sandbox testing environments by mid-July. Oh, and the compliance team wants weekly syncs starting next month\u2014should we schedule that?"
254
+ },
255
+ {
256
+ "role": "user",
257
+ "content": "Yeah, put it on the calendar. And send me the summit action items doc again\u2014I\u2019ll review before next week\u2019s steering committee."
258
+ }
259
+ ],
260
+ "labels": {
261
+ "categories": [
262
+ "company.business_priorities",
263
+ "company.performance_context",
264
+ "company.knowledge_artifacts",
265
+ "company.tools_config",
266
+ "none"
267
+ ],
268
+ "persistence_horizon": "mixed",
269
+ "memory_scope": "mixed",
270
+ "rationale": "Business priorities (Q3 deadlines/budget), performance context (pilot feedback metrics), and knowledge artifacts (summit action items/contracts) dominate. Transactional elements (calendar requests) map to 'none'."
271
+ },
272
+ "metadata": {
273
+ "scenario_type": "strategic_planning",
274
+ "primary_category": "company.business_priorities",
275
+ "distractor_present": false,
276
+ "turn_count": 11,
277
+ "signals_present": [
278
+ "Q3 vendor integration deadline extension request from key partners",
279
+ "Pilot program feedback from EcoVend regarding real-time inventory APIs",
280
+ "Pending budget approval for shared analytics dashboard development",
281
+ "Action items from last quarter's vendor collaboration summit"
282
+ ]
283
+ }
284
+ },
285
+ {
286
+ "scenario_id": "company.knowledge_artifacts_collaborative_refinement_742",
287
+ "conversation": [
288
+ {
289
+ "role": "user",
290
+ "content": "Building on the brand voice guidelines we finalized last quarter, we need to adjust our financial services proposal templates. The current drafts feel too generic for this sector's compliance needs."
291
+ },
292
+ {
293
+ "role": "assistant",
294
+ "content": "Understood. Since we're refining sector-specific templates, could you clarify the client segmentation criteria? Previously, we discussed tiers based on asset size\u2014should these variations align with our strategic/growth/volume client definitions?"
295
+ },
296
+ {
297
+ "role": "user",
298
+ "content": "Exactly. Strategic clients (>$5B AUM) require deep customization, growth tier ($1B-$5B) needs balanced detail, and volume (<$1B) should prioritize efficiency. Also, all financial templates must embed our SEC compliance disclaimers by default."
299
+ },
300
+ {
301
+ "role": "assistant",
302
+ "content": "Got it. The segmentation logic matches our earlier brand architecture framework. I've drafted three template variations\u2014strategic version includes the full credentials matrix, growth streamlines case studies, and volume uses modular service snapshots. Shall I share these in Confluence?"
303
+ },
304
+ {
305
+ "role": "user",
306
+ "content": "Yes, but flag the methodology section in the strategic template. Our parent brand's 'integrated advisory approach' needs stronger emphasis there, while sub-brands should maintain their specialized positioning. That consistency is critical for global offices."
307
+ },
308
+ {
309
+ "role": "assistant",
310
+ "content": "Noted. I'll revise the methodology section to anchor in our core value proposition\u2014'Seamless expertise across regulatory landscapes'\u2014while preserving sub-brand flexibility. Next, I'll integrate the credential library with dynamic fields for compliance updates. Expect the refreshed drafts by EOD Friday?"
311
+ }
312
+ ],
313
+ "labels": {
314
+ "categories": [
315
+ "company.brand_core",
316
+ "company.knowledge_artifacts",
317
+ "company.strategic_signatures",
318
+ "user.communication_style",
319
+ "user.role_context"
320
+ ],
321
+ "persistence_horizon": "mixed",
322
+ "memory_scope": "mixed",
323
+ "rationale": "Company categories dominate (brand framework, knowledge artifacts, strategic compliance) with long-term persistence. User's role context and concise feedback style warrant medium-term storage."
324
+ },
325
+ "metadata": {
326
+ "scenario_type": "collaborative_refinement",
327
+ "primary_category": "company.knowledge_artifacts",
328
+ "distractor_present": true,
329
+ "turn_count": 6,
330
+ "signals_present": [
331
+ "Brand architecture framework (parent brand vs. sub-brands)",
332
+ "Core value proposition statements approved by leadership",
333
+ "Client segmentation criteria for template customization",
334
+ "Standardized boilerplate sections (e.g., methodology, credentials)"
335
+ ]
336
+ }
337
+ },
338
+ {
339
+ "scenario_id": "company.brand_core_brand_alignment_initiative_742",
340
+ "conversation": [
341
+ {
342
+ "role": "user",
343
+ "content": "Building on our earlier collaboration around brand voice guidelines, we need to ensure these principles permeate the AI's dialogue patterns in the revised playbooks. The team wants approachable expertise reflected in every interaction."
344
+ },
345
+ {
346
+ "role": "assistant",
347
+ "content": "Understood. We'll embed the voice guidelines into response templates and micro-interactions. Should we prioritize consistency over pedagogical flexibility in edge cases?"
348
+ },
349
+ {
350
+ "role": "user",
351
+ "content": "Our core value of 'Learner-Centric Adaptive Pathways' must drive content personalization. The AI needs to dynamically adjust difficulty based on engagement signals\u2014not just predefined metrics."
352
+ },
353
+ {
354
+ "role": "assistant",
355
+ "content": "Noted. Could you clarify how the 'Scaffolded Mastery Framework' operationalizes this? What specific implementation examples should we reference for incremental progression?"
356
+ },
357
+ {
358
+ "role": "user",
359
+ "content": "Right, the framework starts with foundational concept checks before introducing complexity. For example, in algebra modules, it begins with visual equation balancing before abstract variables. Each mastery gate requires 90% competency before advancing."
360
+ },
361
+ {
362
+ "role": "assistant",
363
+ "content": "Got it\u2014granular progression gates. How does this tie to our USP of 'personalized learning at scale without human tutor costs'? Should accessibility messaging emphasize cost efficiency?"
364
+ },
365
+ {
366
+ "role": "user",
367
+ "content": "Exactly. Position the adaptive pathways as democratizing high-quality education. Highlight that our AI delivers individualized pacing at a fraction of traditional tutoring expenses. That's our accessibility hook."
368
+ },
369
+ {
370
+ "role": "assistant",
371
+ "content": "Confirmed. All brand elements\u2014voice guidelines, adaptive principles, framework details, and USP positioning\u2014are now documented for persistent memory. Ready for playbook integration."
372
+ }
373
+ ],
374
+ "labels": {
375
+ "categories": [
376
+ "company.brand_core",
377
+ "company.strategic_signatures",
378
+ "company.knowledge_artifacts",
379
+ "user.role_context",
380
+ "user.strategic_approach"
381
+ ],
382
+ "persistence_horizon": "long",
383
+ "memory_scope": "mixed",
384
+ "rationale": "Brand values (company.brand_core), teaching methodology (company.strategic_signatures), and playbooks (company.knowledge_artifacts) require long-term storage. User's PM role (user.role_context) and focus on brand-pedagogy balance (user.strategic_approach) provide context for future interactions."
385
+ },
386
+ "metadata": {
387
+ "scenario_type": "brand_alignment_initiative",
388
+ "primary_category": "company.brand_core",
389
+ "distractor_present": false,
390
+ "turn_count": 6,
391
+ "signals_present": [
392
+ "Brand voice guidelines emphasizing approachable expertise",
393
+ "Core value: 'Learner-Centric Adaptive Pathways'",
394
+ "Proprietary teaching methodology: 'Scaffolded Mastery Framework'",
395
+ "Unique value proposition: 'Personalized learning at scale without human tutor costs'"
396
+ ]
397
+ }
398
+ },
399
+ {
400
+ "scenario_id": "company.brand_core_influencer_strategy_refinement_127",
401
+ "conversation": [
402
+ {
403
+ "role": "user",
404
+ "content": "I'm concerned our regional adaptations for Southeast Asia might dilute our heritage narrative. How do we scale 'Timeless Elegance' principles to markets like Tokyo without losing that three-generation formulation expertise story?"
405
+ },
406
+ {
407
+ "role": "assistant",
408
+ "content": "Absolutely, that tension is real. Remember how our Parisian activation maintained 98% visual adherence while incorporating local artisans? We could apply similar 'craftsmanship story modules'\u2014pre-approved heritage highlights paired with localized rituals. This kept our core identity intact while resonating regionally."
409
+ },
410
+ {
411
+ "role": "user",
412
+ "content": "For Tokyo specifically, what concrete vetting criteria should we use for micro-influencers? We need more than just follower counts."
413
+ },
414
+ {
415
+ "role": "assistant",
416
+ "content": "Agreed. Prioritize creators with \u226570% affluent beauty connoisseur audiences, verified by our partner platform. Require content proposals to weave in formulation history\u2014like highlighting our cold-process extraction methods. We\u2019ll also mandate our new visual compliance toolkit for 95% guideline adherence tracking."
417
+ },
418
+ {
419
+ "role": "user",
420
+ "content": "The compliance toolkit is non-negotiable. We\u2019ll need persistent tracking of that 95% threshold across all activations. Also, remind me\u2014where\u2019s the central repository for approved heritage assets?"
421
+ },
422
+ {
423
+ "role": "assistant",
424
+ "content": "Understood. We\u2019ll bake compliance dashboards into every campaign brief. Approved assets live in the Brand Legacy Vault under 'Formulation Archives'. To summarize our strategy: 1) Heritage modules for local adaptation, 2) Strict audience/content vetting, 3) Real-time compliance tracking\u2014all anchored in our 'uncompromising craftsmanship' pillar. Shall I document this in the playbook?"
425
+ }
426
+ ],
427
+ "labels": {
428
+ "categories": [
429
+ "company.brand_core",
430
+ "company.strategic_signatures",
431
+ "company.performance_context",
432
+ "company.business_priorities",
433
+ "company.knowledge_artifacts"
434
+ ],
435
+ "persistence_horizon": "mixed",
436
+ "memory_scope": "mixed",
437
+ "rationale": "Brand heritage principles (company.brand_core) and influencer criteria (company.strategic_signatures) require long-term persistence, while compliance metrics (company.performance_context) and campaign tools (company.business_priorities) are shorter-term. Playbook documentation (company.knowledge_artifacts) bridges both."
438
+ },
439
+ "metadata": {
440
+ "scenario_type": "influencer_strategy_refinement",
441
+ "primary_category": "company.brand_core",
442
+ "distractor_present": true,
443
+ "turn_count": 6,
444
+ "signals_present": [
445
+ "Brand pillar: 'Uncompromising artisanal craftsmanship'",
446
+ "Mandatory influencer criterion: 70%+ audience alignment with 'affluent beauty connoisseurs' segment",
447
+ "Campaign consistency metric: 95% adherence to visual identity guidelines",
448
+ "Heritage narrative: 'Three-generation family formulation expertise'"
449
+ ]
450
+ }
451
+ },
452
+ {
453
+ "scenario_id": "company_brand_core_742",
454
+ "conversation": [
455
+ {
456
+ "role": "user",
457
+ "content": "Pull up the Q3 vendor reliability data \u2013 I need to compare it against the Project Helix baseline from Q2. Specifically, how are on-time deliveries trending?"
458
+ },
459
+ {
460
+ "role": "assistant",
461
+ "content": "Against the Q2 Project Helix benchmarks, Q3 shows a 12% increase in delivery delays with Vendor Alpha. Their on-time rate dropped to 88% versus the 95% we achieved under Helix. Vendors Beta and Gamma are holding steady at 94% and 97% respectively. Want me to drill into Alpha's specific pain points?"
462
+ },
463
+ {
464
+ "role": "user",
465
+ "content": "Yes, but first \u2013 how does this 12-point gap directly impact our 'Reliability' brand pillar? And remind me of their transparency performance from our earlier discussions."
466
+ },
467
+ {
468
+ "role": "assistant",
469
+ "content": "Right, the delivery consistency feeds directly into customer trust for our demand-response programs. On transparency, Vendor Alpha scored 68% in Q2's communication benchmarks \u2013 remember the delayed shipment notifications we flagged? Their predictive ETAs were off by 3.2 days on average. That historical pattern suggests systemic issues beyond just Q3 logistics."
470
+ },
471
+ {
472
+ "role": "user",
473
+ "content": "Exactly. We need to revise the scoring model to weight predictive analytics heavier than historical averages. Let's emphasize real-time tracking data for Q4 \u2013 I'll draft a proposal. Also, confirm our next sync date."
474
+ },
475
+ {
476
+ "role": "assistant",
477
+ "content": "Got it. To recap: 1) Revise vendor scoring with 40% weight on predictive analytics, 2) Formalize real-time tracking requirements by Friday, 3) Maintain bi-weekly syncs \u2013 next on October 12th. This builds on our Q2 transparency benchmarks and aligns with the 'Reliability' pillar's 95% on-time target. Anything else before I distribute the summary?"
478
+ }
479
+ ],
480
+ "labels": {
481
+ "categories": [
482
+ "company.brand_core",
483
+ "company.knowledge_artifacts",
484
+ "company.performance_context",
485
+ "user.workflow_patterns"
486
+ ],
487
+ "persistence_horizon": "mixed",
488
+ "memory_scope": "mixed",
489
+ "rationale": "Brand pillars (company.brand_core) and Project Helix model (company.knowledge_artifacts) require long-term retention. Vendor metrics (company.performance_context) need rolling context. Bi-weekly syncs (user.workflow_patterns) reflect medium-term user patterns."
490
+ },
491
+ "metadata": {
492
+ "scenario_type": "collaborative_initiative",
493
+ "primary_category": "company.brand_core",
494
+ "distractor_present": false,
495
+ "turn_count": 6,
496
+ "signals_present": [
497
+ "Project Helix vendor scoring model",
498
+ "Reliability brand pillar tied to on-time delivery",
499
+ "Vendor transparency benchmarks from Q2",
500
+ "Bi-weekly sync commitment"
501
+ ]
502
+ }
503
+ },
504
+ {
505
+ "scenario_id": "company_strategic_signatures_742",
506
+ "conversation": [
507
+ {
508
+ "role": "user",
509
+ "content": "Double-check the Japan visual assets against Project Phoenix specs\u2014especially the hero banner gradients. Legal\u2019s nervous about cultural missteps."
510
+ },
511
+ {
512
+ "role": "assistant",
513
+ "content": "Confirmed compliant with Project Phoenix 2.1 guidelines, but the crimson-to-gold gradient in Frame 3 deviates from our approved palette by 12%. Should we flag this for revision?"
514
+ },
515
+ {
516
+ "role": "user",
517
+ "content": "Right, the Mexico campaign last quarter bent the rules on typography but killed it locally. Does Japan get similar flexibility under the Unified Brand Pillars?"
518
+ },
519
+ {
520
+ "role": "assistant",
521
+ "content": "Version 4.2 of the Pillars allows regional adaptations if metrics justify\u2014Mexico\u2019s engagement lift was +28%. For Japan, we\u2019d need comparable data to justify the gradient shift. I\u2019ll attach the flexibility clause matrix."
522
+ },
523
+ {
524
+ "role": "user",
525
+ "content": "Brazil team\u2019s proposed tagline tweak for Episode 4 feels\u2026 edgy. What\u2019s the escalation path if they push back on our feedback?"
526
+ },
527
+ {
528
+ "role": "assistant",
529
+ "content": "Trigger Crimson Protocol for high-stakes drops. I\u2019ll draft a memo to Legal and Regional Leads citing Phoenix parameters, then schedule a sync within 24 hours. We\u2019ve used this workflow for 3 prior drops this quarter."
530
+ }
531
+ ],
532
+ "labels": {
533
+ "categories": [
534
+ "company.brand_core",
535
+ "company.strategic_signatures",
536
+ "company.knowledge_artifacts",
537
+ "company.business_priorities",
538
+ "user.role_context",
539
+ "user.workflow_patterns"
540
+ ],
541
+ "persistence_horizon": "mixed",
542
+ "memory_scope": "mixed",
543
+ "rationale": "Project Phoenix/Unified Brand Pillars (long-term artifacts), Crimson Protocol (strategic signature), and regional adaptation rules (brand core) require persistent storage. Campaign metrics and escalation workflows reflect evolving priorities and user-specific patterns."
544
+ },
545
+ "metadata": {
546
+ "scenario_type": "brand_consistency_planning",
547
+ "primary_category": "company.strategic_signatures",
548
+ "distractor_present": false,
549
+ "turn_count": 6,
550
+ "signals_present": [
551
+ "Project Phoenix",
552
+ "Unified Brand Pillars",
553
+ "Crimson Protocol",
554
+ "cultural adaptation balance"
555
+ ]
556
+ }
557
+ },
558
+ {
559
+ "scenario_id": "company.brand_core_demand_response_456",
560
+ "conversation": [
561
+ {
562
+ "role": "user",
563
+ "content": "We need to reinforce our 'Reliability First' positioning in the new demand-response collateral for commercial clients. How can we thread that through the value prop without overshadowing cost savings?"
564
+ },
565
+ {
566
+ "role": "assistant",
567
+ "content": "Absolutely. Remember how we structured the 'PeakPartner' framework last quarter? That balanced operational benefits with financial incentives. We could adapt that approach here\u2014lead with grid stability as the foundational promise, then layer in cost efficiencies as a natural outcome of our reliability."
568
+ },
569
+ {
570
+ "role": "user",
571
+ "content": "Makes sense. But commercial audiences will want proof points. How do we quantify 'reliability' beyond just stating it as a value?"
572
+ },
573
+ {
574
+ "role": "assistant",
575
+ "content": "Our grid stability metrics are perfect for this. We've maintained 99.8% uptime across commercial nodes since 2022\u2014that's industry-leading. We should feature that prominently, maybe with a regional comparison chart to contextualize the advantage."
576
+ },
577
+ {
578
+ "role": "user",
579
+ "content": "Uptime stats are good, but competitors can claim similar numbers. What technical validation can we add to differentiate?"
580
+ },
581
+ {
582
+ "role": "assistant",
583
+ "content": "Our proprietary 'EnergyPulse' algorithm is key here. It's what enables predictive load balancing with 92% accuracy. We could highlight how this technology actively prevents disruptions\u2014tying directly back to the 'Reliability First' promise while showcasing our innovation edge."
584
+ }
585
+ ],
586
+ "labels": {
587
+ "categories": [
588
+ "company.brand_core",
589
+ "company.strategic_signatures",
590
+ "company.performance_context",
591
+ "company.knowledge_artifacts"
592
+ ],
593
+ "persistence_horizon": "mixed",
594
+ "memory_scope": "company",
595
+ "rationale": "Brand pillar ('Reliability First') and strategic framework ('PeakPartner') represent enduring company identity and decision heuristics. Grid metrics and proprietary tech are performance assets and knowledge artifacts with longer-term value."
596
+ },
597
+ "metadata": {
598
+ "scenario_type": "demand_response_campaign",
599
+ "primary_category": "company.brand_core",
600
+ "distractor_present": false,
601
+ "turn_count": 6,
602
+ "signals_present": [
603
+ "Company's 'Reliability First' brand pillar",
604
+ "Reference to 'PeakPartner' campaign",
605
+ "99.8% grid uptime metric",
606
+ "'EnergyPulse' forecasting algorithm"
607
+ ]
608
+ }
609
+ },
610
+ {
611
+ "scenario_id": "company.brand_core_collaboration_147",
612
+ "conversation": [
613
+ {
614
+ "role": "user",
615
+ "content": "Wait, before we dive into the onboarding flows, could you recap our core brand principles? I want to ensure the new campaign messaging stays consistent with what we established last quarter."
616
+ },
617
+ {
618
+ "role": "assistant",
619
+ "content": "Absolutely. Our brand mission remains 'Empower developers to build without boundaries'. The core value proposition centers on seamless integration reducing toolchain complexity by 40%, directly addressing developer frustration with fragmented debugging workflows. Our reliability pillar\u2014'Uncompromising reliability for high-stakes coding environments'\u2014should anchor all technical narratives."
620
+ },
621
+ {
622
+ "role": "user",
623
+ "content": "Right, that reliability piece feels crucial for the hackathon sponsorship we're considering. How can we embed these principles into that activation without feeling forced?"
624
+ },
625
+ {
626
+ "role": "assistant",
627
+ "content": "For the hackathon, we could structure challenges around 'building without boundaries'\u2014like cross-platform integration tasks. And reliability messaging should live in the swag: think high-quality hardware tools with 'Uncompromising reliability' engraved. That reinforces the pillar tangibly."
628
+ },
629
+ {
630
+ "role": "user",
631
+ "content": "That makes sense. Now for the onboarding flows\u2014show me how the brand voice would manifest in actual copy. Something developers would find authentic, not marketing-heavy."
632
+ },
633
+ {
634
+ "role": "assistant",
635
+ "content": "Here's a neutral-toned example for the integration setup step: 'Connect your existing toolchain in 3 steps. Our unified API layer handles the heavy lifting\u2014so you can focus on code, not configuration. Designed for 99.99% uptime in production environments.' It emphasizes seamlessness and reliability without hyperbole."
636
+ },
637
+ {
638
+ "role": "user",
639
+ "content": "Perfect. Let's lock that direction in. Oh, and I'll need the final copy by EOD Friday\u2014my team's reviewing everything before the sprint starts. Almost forgot, did you see the new competitor benchmarks? Their onboarding takes 45 minutes..."
640
+ }
641
+ ],
642
+ "labels": {
643
+ "categories": [
644
+ "company.brand_core",
645
+ "company.strategic_signatures",
646
+ "company.knowledge_artifacts",
647
+ "user.workflow_patterns",
648
+ "none"
649
+ ],
650
+ "persistence_horizon": "mixed",
651
+ "memory_scope": "mixed",
652
+ "rationale": "Brand mission/UVP/reliability pillar (company.brand_core) and hackathon strategy (company.strategic_signatures) require long-term storage. Onboarding copy examples (company.knowledge_artifacts) and user's review deadline (user.workflow_patterns) are medium/short-term. Competitor mention is irrelevant (none)."
653
+ },
654
+ "metadata": {
655
+ "scenario_type": "brand_strategy_collaboration",
656
+ "primary_category": "company.brand_core",
657
+ "distractor_present": false,
658
+ "turn_count": 7,
659
+ "signals_present": [
660
+ "Empower developers to build without boundaries",
661
+ "Seamless integration reduces toolchain complexity by 40%",
662
+ "Developer frustration with fragmented debugging workflows",
663
+ "Uncompromising reliability for high-stakes coding environments"
664
+ ]
665
+ }
666
+ },
667
+ {
668
+ "scenario_id": "company.brand_core_messaging_alignment_123",
669
+ "conversation": [
670
+ {
671
+ "role": "user",
672
+ "content": "So revisiting that 'skills gap' framing we discussed earlier\u2014I think we need to sharpen it further. The board loves the direction but wants to ensure we're not diluting our core message when we talk about underserved communities."
673
+ },
674
+ {
675
+ "role": "assistant",
676
+ "content": "Understood. Building on our previous emphasis on measurable outcomes, perhaps we could reframe around specific skill acquisition rates? For instance, 'Every $50 donation funds 20 hours of certified job training'\u2014this keeps us tied to practical empowerment while addressing the gap quantifiably."
677
+ },
678
+ {
679
+ "role": "user",
680
+ "content": "That's solid. What about this test slogan: 'Building futures, one green job at a time'? It feels modern and hopeful. Wait, does 'green' pose any issues?"
681
+ },
682
+ {
683
+ "role": "assistant",
684
+ "content": "Careful with 'green'\u2014it risks drifting toward environmental themes, which we agreed to avoid. Our brand pillar is 'empowerment through practical skills', not sustainability. Let's pivot to something like 'Building futures through skilled hands' to maintain vocational focus."
685
+ },
686
+ {
687
+ "role": "user",
688
+ "content": "Got it. How should we differentiate this for corporate CSR teams versus individual donors? They consume messaging so differently."
689
+ },
690
+ {
691
+ "role": "assistant",
692
+ "content": "For CSR departments, lead with workforce pipeline impact: 'Reduce local skills shortages by partnering with SkillBridge'. Individuals respond better to human-scale outcomes\u2014like our Q1 campaign highlighting how Maria's welding certification led to a $15k salary increase. Remember, our community impact metrics showed 40% higher engagement when we tied donations to specific skill outcomes."
693
+ },
694
+ {
695
+ "role": "user",
696
+ "content": "Perfect. Let's lock in that direction. And maybe include the metric about 92% job placement within 6 months\u2014it's our strongest proof point."
697
+ },
698
+ {
699
+ "role": "assistant",
700
+ "content": "Confirmed. Final messaging will emphasize practical skills empowerment, avoid environmental references, and segment messaging using proven impact metrics. I'll draft the revised brief accordingly."
701
+ }
702
+ ],
703
+ "labels": {
704
+ "categories": [
705
+ "company.brand_core",
706
+ "company.strategic_signatures",
707
+ "company.performance_context",
708
+ "user.strategic_approach",
709
+ "user.session_history"
710
+ ],
711
+ "persistence_horizon": "mixed",
712
+ "memory_scope": "mixed",
713
+ "rationale": "Brand core (long-term) and strategic signatures (long-term) are central to messaging alignment. Performance context (rolling) supports metric-based claims. User's strategic approach (long-term) and session history (short-term) reflect their focus on measurable outcomes and ongoing discussion continuity."
714
+ },
715
+ "metadata": {
716
+ "scenario_type": "messaging_refinement",
717
+ "primary_category": "company.brand_core",
718
+ "distractor_present": false,
719
+ "turn_count": 8,
720
+ "signals_present": [
721
+ "Core brand pillar: 'Empowerment through practical skills'",
722
+ "Target donor segments: individual philanthropists (ages 35-65) and CSR departments",
723
+ "Key messaging constraint: avoid referencing climate/environmental themes",
724
+ "Previous collaboration note: established 'community impact metrics' as primary success indicator"
725
+ ]
726
+ }
727
+ },
728
+ {
729
+ "scenario_id": "company.brand_core_campaign_456",
730
+ "conversation": [
731
+ {
732
+ "role": "user",
733
+ "content": "Wait, building on our earlier discussion about the 'community trust' pillar\u2014how specifically should we articulate safety in driver emails without sounding redundant? We need concrete actions, not just slogans."
734
+ },
735
+ {
736
+ "role": "assistant",
737
+ "content": "Understood. Since safety is non-negotiable for our brand, we could highlight initiatives like mandatory quarterly safety training refreshers and the new collision detection feature in the driver app. These demonstrate proactive investment in driver well-being, reinforcing our core value. Does that align with the playbook framework we approved last month?"
738
+ },
739
+ {
740
+ "role": "user",
741
+ "content": "Perfect. Now shifting to reliability messaging for high-volume drivers\u2014they're obsessed with consistent earnings. How do we tie that to brand values without overpromising?"
742
+ },
743
+ {
744
+ "role": "assistant",
745
+ "content": "For that segment, we should emphasize our dynamic pricing algorithms ensuring stable earnings during peak hours, plus the guaranteed hourly minimums rolling out in Q3. This connects reliability directly to their livelihood concerns. Remember, the phased rollout starts with Tier 1 cities in July, so we'll need geo-targeted email digests."
746
+ },
747
+ {
748
+ "role": "user",
749
+ "content": "Got it. But how do we operationalize 'community' without leaning on eco-initiatives? Tenured drivers especially might see that as superficial."
750
+ },
751
+ {
752
+ "role": "assistant",
753
+ "content": "Valid point. Instead, we're proposing peer-to-peer recognition programs where drivers nominate colleagues for 'Reliability Hero' awards, featured in hub materials. This builds organic community while reinforcing our pillars. The nomination criteria will live in the updated driver playbook."
754
+ },
755
+ {
756
+ "role": "user",
757
+ "content": "Right. And just to confirm\u2014we're still locked into the Q3 timeline? Creative teams are stressing about physical hub material production lead times."
758
+ },
759
+ {
760
+ "role": "assistant",
761
+ "content": "Yes, the phased schedule holds: in-app notifications launch July 15, emails follow August 1, and hub materials hit 20+ cities by September. We'll prioritize digital-first for tenured drivers since they engage more with the app, per our segmentation strategy."
762
+ }
763
+ ],
764
+ "labels": {
765
+ "categories": [
766
+ "company.brand_core",
767
+ "company.strategic_signatures",
768
+ "company.business_priorities",
769
+ "company.knowledge_artifacts",
770
+ "user.role_context"
771
+ ],
772
+ "persistence_horizon": "mixed",
773
+ "memory_scope": "mixed",
774
+ "rationale": "Brand pillars (safety/reliability) and strategic frameworks (campaign pillars) require long-term company memory. Q3 timelines and playbook updates are short-term business priorities. The user's focus on segmentation aligns with their role context."
775
+ },
776
+ "metadata": {
777
+ "scenario_type": "proposal_standardization",
778
+ "primary_category": "company.brand_core",
779
+ "distractor_present": false,
780
+ "turn_count": 8,
781
+ "signals_present": [
782
+ "safety/reliability pillars",
783
+ "Q3 phased rollout",
784
+ "driver segmentation",
785
+ "approved channels"
786
+ ]
787
+ }
788
+ },
789
+ {
790
+ "scenario_id": "company.brand_core_proposal_template_refinement_142",
791
+ "conversation": [
792
+ {
793
+ "role": "user",
794
+ "content": "Let's tighten the executive summary section. We need to emphasize our Trust pillar more explicitly\u2014reference the Q2 healthcare proposal's opening structure but align it with financial services regulations. Keep it concise."
795
+ },
796
+ {
797
+ "role": "assistant",
798
+ "content": "Understood. I'll mirror the healthcare proposal's three-sentence hook structure but reframe it around financial compliance. Our brand's Trust pillar language from the guidelines will anchor the messaging. Drafting now..."
799
+ },
800
+ {
801
+ "role": "user",
802
+ "content": "Good start, but the value proposition paragraph feels generic. Bake in our regulatory compliance expertise more boldly\u2014that's the client's primary pain point. And watch the tone; it's leaning too formal for 'confident yet approachable'."
803
+ },
804
+ {
805
+ "role": "assistant",
806
+ "content": "Adjusting tone to match our approved voice guidelines. I've sharpened the compliance angle: 'Navigating complex regulations requires more than expertise\u2014it demands proactive partnership.' Does that resonate?"
807
+ },
808
+ {
809
+ "role": "user",
810
+ "content": "Better. Now integrate the Partnership pillar into the methodology section without diluting the financial focus. Use bullet points like we did in the healthcare template's Phase 2 breakdown."
811
+ },
812
+ {
813
+ "role": "assistant",
814
+ "content": "Integrated Partnership pillar through co-creation workshops and governance frameworks. Maintained financial specificity with examples like 'joint regulatory strategy sessions'. Final draft attached\u2014ready for your approval queue?"
815
+ }
816
+ ],
817
+ "labels": {
818
+ "categories": [
819
+ "company.brand_core",
820
+ "company.knowledge_artifacts",
821
+ "company.business_priorities",
822
+ "user.communication_style"
823
+ ],
824
+ "persistence_horizon": "mixed",
825
+ "memory_scope": "mixed",
826
+ "rationale": "Brand pillars (Trust/Partnership) and voice guidelines fall under company.brand_core (long persistence). Proposal templates reference company.knowledge_artifacts (long). Client-specific compliance focus ties to company.business_priorities (short). User's tone feedback touches user.communication_style (long)."
827
+ },
828
+ "metadata": {
829
+ "scenario_type": "proposal_template_refinement",
830
+ "primary_category": "company.brand_core",
831
+ "distractor_present": false,
832
+ "turn_count": 6,
833
+ "signals_present": [
834
+ "Firm's three-pillar brand framework (Trust, Innovation, Partnership)",
835
+ "Client-specific value proposition: 'Regulatory compliance expertise'",
836
+ "Approved brand voice guideline: 'Confident yet approachable'",
837
+ "Previous collaboration reference: Q2 2025 healthcare proposal structure"
838
+ ]
839
+ }
840
+ }
841
+ ]
synthetic_data/test_balanced.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick test of balanced generation for underrepresented categories."""

import json
import os

from dotenv import load_dotenv

load_dotenv()

import cohere

client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))

# Categories that were underrepresented in earlier generation runs.
test_categories = ["company.tools_config", "company.knowledge_artifacts", "none"]


def _build_prompt(category: str) -> str:
    """Return the generation prompt for *category* ('none' asks for a no-memory chat)."""
    if category == "none":
        return """Generate a marketing conversation that has NO long-term memory value.

The conversation should be transactional, vague, or temporary.
Examples: checking status, scheduling, confirming receipt.

Generate 4 turns. Start mid-conversation (no greetings).

OUTPUT FORMAT (JSON only):
{
"scenario_id": "none_001",
"conversation": [
{"role": "user", "content": "..."},
{"role": "assistant", "content": "..."}
],
"labels": {
"categories": ["none"],
"rationale": "..."
}
}"""
    return f"""Generate a marketing conversation that clearly demonstrates: {category}

The conversation MUST contain clear signals for this category.
4-6 turns, start mid-conversation (no greetings).

CRITICAL: The categories array MUST include "{category}".

OUTPUT FORMAT (JSON only):
{{
"scenario_id": "{category.replace('.', '_')}_001",
"conversation": [
{{"role": "user", "content": "..."}},
{{"role": "assistant", "content": "..."}}
],
"labels": {{
"categories": ["{category}"],
"rationale": "..."
}}
}}"""


for category in test_categories:
    print("\n" + "=" * 60)
    print(f"Testing: {category}")
    print("=" * 60)

    try:
        reply = client.chat(
            messages=[{"role": "user", "content": _build_prompt(category)}],
            temperature=0.7,
            model="command-r-plus-08-2024",
            response_format={"type": "json_object"},
        )

        parsed = json.loads(reply.message.content[0].text)

        # Did the model actually label the conversation with the target category?
        got = parsed.get("labels", {}).get("categories", [])
        print(f"Target: {category}")
        print(f"Output: {got}")
        print(f"Match: {'YES' if category in got else 'NO'}")

        if parsed.get("conversation"):
            print(f"First turn: {parsed['conversation'][0]['content'][:80]}...")
    except Exception as e:
        print(f"Error: {e}")
81
+
synthetic_data/test_connection.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cohere
3
+ import time
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
def test_connection():
    """Smoke-test the Cohere API key with a single reasoning-model chat call."""
    key = os.getenv("COHERE_API_KEY")
    if not key:
        print("Error: COHERE_API_KEY not found.")
        return

    # Show only the edges of the key so logs never leak the secret.
    print(f"Testing connection with API Key: {key[:4]}...{key[-4:]}")
    co = cohere.ClientV2(api_key=key)

    print("Sending request to command-a-reasoning-08-2025...")
    started = time.time()
    try:
        resp = co.chat(
            messages=[{"role": "user", "content": "Say 'Hello, World!'"}],
            model="command-a-reasoning-08-2025",
            thinking={"type": "enabled"},
            temperature=0.7,
        )
        print(f"Response received in {time.time() - started:.2f}s")
        print("Response object:", resp)

        # Surface any plain-text blocks from the structured response payload.
        if hasattr(resp, 'message') and resp.message.content:
            for part in resp.message.content:
                if part.type == 'text':
                    print(f"Text content: {part.text}")
    except Exception as e:
        print(f"Error: {e}")
+ print(f"Error: {e}")
37
+
38
# Allow running the connectivity check directly as a script.
if __name__ == "__main__":
    test_connection()
40
+
synthetic_data/test_diverse.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick test of diverse generation with high temperature."""
import json
import random
import os

from dotenv import load_dotenv

load_dotenv()

import cohere

client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))

INDUSTRIES = ["fintech startup", "healthcare SaaS", "e-commerce fashion"]
STARTERS = ["So about", "Following up on", "I've been thinking about"]

# (category, industry, role) triples exercised with HIGH temperature.
test_cases = [
    ("company.tools_config", "fintech startup", "growth hacker"),
    ("user.communication_style", "healthcare SaaS", "CMO"),
    ("none", "e-commerce fashion", "marketing manager"),
]

for category, industry, role in test_cases:
    # Draw the opener first, then the turn budget, so each case varies in shape.
    starter = random.choice(STARTERS)
    turns = random.randint(3, 6)

    if category == "none":
        prompt = f"""Create a UNMEMORABLE conversation between a {role} at a {industry} and AI.
Purely transactional - status check, scheduling, confirmation. NO specific details.
{turns} turns. Start with "{starter}..."
Return JSON: {{"conversation": [...], "labels": {{"categories": ["none"]}}}}"""
    else:
        prompt = f"""Create a marketing conversation for a {role} at a {industry}.
Must demonstrate: {category}
{turns} turns. Start with "{starter}..."
Be SPECIFIC with realistic details unique to {industry}.
Return JSON: {{"conversation": [...], "labels": {{"categories": ["{category}"]}}}}"""

    reply = client.chat(
        messages=[{"role": "user", "content": prompt}],
        temperature=0.95,
        model="command-r-plus-08-2024",
        response_format={"type": "json_object"},
    )

    parsed = json.loads(reply.message.content[0].text)

    print("\n" + "=" * 60)
    print(f"Category: {category} | Industry: {industry}")
    print(f"Output categories: {parsed.get('labels', {}).get('categories', [])}")
    turns_out = parsed.get("conversation", [])
    if turns_out:
        opener = turns_out[0]
        # The model sometimes returns plain strings instead of role/content dicts.
        if isinstance(opener, dict):
            print(f"First turn: {opener.get('content', '')[:120]}...")
        else:
            print(f"First turn: {str(opener)[:120]}...")
58
+
synthetic_data/test_pipeline.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import json
3
+ import os
4
+ from synthetic_data.pipeline import SyntheticDataPipeline
5
+ from unittest.mock import MagicMock, patch
6
+
7
+ class TestSyntheticDataPipeline(unittest.TestCase):
8
+
9
+ @patch('synthetic_data.pipeline.cohere.ClientV2')
10
+ def test_pipeline_structure(self, mock_client):
11
+ # Setup mock response
12
+ mock_instance = mock_client.return_value
13
+
14
+ # Mock scenario generation response
15
+ scenario_block = MagicMock()
16
+ scenario_block.type = "text"
17
+ scenario_block.text = json.dumps({
18
+ "scenario_description": "Test scenario",
19
+ "user_profile": "Test user",
20
+ "key_signals_to_include": ["signal1"],
21
+ "distractor_signals": [],
22
+ "suggested_turn_breakdown": "Test breakdown"
23
+ })
24
+ mock_scenario_response = MagicMock()
25
+ mock_scenario_response.message.content = [scenario_block]
26
+
27
+ # Mock conversation generation response
28
+ conv_block = MagicMock()
29
+ conv_block.type = "text"
30
+ conv_block.text = json.dumps({
31
+ "scenario_id": "test_id_001",
32
+ "conversation": [
33
+ {"role": "user", "content": "test"},
34
+ {"role": "assistant", "content": "response"}
35
+ ],
36
+ "labels": {
37
+ "categories": ["company.brand_core"],
38
+ "persistence_horizon": "long",
39
+ "memory_scope": "company",
40
+ "rationale": "test rationale"
41
+ },
42
+ "metadata": {
43
+ "scenario_type": "test",
44
+ "primary_category": "company.brand_core",
45
+ "distractor_present": False,
46
+ "turn_count": 2,
47
+ "signals_present": ["signal1"]
48
+ }
49
+ })
50
+ mock_conv_response = MagicMock()
51
+ mock_conv_response.message.content = [conv_block]
52
+
53
+ mock_instance.chat.side_effect = [mock_scenario_response, mock_conv_response]
54
+
55
+ # Run pipeline with dummy key
56
+ pipeline = SyntheticDataPipeline(api_key="dummy_key")
57
+ results = pipeline.run_batch(count=1)
58
+
59
+ self.assertEqual(len(results), 1)
60
+ self.assertIn('scenario_id', results[0])
61
+ self.assertIn('conversation', results[0])
62
+ self.assertIn('labels', results[0])
63
+ self.assertEqual(results[0]['labels']['categories'], ["company.brand_core"])
64
+
65
+ def test_conversation_structure(self):
66
+ # Validate structure of a sample output
67
+ sample_data = {
68
+ "scenario_id": "brand_core_test_001",
69
+ "conversation": [
70
+ {"role": "user", "content": "test"},
71
+ {"role": "assistant", "content": "response"}
72
+ ],
73
+ "labels": {
74
+ "categories": ["company.brand_core"],
75
+ "persistence_horizon": "long",
76
+ "memory_scope": "company",
77
+ "rationale": "test"
78
+ },
79
+ "metadata": {
80
+ "scenario_type": "test",
81
+ "primary_category": "company.brand_core",
82
+ "distractor_present": False,
83
+ "turn_count": 2,
84
+ "signals_present": []
85
+ }
86
+ }
87
+
88
+ # Check required fields
89
+ required_fields = ['scenario_id', 'conversation', 'labels', 'metadata']
90
+ for field in required_fields:
91
+ self.assertIn(field, sample_data)
92
+
93
+ # Check label structure
94
+ label_fields = ['categories', 'persistence_horizon', 'memory_scope', 'rationale']
95
+ for field in label_fields:
96
+ self.assertIn(field, sample_data['labels'])
97
+
98
+ if __name__ == '__main__':
99
+ unittest.main()
100
+
synthetic_data/training_dataset_1000.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/validate.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from collections import Counter
4
+ from typing import Dict, List, Any
5
+
6
def validate_synthetic_data(filepath: str) -> Dict[str, Any]:
    """Validate synthetic data quality based on the PRD guidelines.

    Reads *filepath* as either a single JSON array or JSONL (one object per
    line) and computes distribution metrics plus quality warnings.

    Returns a dict with keys: total_examples, category_distribution,
    multi_label_frequency, avg_turns_per_conversation,
    persistence_distribution, scope_distribution, warnings. On failure a
    dict containing only an 'error' key is returned instead of raising.
    """
    try:
        # Explicit UTF-8: JSON is UTF-8 by spec, and the platform default
        # encoding (e.g. cp1252 on Windows) can corrupt or reject the read.
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        # Accept both a single JSON array and line-delimited JSONL.
        if content.startswith('[') and content.endswith(']'):
            data = json.loads(content)
        else:
            data = [json.loads(line) for line in content.split('\n') if line.strip()]
    except json.JSONDecodeError as e:
        return {'error': f"Invalid JSON format: {e}"}
    except Exception as e:
        return {'error': f"Error reading file: {e}"}

    if not data:
        return {'error': "Empty dataset"}

    # Category distribution across all (possibly multi-label) examples.
    all_categories = []
    for item in data:
        if 'labels' in item and 'categories' in item['labels']:
            all_categories.extend(item['labels']['categories'])
    category_dist = Counter(all_categories)

    # Fraction of examples that carry more than one category label.
    multi_label_count = sum(1 for item in data
                            if 'labels' in item and 'categories' in item['labels']
                            and len(item['labels']['categories']) > 1)
    multi_label_freq = multi_label_count / len(data) if len(data) > 0 else 0

    # Average conversation length; a missing turn_count counts as 0.
    turn_counts = [item['metadata'].get('turn_count', 0) for item in data if 'metadata' in item]
    avg_turns = sum(turn_counts) / len(turn_counts) if turn_counts else 0

    # Persistence-horizon and memory-scope distributions over labeled items.
    persistence_dist = Counter(item['labels'].get('persistence_horizon', 'unknown')
                               for item in data if 'labels' in item)
    scope_dist = Counter(item['labels'].get('memory_scope', 'unknown')
                         for item in data if 'labels' in item)

    return {
        'total_examples': len(data),
        'category_distribution': dict(category_dist),
        'multi_label_frequency': multi_label_freq,
        'avg_turns_per_conversation': avg_turns,
        'persistence_distribution': dict(persistence_dist),
        'scope_distribution': dict(scope_dist),
        'warnings': _generate_warnings(category_dist, multi_label_freq, avg_turns, len(data))
    }


def _generate_warnings(cat_dist, ml_freq, avg_turns, total_count):
    """Return human-readable quality warnings for the computed metrics.

    Thresholds come from the PRD targets: >=5% share per category,
    20-25% multi-label frequency, and 6.5±1.5 turns per conversation.
    """
    warnings = []

    # Flag underrepresented categories only on datasets large enough
    # for the share to be meaningful.
    if total_count > 20:
        total_cats = sum(cat_dist.values())
        for cat, count in cat_dist.items():
            if count / total_cats < 0.05:
                warnings.append(f"Category '{cat}' underrepresented: {count/total_cats:.1%}")

    # Multi-label frequency below the PRD target band.
    if ml_freq < 0.15:
        warnings.append(f"Low multi-label frequency: {ml_freq:.1%} (target: 20-25%)")

    # Conversation length outside the acceptable window.
    if avg_turns < 4 or avg_turns > 10:
        warnings.append(f"Average turns out of range: {avg_turns:.1f} (target: 6.5±1.5)")

    return warnings
77
+
78
# CLI entry point: validate a dataset file and dump the metrics as JSON.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Validate synthetic data quality")
    cli.add_argument("filepath", help="Path to JSON/JSONL file")
    opts = cli.parse_args()
    print(json.dumps(validate_synthetic_data(opts.filepath), indent=2))
85
+
synthetic_data/verify_key.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke test: verify the Cohere API key in the environment actually works."""

import os

import cohere
from dotenv import load_dotenv

load_dotenv()

# NOTE(review): the other scripts in this folder read COHERE_API_KEY;
# confirm that CO_API_KEY is the intended variable name here.
key = os.getenv("CO_API_KEY")

# Bug fix: the original sliced `key[:5]` unconditionally, which raises
# TypeError when the variable is unset (key is None) — the `if key` guard
# only protected the len() half of the message.
if key:
    print(f"Key: {key[:5]}... (len={len(key)})")
else:
    print("Key: None (len=0)")

try:
    client = cohere.ClientV2(api_key=key)
    resp = client.chat(model="command-r-plus", messages=[{"role": "user", "content": "hi"}])
    print("API Connection Success!")
except Exception as e:
    print(f"API Error: {e}")
15
+
training/benchmark.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark: Memory Routing Model Evaluation
3
+
4
+ This script evaluates our trained model against:
5
+ 1. Base model (untrained Llama-3.1-8B)
6
+ 2. Our SFT model
7
+ 3. Our RL model
8
+
9
+ We measure:
10
+ - Classification metrics (F1, precision, recall)
11
+ - Task-specific metrics (temporal alignment, scope parity)
12
+ - Efficiency (tokens generated, latency)
13
+ """
14
+
15
+ import asyncio
16
+ import json
17
+ import time
18
+ import os
19
+ import numpy as np
20
+ from typing import List, Dict, Any, Tuple
21
+ from collections import Counter
22
+ from dataclasses import dataclass
23
+
24
+
25
+ @dataclass
26
+ class BenchmarkConfig:
27
+ base_model: str = "meta-llama/Llama-3.1-8B"
28
+ renderer_name: str = "llama3"
29
+ test_data_path: str = "training/processed_data/test_data.json"
30
+ output_dir: str = "training/benchmarks"
31
+
32
+ # Model checkpoints to evaluate
33
+ sft_checkpoint: str = ""
34
+ rl_checkpoint: str = ""
35
+
36
+
37
+ VALID_CATEGORIES = {
38
+ "company.brand_core", "company.strategic_signatures", "company.knowledge_artifacts",
39
+ "company.business_priorities", "company.tools_config", "company.performance_context",
40
+ "user.communication_style", "user.strategic_approach", "user.role_context",
41
+ "user.workflow_patterns", "user.session_history", "user.interaction_preferences",
42
+ "none"
43
+ }
44
+
45
+ CATEGORY_PERSISTENCE = {
46
+ "company.brand_core": "long", "company.strategic_signatures": "long",
47
+ "company.knowledge_artifacts": "long", "company.business_priorities": "short",
48
+ "company.tools_config": "medium", "company.performance_context": "rolling",
49
+ "user.communication_style": "long", "user.strategic_approach": "long",
50
+ "user.role_context": "medium", "user.workflow_patterns": "medium",
51
+ "user.session_history": "short", "user.interaction_preferences": "evolving",
52
+ "none": "short"
53
+ }
54
+
55
+ SYSTEM_PROMPT = """You route marketing conversations into structured memory categories.
56
+
57
+ Available categories:
58
+ - company.brand_core: Voice, values, positioning
59
+ - company.strategic_signatures: Decision frameworks
60
+ - company.knowledge_artifacts: Docs, style guides
61
+ - company.business_priorities: Quarterly goals, campaigns
62
+ - company.tools_config: Integrations, settings
63
+ - company.performance_context: Campaign metrics
64
+ - user.communication_style: Tone, format expectations
65
+ - user.strategic_approach: Personal priorities
66
+ - user.role_context: Title, scope
67
+ - user.workflow_patterns: Review cadence
68
+ - user.session_history: Recent context
69
+ - user.interaction_preferences: Coaching style
70
+ - none: Irrelevant or transactional
71
+
72
+ Respond with comma-separated categories only. No explanations."""
73
+
74
+
75
+ def parse_prediction(text: str) -> set:
76
+ """Parse model output into category set."""
77
+ if not text:
78
+ return set()
79
+
80
+ categories = set()
81
+ for part in text.split(","):
82
+ cat = part.strip().lower()
83
+ if cat in VALID_CATEGORIES:
84
+ categories.add(cat)
85
+
86
+ # Remove "none" if mixed with others
87
+ if "none" in categories and len(categories) > 1:
88
+ categories.discard("none")
89
+
90
+ return categories
91
+
92
+
93
+ def compute_metrics(predicted: set, gold: set) -> Dict[str, float]:
94
+ """Compute all evaluation metrics for a single example."""
95
+ metrics = {}
96
+
97
+ # Basic classification
98
+ tp = len(predicted & gold)
99
+ metrics["precision"] = tp / len(predicted) if predicted else 0
100
+ metrics["recall"] = tp / len(gold) if gold else 0
101
+ metrics["f1"] = 2 * metrics["precision"] * metrics["recall"] / (metrics["precision"] + metrics["recall"]) if (metrics["precision"] + metrics["recall"]) > 0 else 0
102
+ metrics["exact_match"] = float(predicted == gold)
103
+ metrics["any_match"] = float(tp > 0)
104
+
105
+ # Temporal alignment
106
+ def majority_persistence(cats):
107
+ if not cats:
108
+ return "medium"
109
+ persis = [CATEGORY_PERSISTENCE.get(c, "medium") for c in cats]
110
+ return Counter(persis).most_common(1)[0][0]
111
+
112
+ pred_pers = majority_persistence(predicted)
113
+ gold_pers = majority_persistence(gold)
114
+ metrics["temporal_match"] = float(pred_pers == gold_pers)
115
+
116
+ # Scope parity
117
+ def get_scope(cats):
118
+ scopes = set()
119
+ for c in cats:
120
+ if c.startswith("company."):
121
+ scopes.add("company")
122
+ elif c.startswith("user."):
123
+ scopes.add("user")
124
+ if len(scopes) == 2:
125
+ return "mixed"
126
+ return scopes.pop() if scopes else "none"
127
+
128
+ metrics["scope_match"] = float(get_scope(predicted) == get_scope(gold))
129
+
130
+ # Efficiency
131
+ n = len(predicted)
132
+ metrics["n_categories"] = n
133
+ metrics["efficiency"] = 1.0 if n <= 3 else (0.7 if n == 4 else 0.4)
134
+
135
+ return metrics
136
+
137
+
138
+ async def evaluate_model(
139
+ service_client,
140
+ tokenizer,
141
+ renderer,
142
+ checkpoint: str,
143
+ test_data: List[Dict],
144
+ model_name: str
145
+ ) -> Tuple[Dict, List[Dict]]:
146
+ """Evaluate a single model checkpoint."""
147
+ from tinker import types
148
+
149
+ print(f"\nEvaluating: {model_name}")
150
+ print(f"Checkpoint: {checkpoint}")
151
+
152
+ sampling_client = service_client.create_sampling_client(model_path=checkpoint)
153
+ stop_sequences = renderer.get_stop_sequences()
154
+
155
+ results = []
156
+ latencies = []
157
+
158
+ for i, example in enumerate(test_data):
159
+ gold = set([c.lower() for c in example.get("categories", [])])
160
+ messages = example.get("messages", [])
161
+ prompt_messages = [m for m in messages if m.get("role") != "assistant"]
162
+
163
+ if not prompt_messages:
164
+ continue
165
+
166
+ prompt = renderer.build_generation_prompt(prompt_messages)
167
+ params = types.SamplingParams(max_tokens=50, temperature=0.1, stop=stop_sequences)
168
+
169
+ start_time = time.time()
170
+ result = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1).result()
171
+ latency = time.time() - start_time
172
+ latencies.append(latency)
173
+
174
+ response, success = renderer.parse_response(result.sequences[0].tokens)
175
+ predicted_text = response["content"] if success else ""
176
+ predicted = parse_prediction(predicted_text)
177
+
178
+ metrics = compute_metrics(predicted, gold)
179
+ metrics["gold"] = list(gold)
180
+ metrics["predicted"] = list(predicted)
181
+ metrics["predicted_text"] = predicted_text
182
+ metrics["latency"] = latency
183
+ metrics["format_valid"] = bool(predicted) or predicted_text.strip().lower() == "none"
184
+
185
+ results.append(metrics)
186
+
187
+ if (i + 1) % 50 == 0:
188
+ print(f" Progress: {i + 1}/{len(test_data)}")
189
+
190
+ # Aggregate
191
+ aggregate = {
192
+ "model_name": model_name,
193
+ "checkpoint": checkpoint,
194
+ "n_examples": len(results),
195
+ "f1": np.mean([r["f1"] for r in results]),
196
+ "precision": np.mean([r["precision"] for r in results]),
197
+ "recall": np.mean([r["recall"] for r in results]),
198
+ "exact_match": np.mean([r["exact_match"] for r in results]),
199
+ "any_match": np.mean([r["any_match"] for r in results]),
200
+ "temporal_match": np.mean([r["temporal_match"] for r in results]),
201
+ "scope_match": np.mean([r["scope_match"] for r in results]),
202
+ "efficiency": np.mean([r["efficiency"] for r in results]),
203
+ "format_valid": np.mean([r["format_valid"] for r in results]),
204
+ "mean_latency": np.mean(latencies),
205
+ "p95_latency": np.percentile(latencies, 95),
206
+ }
207
+
208
+ return aggregate, results
209
+
210
+
211
+ async def run_benchmark(config: BenchmarkConfig):
212
+ """Run full benchmark suite."""
213
+ import tinker
214
+ from tinker_cookbook import renderers
215
+ from tinker_cookbook.tokenizer_utils import get_tokenizer
216
+ from dotenv import load_dotenv
217
+ from datetime import datetime
218
+
219
+ load_dotenv()
220
+
221
+ print("=" * 70)
222
+ print("MEMORY ROUTING BENCHMARK")
223
+ print("=" * 70)
224
+
225
+ # Setup
226
+ os.makedirs(config.output_dir, exist_ok=True)
227
+ service_client = tinker.ServiceClient()
228
+ tokenizer = get_tokenizer(config.base_model)
229
+ renderer = renderers.get_renderer(name=config.renderer_name, tokenizer=tokenizer)
230
+
231
+ # Load test data
232
+ with open(config.test_data_path, "r") as f:
233
+ test_data = json.load(f)
234
+
235
+ print(f"Test examples: {len(test_data)}")
236
+
237
+ # Models to evaluate
238
+ models = []
239
+
240
+ if config.sft_checkpoint:
241
+ models.append(("SFT Model (Llama-3.1-8B + LoRA)", config.sft_checkpoint))
242
+
243
+ if config.rl_checkpoint:
244
+ models.append(("RL Model (Llama-3.1-8B + LoRA)", config.rl_checkpoint))
245
+
246
+ # Run evaluations
247
+ all_results = {}
248
+
249
+ for model_name, checkpoint in models:
250
+ aggregate, details = await evaluate_model(
251
+ service_client, tokenizer, renderer, checkpoint, test_data, model_name
252
+ )
253
+ all_results[model_name] = {
254
+ "aggregate": aggregate,
255
+ "details": details
256
+ }
257
+
258
+ # Print comparison table
259
+ print("\n" + "=" * 70)
260
+ print("BENCHMARK RESULTS")
261
+ print("=" * 70)
262
+
263
+ print(f"\n{'Metric':<20} ", end="")
264
+ for model_name in all_results:
265
+ short_name = model_name.split(" (")[0]
266
+ print(f"{short_name:<15} ", end="")
267
+ print()
268
+ print("-" * 70)
269
+
270
+ metrics_to_show = [
271
+ ("F1 Score", "f1"),
272
+ ("Precision", "precision"),
273
+ ("Recall", "recall"),
274
+ ("Exact Match", "exact_match"),
275
+ ("Any Match", "any_match"),
276
+ ("Temporal Match", "temporal_match"),
277
+ ("Scope Match", "scope_match"),
278
+ ("Format Valid", "format_valid"),
279
+ ("Mean Latency", "mean_latency"),
280
+ ]
281
+
282
+ for display_name, key in metrics_to_show:
283
+ print(f"{display_name:<20} ", end="")
284
+ for model_name in all_results:
285
+ value = all_results[model_name]["aggregate"][key]
286
+ if key == "mean_latency":
287
+ print(f"{value:.3f}s ", end="")
288
+ else:
289
+ print(f"{value:.1%} ", end="")
290
+ print()
291
+
292
+ # Save results
293
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
294
+ output_path = os.path.join(config.output_dir, f"benchmark_{timestamp}.json")
295
+
296
+ with open(output_path, "w") as f:
297
+ json.dump({
298
+ "config": {
299
+ "base_model": config.base_model,
300
+ "test_examples": len(test_data),
301
+ },
302
+ "results": {k: v["aggregate"] for k, v in all_results.items()},
303
+ "details": {k: v["details"] for k, v in all_results.items()}
304
+ }, f, indent=2, default=str)
305
+
306
+ print(f"\nResults saved to: {output_path}")
307
+
308
+ return all_results
309
+
310
+
311
+ async def main():
312
+ import sys
313
+
314
+ config = BenchmarkConfig()
315
+
316
+ # Parse command line args
317
+ for arg in sys.argv[1:]:
318
+ if "=" in arg:
319
+ key, value = arg.split("=", 1)
320
+ if hasattr(config, key):
321
+ setattr(config, key, value)
322
+
323
+ await run_benchmark(config)
324
+
325
+
326
+ if __name__ == "__main__":
327
+ asyncio.run(main())
328
+