MuratcanKoylan commited on
Commit
685d968
·
verified ·
1 Parent(s): ec5d796

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +48 -0
  3. MODEL_CARD.md +128 -0
  4. README.md +274 -3
  5. assets/difficulty_comparison.png +0 -0
  6. assets/generate_charts.py +207 -0
  7. assets/model_comparison.png +0 -0
  8. assets/rl_components.png +3 -0
  9. assets/rl_reward.png +0 -0
  10. assets/sft_loss.png +0 -0
  11. docs/PRD.md +498 -0
  12. docs/Synthetic-Data-Gen/synthetic_data.md +784 -0
  13. docs/tinker_docs.md +0 -0
  14. huggingface/README.md +140 -0
  15. huggingface/upload_to_hf.py +220 -0
  16. requirements.txt +14 -0
  17. synthetic_data/README.md +58 -0
  18. synthetic_data/all_generated_data_1000.jsonl +0 -0
  19. synthetic_data/balanced_async_log.txt +67 -0
  20. synthetic_data/balanced_generation_log.txt +0 -0
  21. synthetic_data/balanced_generation_log_20251124_184530.txt +0 -0
  22. synthetic_data/clean_batch.py +38 -0
  23. synthetic_data/clean_data.py +72 -0
  24. synthetic_data/debug_key.py +28 -0
  25. synthetic_data/debug_key_raw.py +13 -0
  26. synthetic_data/diverse_dataset_20251124_192207.jsonl +0 -0
  27. synthetic_data/diverse_generation_log.txt +9 -0
  28. synthetic_data/diverse_log.txt +0 -0
  29. synthetic_data/generate_sample.py +15 -0
  30. synthetic_data/generation_log_100.txt +702 -0
  31. synthetic_data/generation_log_100_v2.txt +0 -0
  32. synthetic_data/generation_log_async.txt +0 -0
  33. synthetic_data/generation_log_final.txt +0 -0
  34. synthetic_data/merged_training_dataset_2001.jsonl +0 -0
  35. synthetic_data/pipeline.py +311 -0
  36. synthetic_data/quick_test_diverse.py +62 -0
  37. synthetic_data/run_balanced_async.py +259 -0
  38. synthetic_data/run_balanced_generation.py +391 -0
  39. synthetic_data/run_batch.py +116 -0
  40. synthetic_data/run_batch_async.py +198 -0
  41. synthetic_data/run_diverse_generation.py +310 -0
  42. synthetic_data/sample_batch.json +841 -0
  43. synthetic_data/test_balanced.py +81 -0
  44. synthetic_data/test_connection.py +40 -0
  45. synthetic_data/test_diverse.py +58 -0
  46. synthetic_data/test_pipeline.py +100 -0
  47. synthetic_data/training_dataset_1000.jsonl +0 -0
  48. synthetic_data/validate.py +85 -0
  49. synthetic_data/verify_key.py +15 -0
  50. training/benchmark.py +328 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/rl_components.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Virtual environment
2
+ venv/
3
+ .venv/
4
+ env/
5
+
6
+ # Environment variables
7
+ .env
8
+ .env.local
9
+
10
+ # Python
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+ *.so
15
+ .Python
16
+ build/
17
+ develop-eggs/
18
+ dist/
19
+ downloads/
20
+ eggs/
21
+ .eggs/
22
+ lib/
23
+ lib64/
24
+ parts/
25
+ sdist/
26
+ var/
27
+ wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+
32
+ # IDE
33
+ .idea/
34
+ .vscode/
35
+ *.swp
36
+ *.swo
37
+ .DS_Store
38
+
39
+ # Jupyter
40
+ .ipynb_checkpoints/
41
+
42
+ # Logs (keep structure, ignore large files)
43
+ training/logs/**/nohup.out
44
+
45
+ # Temporary files
46
+ *.tmp
47
+ *.temp
48
+
MODEL_CARD.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Card: Memory Routing Agent (Llama-8B + LoRA)
2
+
3
+ ## Model Details
4
+
5
+ - **Model Name**: memory-routing-llama-8b-lora
6
+ - **Base Model**: meta-llama/Llama-3.1-8B
7
+ - **Architecture**: LoRA (Low-Rank Adaptation), rank 32
8
+ - **Training Platform**: Tinker (Thinking Machines)
9
+ - **Training Method**: SFT (Supervised Fine-Tuning) + RL (Reinforcement Learning)
10
+ - **Parameters**: ~8B base + ~100M LoRA adapters
11
+ - **License**: Apache 2.0
12
+
13
+ ## Intended Use
14
+
15
+ This model classifies marketing conversations into memory categories for AI assistant systems. It determines which pieces of information from a conversation should be stored in long-term memory and how they should be categorized.
16
+
17
+ ### Primary Use Cases
18
+ - Marketing AI assistants that need to remember user preferences
19
+ - CRM systems that extract structured data from conversations
20
+ - Knowledge management systems for marketing teams
21
+
22
+ ### Out-of-Scope Uses
23
+ - General-purpose chatbots
24
+ - Non-marketing domains (healthcare, legal, finance)
25
+ - Real-time conversation generation
26
+
27
+ ## Training Data
28
+
29
+ ### Synthetic Dataset
30
+ - **Size**: 2,001 conversations
31
+ - **Generation**: Cohere Command-R-Plus (104B) as teacher model
32
+ - **Format**: Multi-turn marketing conversations with category labels
33
+
34
+ ### Category Taxonomy (13 categories)
35
+ | Category | Description | Persistence |
36
+ |----------|-------------|-------------|
37
+ | company.brand_core | Voice, values, positioning | Long (>1y) |
38
+ | company.strategic_signatures | Decision frameworks | Long (>1y) |
39
+ | company.knowledge_artifacts | Docs, style guides | Long (>1y) |
40
+ | company.business_priorities | Quarterly goals | Short (<3m) |
41
+ | company.tools_config | Integrations, APIs | Medium (~6m) |
42
+ | company.performance_context | Campaign metrics | Rolling (~6m) |
43
+ | user.communication_style | Tone, format preferences | Long (>1y) |
44
+ | user.strategic_approach | Personal priorities | Long (>1y) |
45
+ | user.role_context | Title, scope | Medium (~1y) |
46
+ | user.workflow_patterns | Review cadence | Medium (~1y) |
47
+ | user.session_history | Immediate context | Short (<2w) |
48
+ | user.interaction_preferences | Coaching style | Evolving |
49
+ | none | Irrelevant content | N/A |
50
+
51
+ ## Training Procedure
52
+
53
+ ### Phase 1: Supervised Fine-Tuning (SFT)
54
+ - **Steps**: 100
55
+ - **Batch Size**: 128
56
+ - **Learning Rate**: 2.86e-4 (Tinker default for Llama-8B)
57
+ - **Optimizer**: Adam (β1=0.9, β2=0.95)
58
+ - **Loss Function**: Cross-entropy
59
+
60
+ ### Phase 2: Reinforcement Learning (RL)
61
+ - **Iterations**: 12
62
+ - **Groups per Batch**: 64
63
+ - **Group Size**: 32
64
+ - **Learning Rate**: 2e-5
65
+ - **Loss Function**: Importance sampling policy gradient
66
+ - **Reward Function**:
67
+ - R_F1 (60%): F1 score vs gold labels
68
+ - R_temp (20%): Temporal alignment
69
+ - R_parity (10%): Company/user scope
70
+ - R_eff (10%): Storage efficiency
71
+
72
+ ## Evaluation Results
73
+
74
+ ### Marketing Routing Benchmark (50 scenarios)
75
+
76
+ | Model | Any Match | Exact Match | Avg F1 |
77
+ |-------|-----------|-------------|--------|
78
+ | **Ours (8B + LoRA)** | 72% | **60%** | **0.68** |
79
+ | Cohere Command-R-Plus (104B) | 82% | 26% | 0.61 |
80
+
81
+ ### Key Findings
82
+ - **11.1% higher F1** than the 104B teacher model
83
+ - **2.3x better exact match** accuracy
84
+ - **13x smaller** than the teacher model
85
+ - Excels at single-category classification (86% exact on easy cases)
86
+ - Struggles with multi-label scenarios (10% exact on hard cases)
87
+
88
+ ### Performance by Difficulty
89
+ | Difficulty | Our Model (F1) | Cohere (F1) | Delta |
90
+ |------------|----------------|-------------|-------|
91
+ | Easy | 0.86 | 0.48 | +79% |
92
+ | Medium | 0.65 | 0.64 | +2% |
93
+ | Hard | 0.50 | 0.72 | -31% |
94
+
95
+ ## Limitations
96
+
97
+ 1. **Multi-label Detection**: Under-predicts when multiple categories apply
98
+ 2. **Company vs User Confusion**: Sometimes confuses `company.strategic_signatures` with `user.strategic_approach`
99
+ 3. **Hard Cases**: Performance drops on complex overlapping categories
100
+ 4. **Domain Specificity**: Trained only on marketing scenarios
101
+
102
+ ## Ethical Considerations
103
+
104
+ - Model trained on synthetic data; may not capture all real-world edge cases
105
+ - Should be used with human oversight for critical decisions
106
+ - Privacy: Does not store or transmit conversation data
107
+
108
+ ## Citation
109
+
110
+ ```bibtex
111
+ @misc{memory-routing-agent-2025,
112
+ title={Memory Routing Agent: Prompt Distillation for Marketing AI},
113
+ author={Muratcan Koylan},
114
+ year={2025},
115
+ howpublished={\url{https://github.com/muratcankoylan/memory-routing-agent}},
116
+ }
117
+ ```
118
+
119
+ ## Model Files
120
+
121
+ - `training/checkpoints/rl_iter_012/` - Final RL checkpoint
122
+ - `training/benchmarks/marketing_routing_benchmark.json` - Benchmark dataset
123
+ - `synthetic_data/merged_training_dataset_2001.jsonl` - Training data
124
+
125
+ ## Contact
126
+
127
+ For questions or issues, please open a GitHub issue.
128
+
README.md CHANGED
@@ -1,3 +1,274 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Memory Routing Agent
2
+
3
+ A specialized 8B parameter model that **outperforms 104B models** on marketing conversation classification.
4
+
5
+ ## Key Results
6
+
7
+ ![Model Comparison](assets/model_comparison.png)
8
+
9
+ | Metric | Our Model (8B) | Cohere (104B) |
10
+ |--------|----------------|---------------|
11
+ | **Avg F1** | **0.68** | 0.61 |
12
+ | Exact Match | **60%** | 26% |
13
+ | Model Size | 8B | 104B |
14
+ | **Improvement** | **+11.1% F1** | baseline |
15
+
16
+ Our 8B model achieves an **11.1% higher F1 score** than the 104B teacher model that generated its training data, while being **13x smaller**.
17
+
18
+ ## Training Results
19
+
20
+ ### Phase 1: Supervised Fine-Tuning (SFT)
21
+
22
+ ![SFT Loss](assets/sft_loss.png)
23
+
24
+ - **100 training steps** on 2,001 synthetic conversations
25
+ - Loss dropped from **5.47 → 0.26** (95% reduction)
26
+ - Best test loss: **0.105** at step 90
27
+
28
+ ### Phase 2: Reinforcement Learning (RL)
29
+
30
+ ![RL Reward](assets/rl_reward.png)
31
+
32
+ - **30 RL iterations** with importance sampling policy gradient
33
+ - Mean reward improved from **0.73 → 0.93** (+27%)
34
+ - Accuracy maintained at **99.9%+** throughout
35
+
36
+ ### Reward Components
37
+
38
+ ![RL Components](assets/rl_components.png)
39
+
40
+ | Component | Start | End | Description |
41
+ |-----------|-------|-----|-------------|
42
+ | R_F1 | 0.64 | 0.90 | F1 score vs gold labels |
43
+ | R_temp | 0.81 | 0.95 | Temporal alignment |
44
+ | R_parity | 0.86 | 1.00 | Company/user scope |
45
+ | R_eff | 1.00 | 1.00 | Storage efficiency |
46
+
47
+ ## Performance by Difficulty
48
+
49
+ ![Difficulty Comparison](assets/difficulty_comparison.png)
50
+
51
+ | Difficulty | Our Model | Cohere (104B) | Winner |
52
+ |------------|-----------|---------------|--------|
53
+ | Easy | **0.86** | 0.48 | Ours (+79%) |
54
+ | Medium | **0.65** | 0.64 | Ours (+2%) |
55
+ | Hard | 0.50 | **0.72** | Cohere |
56
+
57
+ Our model excels at clear-cut cases, but the larger model handles ambiguous multi-label scenarios better.
58
+
59
+ ## What It Does
60
+
61
+ The Memory Routing Agent classifies marketing conversations into 13 categories to determine what information should be stored in an AI assistant's long-term memory:
62
+
63
+ - **Company categories**: brand_core, strategic_signatures, knowledge_artifacts, business_priorities, tools_config, performance_context
64
+ - **User categories**: communication_style, strategic_approach, role_context, workflow_patterns, session_history, interaction_preferences
65
+ - **None**: Transactional or irrelevant content
66
+
67
+ ## Training Pipeline
68
+
69
+ ```
70
+ ┌─────────────────────────────────────────────────────────────────┐
71
+ │ TRAINING PIPELINE │
72
+ ├─────────────────────────────────────────────────────────────────┤
73
+ │ │
74
+ │ 1. SYNTHETIC DATA GENERATION │
75
+ │ ├── Cohere Command-R-Plus (104B) as teacher │
76
+ │ ├── 2,001 marketing conversations │
77
+ │ └── 13 category labels + persistence horizons │
78
+ │ │
79
+ │ 2. SUPERVISED FINE-TUNING (SFT) │
80
+ │ ├── Base: meta-llama/Llama-3.1-8B │
81
+ │ ├── LoRA rank 32 │
82
+ │ ├── 100 steps, batch size 128 │
83
+ │ └── Cross-entropy loss │
84
+ │ │
85
+ │ 3. REINFORCEMENT LEARNING (RL) │
86
+ │ ├── 30 iterations, 64 groups × 32 samples │
87
+ │ ├── Importance sampling policy gradient │
88
+ │ └── Composite reward: F1 + temporal + parity + efficiency │
89
+ │ │
90
+ └─────────────────────────────────────────────────────────────────┘
91
+ ```
92
+
93
+ ## Quick Start
94
+
95
+ ### Installation
96
+
97
+ ```bash
98
+ # Clone the repository
99
+ git clone https://github.com/muratcankoylan/memory-routing-agent.git
100
+ cd memory-routing-agent
101
+
102
+ # Create virtual environment
103
+ python -m venv venv
104
+ source venv/bin/activate
105
+
106
+ # Install dependencies
107
+ pip install tinker-toolkit python-dotenv cohere
108
+ pip install -e ".[envs]"
109
+ ```
110
+
111
+ ### Environment Setup
112
+
113
+ ```bash
114
+ # Create .env file
115
+ echo "TINKER_API_KEY=your_tinker_key" >> .env
116
+ echo "COHERE_API_KEY=your_cohere_key" >> .env
117
+ echo "HF_TOKEN=your_huggingface_token" >> .env
118
+ ```
119
+
120
+ ### Run Inference
121
+
122
+ ```python
123
+ import tinker
124
+ from tinker import types
125
+ from tinker_cookbook import renderers
126
+ from tinker_cookbook.tokenizer_utils import get_tokenizer
127
+
128
+ # Load model
129
+ service_client = tinker.ServiceClient()
130
+ checkpoint = "tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012"
131
+ sampling_client = service_client.create_sampling_client(model_path=checkpoint)
132
+
133
+ # Setup tokenizer and renderer
134
+ tokenizer = get_tokenizer("meta-llama/Llama-3.1-8B")
135
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
136
+
137
+ # Classify a conversation
138
+ conversation = """
139
+ USER: Our brand voice is professional but approachable. Think Harvard Business Review meets Slack.
140
+ ASSISTANT: So authoritative content with a conversational tone?
141
+ USER: Exactly. We never use jargon without explaining it first.
142
+ """
143
+
144
+ messages = [
145
+ {"role": "system", "content": "You route marketing conversations into structured memory categories..."},
146
+ {"role": "user", "content": f"Analyze this conversation:\n\n{conversation}"}
147
+ ]
148
+
149
+ prompt = renderer.build_generation_prompt(messages)
150
+ params = types.SamplingParams(max_tokens=100, temperature=0.1, stop=renderer.get_stop_sequences())
151
+ result = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1).result()
152
+
153
+ response, _ = renderer.parse_response(result.sequences[0].tokens)
154
+ print(f"Categories: {response['content']}")
155
+ # Output: company.brand_core
156
+ ```
157
+
158
+ ## Project Structure
159
+
160
+ ```
161
+ memory-routing-agent/
162
+ ├── assets/ # Training visualizations
163
+ │ ├── sft_loss.png
164
+ │ ├── rl_reward.png
165
+ │ ├── rl_components.png
166
+ │ ├── model_comparison.png
167
+ │ └── difficulty_comparison.png
168
+ ├── synthetic_data/ # Data generation pipeline
169
+ │ ├── pipeline.py # Cohere-based conversation generator
170
+ │ ├── run_diverse_generation.py
171
+ │ └── merged_training_dataset_2001.jsonl
172
+ ├── training/ # Training scripts
173
+ │ ├── train_v2.py # Main training script (SFT + RL)
174
+ │ ├── preprocess.py # Data preprocessing
175
+ │ ├── rl_env.py # RL environment and reward function
176
+ │ ├── final_benchmark.py # Benchmark evaluation
177
+ │ ├── logs/ # Training logs (JSONL)
178
+ │ └── benchmarks/ # Benchmark results
179
+ ├── huggingface/ # HuggingFace upload scripts
180
+ ├── docs/ # Documentation
181
+ │ ├── PRD.md # Product requirements
182
+ │ └── tinker_docs.md # Tinker reference
183
+ ├── MODEL_CARD.md # Model card
184
+ └── README.md # This file
185
+ ```
186
+
187
+ ## Benchmark
188
+
189
+ The Marketing Routing Benchmark contains 50 challenging scenarios across 7 domains:
190
+
191
+ | Domain | Scenarios | Description |
192
+ |--------|-----------|-------------|
193
+ | Brand & Positioning | 8 | Brand voice, values, identity |
194
+ | Strategic Decisions | 8 | Decision frameworks, heuristics |
195
+ | Performance & Metrics | 8 | Campaign metrics, learnings |
196
+ | Tools & Integrations | 6 | Tech stack, APIs |
197
+ | User Preferences | 10 | Communication style, workflow |
198
+ | Business Priorities | 6 | Goals, focus areas |
199
+ | Knowledge Artifacts | 4 | Docs, playbooks, templates |
200
+
201
+ ### Run Benchmark
202
+
203
+ ```bash
204
+ python training/final_benchmark.py
205
+ ```
206
+
207
+ ## Training Your Own Model
208
+
209
+ ### 1. Generate Synthetic Data
210
+
211
+ ```bash
212
+ cd synthetic_data
213
+ python run_diverse_generation.py --num_items 1000
214
+ ```
215
+
216
+ ### 2. Preprocess Data
217
+
218
+ ```bash
219
+ python training/prepare_data.py
220
+ ```
221
+
222
+ ### 3. Run Training
223
+
224
+ ```bash
225
+ python training/train_v2.py
226
+ ```
227
+
228
+ ### 4. Evaluate
229
+
230
+ ```bash
231
+ python training/final_benchmark.py
232
+ ```
233
+
234
+ ## Reward Function
235
+
236
+ The RL phase uses a composite reward:
237
+
238
+ ```
239
+ R_total = 0.6 × R_F1 + 0.2 × R_temp + 0.1 × R_parity + 0.1 × R_eff
240
+ ```
241
+
242
+ | Component | Weight | Description |
243
+ |-----------|--------|-------------|
244
+ | R_F1 | 60% | F1 score vs gold labels |
245
+ | R_temp | 20% | Persistence horizon alignment |
246
+ | R_parity | 10% | Company/user scope correctness |
247
+ | R_eff | 10% | Storage efficiency (≤3 categories) |
248
+
249
+ ## Limitations
250
+
251
+ - **Multi-label**: Under-predicts when multiple categories apply
252
+ - **Overlap**: Struggles with company/user category overlap
253
+ - **Domain**: Marketing-specific; not tested on other domains
254
+
255
+ ## Citation
256
+
257
+ ```bibtex
258
+ @misc{memory-routing-agent-2025,
259
+ title={Memory Routing Agent: Prompt Distillation for Marketing AI},
260
+ author={Muratcan Koylan},
261
+ year={2025},
262
+ howpublished={\url{https://github.com/muratcankoylan/memory-routing-agent}},
263
+ }
264
+ ```
265
+
266
+ ## License
267
+
268
+ Apache 2.0
269
+
270
+ ## Acknowledgments
271
+
272
+ - [Thinking Machines](https://thinkingmachines.ai/) for Tinker training platform
273
+ - [Cohere](https://cohere.com/) for Command-R-Plus teacher model
274
+ - [Meta](https://ai.meta.com/) for Llama 3.1 base model
assets/difficulty_comparison.png ADDED
assets/generate_charts.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate training visualization charts for README
3
+ """
4
+ import json
5
+ import matplotlib.pyplot as plt
6
+ import matplotlib
7
+ matplotlib.use('Agg')
8
+ import numpy as np
9
+ import os
10
+
11
+ # Create assets directory
12
+ os.makedirs('assets', exist_ok=True)
13
+
14
+ # Load SFT metrics
15
+ sft_metrics = []
16
+ with open('training/logs/run_20251124_200256/sft_metrics.jsonl', 'r') as f:
17
+ for line in f:
18
+ sft_metrics.append(json.loads(line))
19
+
20
+ # Load RL metrics
21
+ rl_metrics = []
22
+ with open('training/logs/run_20251124_200256/rl_metrics.jsonl', 'r') as f:
23
+ for line in f:
24
+ rl_metrics.append(json.loads(line))
25
+
26
+ # Style settings
27
+ plt.style.use('seaborn-v0_8-whitegrid')
28
+ colors = {
29
+ 'train': '#2563eb',
30
+ 'test': '#dc2626',
31
+ 'reward': '#059669',
32
+ 'f1': '#7c3aed',
33
+ 'our_model': '#2563eb',
34
+ 'cohere': '#dc2626'
35
+ }
36
+
37
+ # ============ Chart 1: SFT Loss Curve ============
38
+ fig, ax = plt.subplots(figsize=(10, 5))
39
+
40
+ steps = [m['step'] for m in sft_metrics]
41
+ train_loss = [m['train_loss'] for m in sft_metrics]
42
+
43
+ # Get test loss points
44
+ test_steps = [m['step'] for m in sft_metrics if 'test_loss' in m]
45
+ test_loss = [m['test_loss'] for m in sft_metrics if 'test_loss' in m]
46
+
47
+ ax.plot(steps, train_loss, color=colors['train'], linewidth=2, label='Train Loss', alpha=0.8)
48
+ ax.scatter(test_steps, test_loss, color=colors['test'], s=80, zorder=5, label='Test Loss', marker='o')
49
+ ax.plot(test_steps, test_loss, color=colors['test'], linewidth=2, linestyle='--', alpha=0.5)
50
+
51
+ ax.set_xlabel('Training Step', fontsize=12)
52
+ ax.set_ylabel('Loss', fontsize=12)
53
+ ax.set_title('SFT Training: Loss Convergence', fontsize=14, fontweight='bold')
54
+ ax.legend(loc='upper right', fontsize=10)
55
+ ax.set_ylim(0, 6)
56
+
57
+ # Add annotations
58
+ ax.annotate(f'Final: {train_loss[-1]:.3f}', xy=(steps[-1], train_loss[-1]),
59
+ xytext=(steps[-1]-15, train_loss[-1]+0.5),
60
+ fontsize=9, color=colors['train'])
61
+ ax.annotate(f'Best Test: {min(test_loss):.3f}', xy=(test_steps[test_loss.index(min(test_loss))], min(test_loss)),
62
+ xytext=(test_steps[test_loss.index(min(test_loss))]+5, min(test_loss)+0.3),
63
+ fontsize=9, color=colors['test'])
64
+
65
+ plt.tight_layout()
66
+ plt.savefig('assets/sft_loss.png', dpi=150, bbox_inches='tight')
67
+ plt.close()
68
+ print("Saved: assets/sft_loss.png")
69
+
70
+ # ============ Chart 2: RL Reward Progression ============
71
+ fig, ax = plt.subplots(figsize=(10, 5))
72
+
73
+ iterations = [m['iteration'] for m in rl_metrics]
74
+ mean_reward = [m['mean_reward'] for m in rl_metrics]
75
+ std_reward = [m['std_reward'] for m in rl_metrics]
76
+
77
+ # Plot with confidence band
78
+ ax.fill_between(iterations,
79
+ [r - s for r, s in zip(mean_reward, std_reward)],
80
+ [r + s for r, s in zip(mean_reward, std_reward)],
81
+ alpha=0.2, color=colors['reward'])
82
+ ax.plot(iterations, mean_reward, color=colors['reward'], linewidth=2.5, label='Mean Reward')
83
+
84
+ ax.set_xlabel('RL Iteration', fontsize=12)
85
+ ax.set_ylabel('Reward', fontsize=12)
86
+ ax.set_title('RL Training: Reward Progression', fontsize=14, fontweight='bold')
87
+ ax.legend(loc='lower right', fontsize=10)
88
+ ax.set_ylim(0.5, 1.0)
89
+
90
+ # Add annotations
91
+ ax.annotate(f'Start: {mean_reward[0]:.3f}', xy=(0, mean_reward[0]),
92
+ xytext=(2, mean_reward[0]-0.05), fontsize=9, color=colors['reward'])
93
+ ax.annotate(f'Peak: {max(mean_reward):.3f}', xy=(mean_reward.index(max(mean_reward)), max(mean_reward)),
94
+ xytext=(mean_reward.index(max(mean_reward))+2, max(mean_reward)+0.02),
95
+ fontsize=9, color=colors['reward'])
96
+
97
+ plt.tight_layout()
98
+ plt.savefig('assets/rl_reward.png', dpi=150, bbox_inches='tight')
99
+ plt.close()
100
+ print("Saved: assets/rl_reward.png")
101
+
102
+ # ============ Chart 3: Reward Components ============
103
+ fig, ax = plt.subplots(figsize=(10, 5))
104
+
105
+ r_f1 = [m['mean_r_f1'] for m in rl_metrics]
106
+ r_temp = [m['mean_r_temp'] for m in rl_metrics]
107
+ r_parity = [m['mean_r_parity'] for m in rl_metrics]
108
+ r_eff = [m['mean_r_eff'] for m in rl_metrics]
109
+
110
+ ax.plot(iterations, r_f1, label='R_F1 (60%)', linewidth=2, color='#2563eb')
111
+ ax.plot(iterations, r_temp, label='R_temp (20%)', linewidth=2, color='#7c3aed')
112
+ ax.plot(iterations, r_parity, label='R_parity (10%)', linewidth=2, color='#059669')
113
+ ax.plot(iterations, r_eff, label='R_eff (10%)', linewidth=2, color='#f59e0b')
114
+
115
+ ax.set_xlabel('RL Iteration', fontsize=12)
116
+ ax.set_ylabel('Reward Component', fontsize=12)
117
+ ax.set_title('RL Training: Reward Components', fontsize=14, fontweight='bold')
118
+ ax.legend(loc='lower right', fontsize=10)
119
+ ax.set_ylim(0.5, 1.05)
120
+
121
+ plt.tight_layout()
122
+ plt.savefig('assets/rl_components.png', dpi=150, bbox_inches='tight')
123
+ plt.close()
124
+ print("Saved: assets/rl_components.png")
125
+
126
+ # ============ Chart 4: Model Comparison ============
127
+ fig, ax = plt.subplots(figsize=(8, 5))
128
+
129
+ metrics = ['Avg F1', 'Exact Match', 'Any Match']
130
+ our_model = [0.68, 0.60, 0.72]
131
+ cohere = [0.61, 0.26, 0.82]
132
+
133
+ x = np.arange(len(metrics))
134
+ width = 0.35
135
+
136
+ bars1 = ax.bar(x - width/2, our_model, width, label='Ours (8B)', color=colors['our_model'])
137
+ bars2 = ax.bar(x + width/2, cohere, width, label='Cohere (104B)', color=colors['cohere'])
138
+
139
+ ax.set_ylabel('Score', fontsize=12)
140
+ ax.set_title('Model Comparison: 50 Marketing Scenarios', fontsize=14, fontweight='bold')
141
+ ax.set_xticks(x)
142
+ ax.set_xticklabels(metrics, fontsize=11)
143
+ ax.legend(loc='upper right', fontsize=10)
144
+ ax.set_ylim(0, 1.0)
145
+
146
+ # Add value labels
147
+ for bar in bars1:
148
+ height = bar.get_height()
149
+ ax.annotate(f'{height:.0%}',
150
+ xy=(bar.get_x() + bar.get_width() / 2, height),
151
+ xytext=(0, 3), textcoords="offset points",
152
+ ha='center', va='bottom', fontsize=10, fontweight='bold')
153
+
154
+ for bar in bars2:
155
+ height = bar.get_height()
156
+ ax.annotate(f'{height:.0%}',
157
+ xy=(bar.get_x() + bar.get_width() / 2, height),
158
+ xytext=(0, 3), textcoords="offset points",
159
+ ha='center', va='bottom', fontsize=10)
160
+
161
+ plt.tight_layout()
162
+ plt.savefig('assets/model_comparison.png', dpi=150, bbox_inches='tight')
163
+ plt.close()
164
+ print("Saved: assets/model_comparison.png")
165
+
166
+ # ============ Chart 5: Performance by Difficulty ============
167
+ fig, ax = plt.subplots(figsize=(8, 5))
168
+
169
+ difficulties = ['Easy', 'Medium', 'Hard']
170
+ our_f1 = [0.86, 0.65, 0.50]
171
+ cohere_f1 = [0.48, 0.64, 0.72]
172
+
173
+ x = np.arange(len(difficulties))
174
+ width = 0.35
175
+
176
+ bars1 = ax.bar(x - width/2, our_f1, width, label='Ours (8B)', color=colors['our_model'])
177
+ bars2 = ax.bar(x + width/2, cohere_f1, width, label='Cohere (104B)', color=colors['cohere'])
178
+
179
+ ax.set_ylabel('F1 Score', fontsize=12)
180
+ ax.set_title('F1 Score by Difficulty Level', fontsize=14, fontweight='bold')
181
+ ax.set_xticks(x)
182
+ ax.set_xticklabels(difficulties, fontsize=11)
183
+ ax.legend(loc='upper right', fontsize=10)
184
+ ax.set_ylim(0, 1.0)
185
+
186
+ # Add value labels
187
+ for bar in bars1:
188
+ height = bar.get_height()
189
+ ax.annotate(f'{height:.2f}',
190
+ xy=(bar.get_x() + bar.get_width() / 2, height),
191
+ xytext=(0, 3), textcoords="offset points",
192
+ ha='center', va='bottom', fontsize=10, fontweight='bold')
193
+
194
+ for bar in bars2:
195
+ height = bar.get_height()
196
+ ax.annotate(f'{height:.2f}',
197
+ xy=(bar.get_x() + bar.get_width() / 2, height),
198
+ xytext=(0, 3), textcoords="offset points",
199
+ ha='center', va='bottom', fontsize=10)
200
+
201
+ plt.tight_layout()
202
+ plt.savefig('assets/difficulty_comparison.png', dpi=150, bbox_inches='tight')
203
+ plt.close()
204
+ print("Saved: assets/difficulty_comparison.png")
205
+
206
+ print("\nAll charts generated successfully!")
207
+
assets/model_comparison.png ADDED
assets/rl_components.png ADDED

Git LFS Details

  • SHA256: 3be9068cd428b4126d96f3bc1c93071cfdd44eb089c86501de90fd3f17e31c86
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
assets/rl_reward.png ADDED
assets/sft_loss.png ADDED
docs/PRD.md ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Product Requirements Document: Intelligent Memory Routing System (Tinker Implementation)
2
+
3
+ ## 1. Executive Summary
4
+ Deliver a production memory-routing agent for marketing AI systems using Tinker as the exclusive training platform. The agent ingests conversation snippets, decides if the content merits storage, assigns the correct ontology slot, and respects persistence horizons. We follow a two-stage pipeline: supervised prompt distillation for initialization and reinforcement learning to optimize downstream retrieval utility. All code must rely on Tinker’s asynchronous APIs and built-in loss functions (cross-entropy and importance sampling) to stay within supported patterns.
5
+
6
+ ## 2. Goals and Non-Goals
7
+ - **Goals**
8
+ - Maintain selective, temporally-aware storage across the 12-category taxonomy plus `none`.
9
+ - Achieve reliable multi-label routing with tight output formatting compatible with Tinker renderers.
10
+ - Support RL reward shaping for retrieval F1, temporal correctness, company/user separation, and storage efficiency.
11
+ - Produce checkpoints consumable by downstream services via Tinker sampling clients.
12
+ - **Non-Goals**
13
+ - Building a retrieval engine or serving layer.
14
+ - Extending Tinker beyond LoRA or supported loss functions.
15
+
16
+ ## 3. Success Metrics
17
+ - Exact-match accuracy ≥80% on held-out labeled data.
18
+ - Macro F1 ≥90% across categories; `none` precision ≥90%, recall ≥85%.
19
+ - Average predicted categories per utterance ≤2.0.
20
+ - Temporal alignment accuracy ≥90% (long/medium/short mapping).
21
+ - KL divergence: target <0.005, warn 0.005–0.01, critical >0.01.
22
+
23
+ ## 4. System Architecture Overview
24
+ 1. **Synthetic Conversation Library** – Scenario templates drive GPT-5 generation to cover ontology breadth and noise patterns.
25
+ 2. **Teacher Labeling** – GPT-5, prompted with taxonomy/persistence guidance, produces gold labels (multi-label + `none`).
26
+ 3. **Prompt Distillation (SFT)** – Llama-3.1-8B LoRA (rank 32) is trained asynchronously via Tinker `forward_backward_async(..., loss_fn="cross_entropy")`.
27
+ 4. **RL Optimization** – Same model undergoes importance sampling policy gradient loss with a custom `MemoryRoutingEnv`.
28
+ 5. **Evaluation Harness** – Tinker evaluator builders and offline scripts verify accuracy, pruning behavior, and reward stability.
29
+
30
+ ### Model Selection Rationale
31
+ - Using `meta-llama/Llama-3.1-8B` (🐙 Base, 🧱 Dense, 🦆 Small) as the foundation model for this classification task. While Tinker recommends MoE models for cost efficiency and instruction-tuned models for task-specific work, we choose the base model for three reasons:
32
+ 1. **Routing Neutrality**: Instruction-tuned models may have ingrained biases toward helpfulness/verbosity that conflict with selective storage decisions. The base model learns routing behavior purely from our synthetic data.
33
+ 2. **Prompt Distillation Alignment**: Our two-stage pipeline (teacher labels → SFT → RL) is a classic prompt distillation setup where starting from a base model ensures we're not fighting pre-existing instruction-following patterns.
34
+ 3. **Evaluation Baseline**: Establishes a clean baseline for comparing LoRA vs full fine-tuning effects without confounding variables from prior post-training.
35
+ - LoRA rank 32 mirrors Tinker defaults for classification-style tasks. Higher ranks can be evaluated later if capacity becomes a bottleneck.
36
+ - **Post-MVP**: Once baseline performance is established, evaluate `meta-llama/Llama-3.1-8B-Instruct` (to measure instruction-tuning impact) and `Qwen/Qwen3-30B-A3B` (MoE cost efficiency) as alternative starting points.
37
+
38
+ ### Environment Design Notes
39
+ - Each `MemoryRoutingEnv` is a single-step bandit: `initial_observation()` returns a tokenized conversation + stop conditions, `step()` receives the model's generated classification tokens and terminates immediately with reward.
40
+ - EnvGroupBuilder clones each conversation across `group_size` rollouts for variance reduction; dataset builder provides `batch_size` EnvGroupBuilders per iteration.
41
+ - No multi-turn transitions, which matches Tinker's Env definitions and keeps reward computation simple.
42
+
43
+ ### Reward Computation Details
44
+ The `step()` method in `MemoryRoutingEnv` performs the following sequence:
45
+ 1. **Parse Model Output**: Extract predicted categories from generated tokens using renderer stop sequences. Expected format: `category1, category2, category3` (comma-separated, from valid taxonomy).
46
+ 2. **Format Validation**: If parsing fails or any category is invalid, assign `R_format = -1.0` and return immediately (zero for all other reward components).
47
+ 3. **Component Calculation**:
48
+ - `R_F1`: Token-level F1 between predicted and gold category sets. Use macro-averaging if multi-label.
49
+ - `R_temp`: Persistence alignment. +1.0 if predicted persistence matches gold (long/medium/short), +0.5 if adjacent (long↔medium or medium↔short), 0.0 otherwise. Use majority vote if multi-label predictions span multiple persistence horizons.
50
+ - `R_parity`: Company/user scope alignment. +1.0 if predicted scope (company/user/mixed/none) matches gold scope exactly, 0.0 otherwise.
51
+ - `R_eff`: Storage efficiency. `1.0` if ≤3 categories predicted, `0.7` if 4 categories, `0.4` if 5 categories, `0.0` if ≥6 categories.
52
+ 4. **Composite Reward**: `R_total = 0.6 * R_F1 + 0.2 * R_temp + 0.1 * R_parity + 0.1 * R_eff` (unless format validation failed, then `R_total = -1.0`).
53
+
54
+ **Edge Cases**:
55
+ - Model outputs empty string or only stop tokens → format validation failure.
56
+ - Model outputs `none` + other categories → invalid, format failure (none must be exclusive).
57
+ - Model outputs duplicate categories → deduplicate before computing metrics.
58
+ - Model exceeds max_tokens without hitting stop sequence → truncate and attempt parse, format failure if no valid categories extracted.
59
+
60
+ ## 5. Memory Ontology
61
+ | Category | Description | Persistence |
62
+ | --- | --- | --- |
63
+ | `company.brand_core` | Voice, values, positioning, identity anchors. | Long (>1y) |
64
+ | `company.strategic_signatures` | Decision frameworks, strategic heuristics. | Long (>1y) |
65
+ | `company.knowledge_artifacts` | Docs, style guides, playbooks. | Long (>1y) |
66
+ | `company.business_priorities` | Quarterly/seasonal goals, active campaigns. | Short (<3m) |
67
+ | `company.tools_config` | Integrations, API keys, workflow settings. | Medium (~6m) |
68
+ | `company.performance_context` | Campaign metrics, retrospectives, learnings. | Rolling (~6m) |
69
+ | `user.communication_style` | Tone, verbosity, format expectations. | Long (>1y) |
70
+ | `user.strategic_approach` | Personal priorities, success definitions. | Long (>1y) |
71
+ | `user.role_context` | Title, scope, decision authority. | Medium (~1y) |
72
+ | `user.workflow_patterns` | Review cadence, collaboration norms. | Medium (~1y) |
73
+ | `user.session_history` | Immediate context, recent asks. | Short (<2w) |
74
+ | `user.interaction_preferences` | Coaching style, feedback expectations. | Evolving |
75
+ | `none` | Irrelevant, vague, or transactional content. | Critical for noise reduction |
76
+
77
+ ## 6. Data & Prompt Strategy
78
+
79
+ ### Scenario Generation
80
+ - Script: customize `tinker_cookbook/recipes/prompt_distillation/create_data.py`.
81
+ - Inputs: category focus, distractor category, emotional tone, required signal; 4–10 turns per dialogue.
82
+ - Outputs: JSONL with scenario metadata, teacher confidence, persistence hints.
83
+
84
+ ### Teacher Prompt
85
+ ```
86
+ System: You route marketing conversations into persistent memory. Consider each utterance and decide if it conveys a durable fact. Prefer `none` unless confident.
87
+ Ontology: <category table with definitions + persistence>
88
+ Rules:
89
+ 1. Distinguish company.* from user.* details.
90
+ 2. Match persistence horizon (long/medium/short) to signal lifetime.
91
+ 3. Predict ≤3 categories unless strictly necessary.
92
+ Output:
93
+ categories: cat1, cat2 (use `none` for no storage)
94
+ ```
95
+ - Temperature 0.2, max tokens 256, stop newline.
96
+
97
+ ### Student Prompt
98
+ ```
99
+ System: You route marketing conversations into structured memory categories.
100
+ User: Conversation:
101
+ {dialogue}
102
+
103
+ Available categories:
104
+ - company.brand_core ...
105
+ - ...
106
+ - none
107
+
108
+ Respond with comma-separated categories.
109
+ ```
110
+
111
+ ### Renderer Configuration
112
+ ```python
113
+ from tinker_cookbook import renderers, tokenizer_utils
114
+
115
+ tokenizer = tokenizer_utils.get_tokenizer("meta-llama/Llama-3.1-8B")
116
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
117
+ stop_sequences = renderer.get_stop_sequences()
118
+
119
+ sampling_params = types.SamplingParams(
120
+ max_tokens=150,
121
+ temperature=0.0,
122
+ stop=stop_sequences,
123
+ )
124
+ ```
125
+
126
+ ### Parsing & Validation
127
+ - Normalize whitespace/case, strip bullets, deduplicate, enforce taxonomy membership.
128
+ - Validation helper:
129
+ ```python
130
+ def validate_datum(datum: types.Datum, vocab_size: int) -> bool:
131
+ if datum.model_input.length > 512:
132
+ return False
133
+ weights = datum.loss_fn_inputs["weights"].tolist()
134
+ if sum(weights) == 0:
135
+ return False
136
+ target_tokens = datum.loss_fn_inputs["target_tokens"].tolist()
137
+ if not all(0 <= t < vocab_size for t in target_tokens):
138
+ return False
139
+ return True
140
+ ```
141
+
142
+ ## 6.5 Synthetic Data Quality Assurance
143
+ - **Coverage:** ≥20 examples per category × persistence; ≥20% multi-label, ≥10% `none`-only.
144
+ - **Noise:** ≥30% dialogues include distractors to stress selectivity.
145
+ - **Signal Density:** >60% of turns include relevant info; length 6.5 ± 1.5 turns.
146
+ - **Human Audit:** Spot-check 100 samples per refresh; require ≥95% teacher agreement.
147
+ - **Continuous Improvement:** Log production misses, refresh quarterly, retrain teacher prompt if accuracy drops >10%.
148
+
149
+ ## 6.6 Data Preprocessing Pipeline
150
+
151
+ Before SFT training, synthetic JSONL conversations must be converted to Tinker-compatible `types.Datum` objects:
152
+
153
+ **Step 1: Load Synthetic Data**
154
+ ```python
155
+ import json
156
+ with open("train.jsonl", "r") as f:
157
+ conversations = [json.loads(line) for line in f]
158
+ ```
159
+
160
+ **Step 2: Convert to Datum Objects**
161
+ ```python
162
+ from tinker import types
163
+ from tinker_cookbook import renderers, tokenizer_utils
164
+
165
+ tokenizer = tokenizer_utils.get_tokenizer("meta-llama/Llama-3.1-8B")
166
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
167
+
168
+ def conversation_to_datum(conversation_json: dict) -> types.Datum:
169
+ """Convert synthetic conversation to training datum."""
170
+ tokens, weights = renderer.build_supervised_example(
171
+ conversation_json["conversation"]
172
+ )
173
+ model_input = types.ModelInput.from_ints(tokens[:-1])
174
+ datum = types.Datum(
175
+ model_input=model_input,
176
+ loss_fn_inputs=dict(
177
+ target_tokens=tokens[1:],
178
+ weights=weights[1:],
179
+ ),
180
+ )
181
+ return datum
182
+
183
+ train_data = [conversation_to_datum(conv) for conv in conversations]
184
+ ```
185
+
186
+ **Step 3: Validate Datum Objects**
187
+ ```python
188
+ vocab_size = len(tokenizer)
189
+ valid_data = []
190
+ for datum in train_data:
191
+ if datum.model_input.length > 4096:
192
+ print(f"Warning: Skipping example with length {datum.model_input.length}")
193
+ continue
194
+ weights = datum.loss_fn_inputs["weights"].tolist()
195
+ if sum(weights) == 0:
196
+ print("Warning: Skipping example with zero loss weights")
197
+ continue
198
+ target_tokens = datum.loss_fn_inputs["target_tokens"].tolist()
199
+ if not all(0 <= t < vocab_size for t in target_tokens):
200
+ print(f"Warning: Invalid token IDs found")
201
+ continue
202
+ valid_data.append(datum)
203
+
204
+ print(f"Preprocessed {len(valid_data)}/{len(train_data)} examples")
205
+ ```
206
+
207
+ **Step 4: Split and Save**
208
+ ```python
209
+ train_size = int(0.8 * len(valid_data))
210
+ train_dataset = valid_data[:train_size]
211
+ test_dataset = valid_data[train_size:]
212
+ ```
213
+
214
+ ## 7. Stage 1 – Prompt Distillation (Supervised Learning)
215
+
216
+ ### Dataset & Batch Size
217
+ - 1–2k labeled conversations (80/20 split after preprocessing per Section 6.6).
218
+ - Batch size 128 (per Tinker SL guidance) balances stability/throughput; if changed, scale LR ∝ √batch_size.
219
+ - Expected preprocessing yield: ~90-95% of raw JSONL (some examples filtered for length/validity).
220
+
221
+ ### Hyperparameter Selection
222
+ ```python
223
+ from tinker_cookbook.hyperparam_utils import get_lr
224
+
225
+ model_name = "meta-llama/Llama-3.1-8B"
226
+ learning_rate = get_lr(model_name) # Returns LoRA-adjusted LR: ~2.86e-4
227
+ ```
228
+ - Tinker's `get_lr()` utility already returns the LoRA-optimized learning rate for the specified model, accounting for model size and architecture. No manual scaling needed.
229
+ - Use Adam β1=0.9, β2=0.95, ε=1e-8 (Tinker SL defaults).
230
+ - **Training Duration**: Start with 300 steps minimum (≈20-25 epochs for 1.5k samples at batch_size=128). Tinker SL guidance recommends "at least 100 steps but usually best results with 1000 or more" - for LoRA classification tasks, 300-500 steps typically ensures convergence.
231
+ - **Early Stopping**: Validate every 20 steps on test set. Stop if test loss doesn't improve for 5 consecutive evaluations (100 steps patience).
232
+ - **Convergence Check**: Plot train/test loss curves. If test loss hasn't plateaued by step 300, extend to 500 steps before RL initialization.
233
+
234
+ ### Async Training Loop
235
+ ```python
236
+ import tinker
237
+ from tinker import types
238
+ from tinker_cookbook.hyperparam_utils import get_lr
239
+
240
+ service_client = tinker.ServiceClient()
241
+ training_client = await service_client.create_lora_training_client_async(
242
+ base_model="meta-llama/Llama-3.1-8B",
243
+ rank=32,
244
+ )
245
+
246
+ learning_rate = get_lr("meta-llama/Llama-3.1-8B")
247
+
248
+ for step in range(num_steps):
249
+ # Submit forward-backward pass
250
+ fwd_bwd_future = await training_client.forward_backward_async(
251
+ batch_data,
252
+ loss_fn="cross_entropy",
253
+ )
254
+
255
+ # Submit optimizer step (can overlap with forward-backward)
256
+ adam_params = types.AdamParams(
257
+ learning_rate=learning_rate,
258
+ beta1=0.9,
259
+ beta2=0.95,
260
+ eps=1e-8,
261
+ )
262
+ optim_future = await training_client.optim_step_async(adam_params)
263
+
264
+ # Wait for both operations to complete
265
+ fwd_bwd_result = await fwd_bwd_future.result_async()
266
+ optim_result = await optim_future.result_async()
267
+
268
+ # Log metrics from both operations
269
+ log_metrics(step, fwd_bwd_result, optim_result)
270
+ ```
271
+
272
+ ### Checkpointing & Sampling
273
+ ```python
274
+ # Save checkpoint for sampling (every 20 steps)
275
+ checkpoint_future = await training_client.save_weights_for_sampler_async(
276
+ name=f"sft_{step:04d}"
277
+ )
278
+ checkpoint_result = await checkpoint_future.result_async()
279
+ sampling_path = checkpoint_result.path
280
+
281
+ # Create sampling client with the checkpoint
282
+ sampling_client = service_client.create_sampling_client(
283
+ model_path=sampling_path
284
+ )
285
+
286
+ # Use with renderer stop sequences for evaluation
287
+ stop_sequences = renderer.get_stop_sequences()
288
+ sampling_params = types.SamplingParams(
289
+ max_tokens=150,
290
+ temperature=0.0,
291
+ stop=stop_sequences,
292
+ )
293
+ ```
294
+ - Save weights every 20 steps for periodic evaluation.
295
+ - Critical: Must call `.result_async()` on the checkpoint future to get the path before creating sampling client.
296
+
297
+ ## 8. Stage 2 – Reinforcement Learning
298
+
299
+ ### Environment & Reward
300
+ - `MemoryRoutingEnv` implements single-step episodes; EnvGroupBuilder replicates conversations across `group_size=8`.
301
+ - Reward: `0.6 * R_F1 + 0.2 * R_temp + 0.1 * R_parity + 0.1 * R_eff`.
302
+ - `R_F1`: F1 overlap with teacher labels.
303
+ - `R_temp`: +1 (correct persistence), +0.5 (adjacent), 0 otherwise.
304
+ - `R_parity`: +1 when company/user presence matches ground truth.
305
+ - `R_eff`: 1.0 (≤3 cats), 0.7 (4), 0.4 (5), 0 (≥6) with hard penalty for parser failures.
306
+
307
+ ### Policy & Sampling Workflow
308
+ ```python
309
+ # Save current policy weights for sampling
310
+ checkpoint_future = await training_client.save_weights_for_sampler_async(
311
+ name=f"rl_step_{step:04d}"
312
+ )
313
+ checkpoint_result = await checkpoint_future.result_async()
314
+ sampling_path = checkpoint_result.path
315
+
316
+ # Create sampling client with current policy
317
+ sampling_client = service_client.create_sampling_client(
318
+ model_path=sampling_path,
319
+ )
320
+
321
+ # Wrap in policy completer for RL rollouts
322
+ policy = TinkerTokenCompleter(
323
+ sampling_client=sampling_client,
324
+ max_tokens=150,
325
+ temperature=0.0,
326
+ stop=renderer.get_stop_sequences(),
327
+ )
328
+ ```
329
+
330
+ ### Async Training Loop
331
+ ```python
332
+ for iteration in range(num_iterations):
333
+ # 1. Gather rollouts concurrently
334
+ trajectory_groups = await asyncio.gather(
335
+ *[do_group_rollout(env_builder, policy) for env_builder in env_builders]
336
+ )
337
+
338
+ # 2. Process trajectories
339
+ filtered_groups = remove_constant_reward_groups(trajectory_groups)
340
+ advantages = compute_advantages(filtered_groups)
341
+ train_data, metadata = assemble_training_data(filtered_groups, advantages)
342
+
343
+ # 3. Submit forward-backward pass
344
+ fwd_bwd_future = await training_client.forward_backward_async(
345
+ train_data,
346
+ loss_fn="importance_sampling"
347
+ )
348
+
349
+ # 4. Submit optimizer step
350
+ adam_params = types.AdamParams(
351
+ learning_rate=2e-5,
352
+ beta1=0.9,
353
+ beta2=0.95,
354
+ eps=1e-8
355
+ )
356
+ optim_future = await training_client.optim_step_async(adam_params)
357
+
358
+ # 5. Wait for both operations to complete
359
+ fwd_bwd_result = await fwd_bwd_future.result_async()
360
+ optim_result = await optim_future.result_async()
361
+
362
+ # 6. Log metrics including KL divergence
363
+ log_metrics(iteration, fwd_bwd_result, optim_result, metadata)
364
+ ```
365
+ - Run ≈25 iterations (256 rollouts each). Adjust based on convergence and KL monitoring.
366
+
367
+ ### KL Monitoring
368
+ | Status | KL Range | Action |
369
+ | --- | --- | --- |
370
+ | Target | <0.005 | Optimal on-policy stability |
371
+ | Warning | 0.005–0.01 | Log warning, monitor closely; still stable per Tinker guidance |
372
+ | Critical | >0.01 | Halt run immediately, inspect sampler vs learner drift |
373
+
374
+ **Implementation Notes**:
375
+ - Always log `kl_sample_train_v1` and `kl_sample_train_v2` (two KL estimators per Tinker RL docs).
376
+ - Per Tinker: "training is stable with KL divergence below 0.01" - values above this threshold indicate numerical instability or off-policy issues.
377
+ - Even with full on-policy training, KL won't be exactly zero due to [non-determinism](https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/) in batched inference.
378
+ - Keep sampling temperature at 0.0 for deterministic rollouts.
379
+ - Enable gradient clipping (max_norm=1.0) if KL repeatedly exceeds 0.005.
380
+ - If KL exceeds 0.01, halt training, inspect checkpoint drift, and verify sampling client is using correct weights.
381
+
382
+ ### Future Throughput Optimizations
383
+ - After MVP, consider `StreamMinibatchConfig` to overlap sampling/training per Tinker RL docs (20–30% throughput gain).
384
+
385
+ ## 9. Evaluation & Monitoring
386
+
387
+ ### Inline
388
+ - SFT: track train/test loss, exact-match, macro/micro F1, avg categories.
389
+ - RL: log reward components, KL metrics, entropy, avg categories, stop reasons.
390
+
391
+ ### Evaluators
392
+ ```python
393
+ from tinker_cookbook.evaluators import SamplingClientEvaluator
394
+ from tinker import types
395
+
396
+ class MemoryRoutingEvaluator(SamplingClientEvaluator):
397
+ """Evaluates memory routing classification on held-out test set."""
398
+
399
+ def __init__(self, test_set, renderer, tokenizer):
400
+ self.test_set = test_set # List of preprocessed conversations with gold labels
401
+ self.renderer = renderer
402
+ self.tokenizer = tokenizer
403
+
404
+ async def __call__(self, sampling_client):
405
+ """Run holdout evaluation on the test set."""
406
+ predictions = []
407
+ gold_labels = []
408
+
409
+ # Sample predictions for each test conversation
410
+ for example in self.test_set:
411
+ # Build generation prompt from conversation
412
+ prompt = self.renderer.build_generation_prompt(
413
+ example["conversation"]
414
+ )
415
+
416
+ # Generate classification
417
+ sampling_params = types.SamplingParams(
418
+ max_tokens=150,
419
+ temperature=0.0,
420
+ stop=self.renderer.get_stop_sequences(),
421
+ )
422
+ result = await sampling_client.sample_async(
423
+ prompt=prompt,
424
+ num_samples=1,
425
+ sampling_params=sampling_params
426
+ )
427
+
428
+ # Parse model output into categories
429
+ pred_tokens = result.sequences[0].tokens
430
+ pred_text = self.tokenizer.decode(pred_tokens)
431
+ pred_categories = self._parse_categories(pred_text)
432
+
433
+ predictions.append(pred_categories)
434
+ gold_labels.append(set(example["labels"]["categories"]))
435
+
436
+ # Compute metrics
437
+ return {
438
+ "exact_match": self._compute_exact_match(predictions, gold_labels),
439
+ "macro_f1": self._compute_macro_f1(predictions, gold_labels),
440
+ "none_precision": self._compute_none_precision(predictions, gold_labels),
441
+ "temporal_accuracy": self._compute_temporal_accuracy(predictions, gold_labels),
442
+ }
443
+
444
+ def _parse_categories(self, text: str) -> set:
445
+ """Parse comma-separated categories from model output."""
446
+ # Implementation: split on comma, strip whitespace, validate against taxonomy
447
+ # Return set of valid categories or {"none"} if parsing fails
448
+ pass
449
+
450
+ def _compute_exact_match(self, preds, golds) -> float:
451
+ """Fraction of examples where predicted set exactly matches gold set."""
452
+ pass
453
+
454
+ def _compute_macro_f1(self, preds, golds) -> float:
455
+ """Macro-averaged F1 across all categories."""
456
+ pass
457
+
458
+ def _compute_none_precision(self, preds, golds) -> float:
459
+ """Precision of 'none' category predictions."""
460
+ pass
461
+
462
+ def _compute_temporal_accuracy(self, preds, golds) -> float:
463
+ """Accuracy of persistence horizon alignment (requires loading full examples)."""
464
+ pass
465
+ ```
466
+ - Register evaluator builders with `eval_every=20` for SFT (every checkpoint) and RL loops.
467
+ - Consider Inspect AI tasks after MVP for standardized benchmarking.
468
+
469
+ ### Offline & Compliance
470
+ - Offline script computes exact-match, macro/micro F1, `none` precision/recall, temporal accuracy, confusion matrix.
471
+ - Regression suite: 100 held-out dialogues rerun after each checkpoint.
472
+ - Format validator ensures comma-separated taxonomy outputs and ≤3 categories typical.
473
+
474
+ ## 10. Implementation Plan
475
+ 1. **Scenario Refresh & QA** – Generate new datasets, run teacher labeling, enforce Section 6.5 checks.
476
+ 2. **Preprocessing & Validation** – Convert to `Datum`, run parser + validator.
477
+ 3. **SFT Training** – 300–500 async steps (per Section 7 training-duration guidance) with early stopping, checkpoint weights.
478
+ 4. **RL Environment Build** – Implement env/reward/evaluators, add unit tests.
479
+ 5. **RL Training** – 25 iteration importance sampling run with KL monitoring.
480
+ 6. **Evaluation & Sign-off** – Execute evaluator builders + offline scripts, capture qualitative samples, document results.
481
+ 7. **Future Optimization** – Investigate streaming minibatch and Inspect AI integration after MVP.
482
+
483
+ ## 11. Risks & Mitigations
484
+ - **Format Drift:** reward penalty + strict parser; renderer stop sequences enforce termination.
485
+ - **`none` Collapse:** reward weights emphasize recall, track per-category confusion, rebalance data.
486
+ - **Off-Policy Instability:** monitor KL each step, warn at 0.005–0.01, halt above 0.01 (per Section 8 KL monitoring table), keep temperature=0.0, clip gradients.
487
+ - **Temporal Mislabeling:** targeted scenario generation plus dedicated reward component; run temporal audits weekly.
488
+ - **Synthetic Bias:** quarterly data refresh with human audits; ingest production edge cases.
489
+
490
+ ## 12. Deployment Considerations
491
+ - **Inference:** Export final LoRA checkpoint via `save_weights_for_sampler(name="prod_v1")`; serve via Tinker SamplingClient or export to preferred inference stack.
492
+ - **Performance Targets:** <200 ms p95 latency per routing decision; ≥100 decisions/sec on A100 (LoRA overhead ≈8 GB).
493
+ - **Monitoring:** Weekly dashboards for category distribution, `none` precision (>85%), avg categories (<2.5), temporal accuracy, reward drift.
494
+ - **Versioning:** Semantic versioning (major.minor.patch); record lineage (base → SFT → RL); keep last 3 versions for rollback.
495
+
496
+ ---
497
+ **Owner:** Technical Architecture Lead
498
+
docs/Synthetic-Data-Gen/synthetic_data.md ADDED
@@ -0,0 +1,784 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Synthetic Data Generation Guide: Memory Routing System
2
+
3
+ ## Overview
4
+ This guide provides detailed specifications for generating synthetic training data for the intelligent memory routing system. The data will be used to train both the supervised (teacher labels) and RL (reward signals) stages.
5
+
6
+ ---
7
+
8
+ ## Output Format Specification
9
+
10
+ ### JSONL Schema
11
+ Each line in the output file should be a valid JSON object with the following structure:
12
+
13
+ ```json
14
+ {
15
+ "scenario_id": "string",
16
+ "conversation": [
17
+ {
18
+ "role": "user|assistant",
19
+ "content": "string"
20
+ }
21
+ ],
22
+ "labels": {
23
+ "categories": ["string"],
24
+ "persistence_horizon": "long|medium|short",
25
+ "memory_scope": "company|user|mixed|none",
26
+ "rationale": "string"
27
+ },
28
+ "metadata": {
29
+ "scenario_type": "string",
30
+ "primary_category": "string",
31
+ "distractor_present": boolean,
32
+ "turn_count": integer,
33
+ "signals_present": ["string"]
34
+ }
35
+ }
36
+ ```
37
+
38
+ ### Field Definitions
39
+
40
+ **scenario_id**: Unique identifier (format: `{category}_{type}_{counter}`, e.g., `brand_core_standard_001`)
41
+
42
+ **conversation**: Array of message objects representing the dialogue
43
+ - Must be 4-10 turns total
44
+ - Should alternate between user and assistant (can start with either)
45
+ - Content should be realistic marketing/strategy dialogue
46
+
47
+ **labels.categories**: Array of category strings from taxonomy
48
+ - Valid values: `company.brand_core`, `company.strategic_signatures`, `company.knowledge_artifacts`, `company.business_priorities`, `company.tools_config`, `company.performance_context`, `user.communication_style`, `user.strategic_approach`, `user.role_context`, `user.workflow_patterns`, `user.session_history`, `user.interaction_preferences`, `none`
49
+ - Can be multi-label (typically 1-3 categories)
50
+ - Use `["none"]` for transactional/vague content
51
+
52
+ **labels.persistence_horizon**: Expected lifetime of information
53
+ - `long`: >1 year (e.g., brand values, communication style)
54
+ - `medium`: 6-12 months (e.g., role context, tools config)
55
+ - `short`: <3 months (e.g., business priorities, session history)
56
+
57
+ **labels.memory_scope**: Who this information pertains to
58
+ - `company`: Company-level information (brand, processes, etc.)
59
+ - `user`: Individual user preferences/context
60
+ - `mixed`: Contains both company and user information
61
+ - `none`: No memorable information
62
+
63
+ **labels.rationale**: Brief explanation (1-2 sentences) of why these categories were chosen
64
+
65
+ **metadata**: Additional context for training/evaluation
66
+ - `scenario_type`: Descriptive label (e.g., "brand_discovery", "campaign_review", "preference_setting")
67
+ - `primary_category`: The main category this example focuses on
68
+ - `distractor_present`: Whether irrelevant information was intentionally included
69
+ - `turn_count`: Number of conversation turns
70
+ - `signals_present`: List of specific signals (e.g., ["brand_voice_example", "tone_preference", "transactional_question"])
71
+
72
+ ---
73
+
74
+ ## Data Generation Prompts
75
+
76
+ ### Stage 1: Scenario Generation Prompt
77
+
78
+ Use this prompt to generate diverse scenario specifications first, then use those to create conversations.
79
+
80
+ ```
81
+ You are designing training scenarios for an AI memory system in marketing context. Generate a scenario specification with the following requirements:
82
+
83
+ TARGET SPECIFICATIONS:
84
+ - Primary Category: {category}
85
+ - Distractor Category: {distractor_category if applicable}
86
+ - Persistence Level: {long/medium/short}
87
+ - Emotional Tone: {neutral/excited/frustrated/collaborative}
88
+ - Turn Count: {4-10}
89
+ - Special Requirements: {e.g., "include specific brand voice example", "multi-label with user preference"}
90
+
91
+ OUTPUT FORMAT:
92
+ Return a JSON object with:
93
+ {
94
+ "scenario_description": "Brief narrative setup (2-3 sentences)",
95
+ "user_profile": "User role and context",
96
+ "key_signals_to_include": ["List of 2-4 specific memory-worthy signals"],
97
+ "distractor_signals": ["Optional list of noise/irrelevant info"],
98
+ "suggested_turn_breakdown": "How the conversation should flow"
99
+ }
100
+
101
+ EXAMPLE OUTPUT:
102
+ {
103
+ "scenario_description": "Marketing director discussing their personal communication preferences while reviewing campaign performance. They reveal tone expectations and decision-making style.",
104
+ "user_profile": "Senior Marketing Director at B2B SaaS company, prefers data-driven discussions, values conciseness",
105
+ "key_signals_to_include": [
106
+ "Explicit statement about preferring bullet points over paragraphs",
107
+ "Request for 'bottom-line-up-front' approach",
108
+ "Mention of quarterly review cadence"
109
+ ],
110
+ "distractor_signals": [
111
+ "Transactional question about meeting time",
112
+ "Small talk about weather"
113
+ ],
114
+ "suggested_turn_breakdown": "Start with campaign review (business_priorities), transition to feedback on communication style (user.communication_style), end with scheduling (none)"
115
+ }
116
+
117
+ Generate a scenario for: {TARGET SPECIFICATIONS}
118
+ ```
119
+
120
+ ### Stage 2: Conversation Generation Prompt
121
+
122
+ Use this prompt with GPT-5 (or Claude) to generate the actual conversation based on a scenario spec.
123
+
124
+ ```
125
+ You are generating realistic marketing conversations between a user and an AI marketing assistant. Generate natural dialogue that contains specific information worth storing in long-term memory.
126
+
127
+ CONTEXT:
128
+ You will create a conversation that exemplifies certain memory categories while maintaining realism and natural flow.
129
+
130
+ SCENARIO SPECIFICATION:
131
+ {Insert scenario_spec from Stage 1}
132
+
133
+ MEMORY TAXONOMY (for reference):
134
+ COMPANY MEMORY:
135
+ - company.brand_core: Voice, values, positioning, identity anchors (Persistence: Long >1y)
136
+ - company.strategic_signatures: Decision frameworks, strategic heuristics (Persistence: Long >1y)
137
+ - company.knowledge_artifacts: Docs, style guides, playbooks (Persistence: Long >1y)
138
+ - company.business_priorities: Quarterly/seasonal goals, active campaigns (Persistence: Short <3m)
139
+ - company.tools_config: Integrations, API keys, workflow settings (Persistence: Medium ~6m)
140
+ - company.performance_context: Campaign metrics, retrospectives, learnings (Persistence: Rolling ~6m)
141
+
142
+ USER MEMORY:
143
+ - user.communication_style: Tone, verbosity, format expectations (Persistence: Long >1y)
144
+ - user.strategic_approach: Personal priorities, success definitions (Persistence: Long >1y)
145
+ - user.role_context: Title, scope, decision authority (Persistence: Medium ~1y)
146
+ - user.workflow_patterns: Review cadence, collaboration norms (Persistence: Medium ~1y)
147
+ - user.session_history: Immediate context, recent asks (Persistence: Short <2w)
148
+ - user.interaction_preferences: Coaching style, feedback expectations (Persistence: Evolving)
149
+
150
+ SPECIAL:
151
+ - none: Irrelevant, vague, or transactional content
152
+
153
+ GENERATION RULES:
154
+ 1. Make conversations feel natural - include some filler, transitions, acknowledgments
155
+ 2. Embed memory-worthy information organically (don't make it too obvious)
156
+ 3. Include 1-2 utterances that should map to "none" for realism
157
+ 4. If multi-label scenario, ensure signals for both categories are present
158
+ 5. Length: {turn_count} turns (alternating user/assistant)
159
+ 6. Include specific, concrete details (not generic statements)
160
+ 7. For company.* categories: use "we", "our company", "our brand"
161
+ 8. For user.* categories: use "I prefer", "my approach", "I typically"
162
+
163
+ OUTPUT FORMAT:
164
+ Return a JSON object with:
165
+ {
166
+ "scenario_id": "{primary_category}_{scenario_type}_{random_3_digit_number}",
167
+ "conversation": [
168
+ {"role": "user", "content": "..."},
169
+ {"role": "assistant", "content": "..."},
170
+ ...
171
+ ],
172
+ "labels": {
173
+ "categories": ["array of applicable categories"],
174
+ "persistence_horizon": "long|medium|short",
175
+ "memory_scope": "company|user|mixed|none",
176
+ "rationale": "1-2 sentence explanation of category choices"
177
+ },
178
+ "metadata": {
179
+ "scenario_type": "descriptive_label",
180
+ "primary_category": "main_category",
181
+ "distractor_present": true|false,
182
+ "turn_count": integer,
183
+ "signals_present": ["list of specific signals included"]
184
+ }
185
+ }
186
+
187
+ CRITICAL: Respond with ONLY the JSON object. No markdown formatting, no explanation, no preamble.
188
+
189
+ Generate the conversation now.
190
+ ```
191
+
192
+ ---
193
+
194
+ ## Example Synthetic Data Instances
195
+
196
+ ### Example 1: Single-Label, company.brand_core
197
+
198
+ ```json
199
+ {
200
+ "scenario_id": "brand_core_voice_definition_047",
201
+ "conversation": [
202
+ {
203
+ "role": "user",
204
+ "content": "I'm working on our new product launch messaging and want to make sure we're staying true to our brand voice. Can you remind me what our core voice attributes are?"
205
+ },
206
+ {
207
+ "role": "assistant",
208
+ "content": "I'd be happy to help! Let me check our brand guidelines. What specific aspect of the launch are you working on?"
209
+ },
210
+ {
211
+ "role": "user",
212
+ "content": "The hero section copy. We've always positioned ourselves as the approachable expert - professional but not corporate, if that makes sense. Our voice should be confident without being arrogant, and we use contractions to sound human."
213
+ },
214
+ {
215
+ "role": "assistant",
216
+ "content": "That's perfect context. So you're looking for that balance of expertise and warmth. Would you like me to review a draft with those voice principles in mind?"
217
+ },
218
+ {
219
+ "role": "user",
220
+ "content": "Yes, that would be great. Also, I want to make sure we avoid jargon - that's been a key part of our brand since day one. We explain complex marketing concepts in plain English."
221
+ },
222
+ {
223
+ "role": "assistant",
224
+ "content": "Absolutely, accessibility through clear language is core to your brand identity. Share the draft whenever you're ready and I'll evaluate it against those voice principles."
225
+ }
226
+ ],
227
+ "labels": {
228
+ "categories": ["company.brand_core"],
229
+ "persistence_horizon": "long",
230
+ "memory_scope": "company",
231
+ "rationale": "User explicitly defines multiple core brand voice attributes (approachable expert, contractions, jargon-free) that represent enduring identity anchors."
232
+ },
233
+ "metadata": {
234
+ "scenario_type": "brand_voice_definition",
235
+ "primary_category": "company.brand_core",
236
+ "distractor_present": false,
237
+ "turn_count": 6,
238
+ "signals_present": [
239
+ "voice_attribute_approachable_expert",
240
+ "voice_attribute_contractions",
241
+ "voice_principle_no_jargon",
242
+ "positioning_statement"
243
+ ]
244
+ }
245
+ }
246
+ ```
247
+
248
+ ### Example 2: Multi-Label, company.business_priorities + user.communication_style
249
+
250
+ ```json
251
+ {
252
+ "scenario_id": "mixed_priority_communication_183",
253
+ "conversation": [
254
+ {
255
+ "role": "user",
256
+ "content": "Quick update on Q4 - our north star metric is increasing trial-to-paid conversion by 15%. Everything else is secondary."
257
+ },
258
+ {
259
+ "role": "assistant",
260
+ "content": "Got it, trial-to-paid conversion is the priority. What initiatives are you planning to support that goal?"
261
+ },
262
+ {
263
+ "role": "user",
264
+ "content": "Three main campaigns: onboarding email sequence optimization, in-app value demonstration, and sales-assist for high-intent trials. When you give me updates on these, I need the data upfront - show me the numbers first, then the narrative. I don't have time for long explanations before I see performance."
265
+ },
266
+ {
267
+ "role": "assistant",
268
+ "content": "Understood - metrics first, context second. Should I track these three campaigns as your active Q4 priorities?"
269
+ },
270
+ {
271
+ "role": "user",
272
+ "content": "Yes, and one more thing - if any campaign is underperforming, just tell me straight. I value directness over softening bad news."
273
+ }
274
+ ],
275
+ "labels": {
276
+ "categories": ["company.business_priorities", "user.communication_style"],
277
+ "persistence_horizon": "mixed",
278
+ "memory_scope": "mixed",
279
+ "rationale": "Contains Q4 business goal and active campaigns (short-term, company-level) plus explicit communication preferences about data-first reporting and directness (long-term, user-level)."
280
+ },
281
+ "metadata": {
282
+ "scenario_type": "priority_setting_with_communication_preference",
283
+ "primary_category": "company.business_priorities",
284
+ "distractor_present": false,
285
+ "turn_count": 5,
286
+ "signals_present": [
287
+ "quarterly_goal_conversion",
288
+ "active_campaigns_list",
289
+ "communication_pref_data_first",
290
+ "communication_pref_directness"
291
+ ]
292
+ }
293
+ }
294
+ ```
295
+
296
+ ### Example 3: None Category (Transactional)
297
+
298
+ ```json
299
+ {
300
+ "scenario_id": "none_transactional_312",
301
+ "conversation": [
302
+ {
303
+ "role": "user",
304
+ "content": "What time is it in New York right now?"
305
+ },
306
+ {
307
+ "role": "assistant",
308
+ "content": "It's currently 2:34 PM EST in New York."
309
+ },
310
+ {
311
+ "role": "user",
312
+ "content": "Thanks. Can you find that report I asked for earlier?"
313
+ },
314
+ {
315
+ "role": "assistant",
316
+ "content": "I don't see a previous request for a report in our conversation. Could you clarify which report you're looking for?"
317
+ },
318
+ {
319
+ "role": "user",
320
+ "content": "Never mind, I think I was talking to someone else about it."
321
+ }
322
+ ],
323
+ "labels": {
324
+ "categories": ["none"],
325
+ "persistence_horizon": "short",
326
+ "memory_scope": "none",
327
+ "rationale": "Purely transactional queries with no durable information worth storing. Time query and confused reference have no strategic or personal value."
328
+ },
329
+ "metadata": {
330
+ "scenario_type": "transactional_noise",
331
+ "primary_category": "none",
332
+ "distractor_present": false,
333
+ "turn_count": 5,
334
+ "signals_present": ["time_query", "confused_context"]
335
+ }
336
+ }
337
+ ```
338
+
339
+ ### Example 4: Multi-Label with Distractor, user.strategic_approach + user.workflow_patterns + none
340
+
341
+ ```json
342
+ {
343
+ "scenario_id": "user_strategic_workflow_mixed_089",
344
+ "conversation": [
345
+ {
346
+ "role": "user",
347
+ "content": "Before we dive into campaign planning, I need to set some expectations about how I work. I'm a big believer in testing small before scaling - I'd rather run five $1k experiments than one $5k campaign until we find what works."
348
+ },
349
+ {
350
+ "role": "assistant",
351
+ "content": "That makes sense - iterative testing to de-risk before investment. How do you typically structure these experiments?"
352
+ },
353
+ {
354
+ "role": "user",
355
+ "content": "I run them for two weeks minimum, review results every Friday afternoon, and I need at least 3x ROAS to consider something scalable. Also, completely unrelated, but is it supposed to rain tomorrow?"
356
+ },
357
+ {
358
+ "role": "assistant",
359
+ "content": "Let me check the weather... Yes, there's a 70% chance of rain tomorrow afternoon. Going back to your testing framework - do you involve your team in the Friday reviews?"
360
+ },
361
+ {
362
+ "role": "user",
363
+ "content": "Always. I do weekly reviews with the full marketing team, and I expect everyone to come prepared with their numbers. Collaboration is key to my approach - I don't make scaling decisions alone."
364
+ },
365
+ {
366
+ "role": "assistant",
367
+ "content": "Got it - test-and-learn methodology with collaborative weekly reviews every Friday. I'll keep that workflow in mind when making recommendations."
368
+ }
369
+ ],
370
+ "labels": {
371
+ "categories": ["user.strategic_approach", "user.workflow_patterns"],
372
+ "persistence_horizon": "long",
373
+ "memory_scope": "user",
374
+ "rationale": "Defines personal strategic philosophy (test small before scaling, 3x ROAS threshold) and recurring workflow pattern (Friday reviews with team). Weather question is noise."
375
+ },
376
+ "metadata": {
377
+ "scenario_type": "strategic_workflow_definition_with_distractor",
378
+ "primary_category": "user.strategic_approach",
379
+ "distractor_present": true,
380
+ "turn_count": 6,
381
+ "signals_present": [
382
+ "strategic_approach_test_before_scale",
383
+ "success_metric_3x_roas",
384
+ "workflow_friday_reviews",
385
+ "workflow_collaborative_decisions",
386
+ "distractor_weather"
387
+ ]
388
+ }
389
+ }
390
+ ```
391
+
392
+ ### Example 5: Edge Case - Long-term company.knowledge_artifacts
393
+
394
+ ```json
395
+ {
396
+ "scenario_id": "knowledge_artifacts_style_guide_221",
397
+ "conversation": [
398
+ {
399
+ "role": "user",
400
+ "content": "I just updated our email marketing style guide - it's in the shared drive under Marketing/Guidelines/Email_Style_Guide_v3.pdf. This supersedes the old version from 2023."
401
+ },
402
+ {
403
+ "role": "assistant",
404
+ "content": "Thanks for letting me know. What are the major changes in version 3?"
405
+ },
406
+ {
407
+ "role": "user",
408
+ "content": "Main updates: we now use sentence case for subject lines instead of title case, maximum 50 characters. Email body should be 150 words max, and we've added a new section on accessibility requirements - minimum 14pt font, 1.5 line spacing, alt text mandatory for all images."
409
+ },
410
+ {
411
+ "role": "assistant",
412
+ "content": "Those are significant changes, especially the accessibility requirements. Should I reference this guide when reviewing email campaigns?"
413
+ },
414
+ {
415
+ "role": "user",
416
+ "content": "Yes, absolutely. This is now the canonical source for all email creative. If anyone asks about email standards, point them to this document."
417
+ }
418
+ ],
419
+ "labels": {
420
+ "categories": ["company.knowledge_artifacts"],
421
+ "persistence_horizon": "long",
422
+ "memory_scope": "company",
423
+ "rationale": "Introduction of updated canonical style guide with specific location and new standards. This is a durable knowledge artifact that will be referenced repeatedly."
424
+ },
425
+ "metadata": {
426
+ "scenario_type": "knowledge_artifact_update",
427
+ "primary_category": "company.knowledge_artifacts",
428
+ "distractor_present": false,
429
+ "turn_count": 5,
430
+ "signals_present": [
431
+ "document_location",
432
+ "canonical_source_declaration",
433
+ "specific_guidelines_subject_line",
434
+ "specific_guidelines_accessibility"
435
+ ]
436
+ }
437
+ }
438
+ ```
439
+
440
+ ---
441
+
442
+ ## Generation Strategy & Best Practices
443
+
444
+ ## Preparing Data for Tinker Training
445
+
446
+ Use the official Tinker renderer utilities to transform the JSONL data into `types.Datum` objects before SFT/RL runs. This ensures the tokenizer, stop sequences, and weight masks match what the trainer expects ([`renderers.build_supervised_example`](tinker_docs.md#file-renderingmdx) and [`types.Datum`](tinker_docs.md#part-2-type-definitions)).
447
+
448
+ ```python
449
+ import json
450
+ import tinker
451
+ from tinker import types
452
+ from tinker_cookbook import renderers, tokenizer_utils
453
+
454
+ tokenizer = tokenizer_utils.get_tokenizer("meta-llama/Llama-3.1-8B")
455
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
456
+
457
+ def conversation_to_datum(conversation_json: dict) -> types.Datum:
458
+ tokens, weights = renderer.build_supervised_example(conversation_json["conversation"])
459
+ model_input = types.ModelInput.from_ints(tokens[:-1])
460
+ datum = types.Datum(
461
+ model_input=model_input,
462
+ loss_fn_inputs=dict(
463
+ target_tokens=tokens[1:],
464
+ weights=weights[1:],
465
+ ),
466
+ )
467
+ if datum.model_input.length > 4096:
468
+ raise ValueError("Conversation exceeds model context window")
469
+ return datum
470
+ ```
471
+
472
+ **Checklist**
473
+ - [ ] Conversations tokenized with the same renderer used during training
474
+ - [ ] Resulting `ModelInput` length < the 4,096-token budget enforced in the snippet above (Llama-3.1 supports longer contexts, but this pipeline caps sequences at 4k)
475
+ - [ ] Non-zero loss weights present (otherwise drop example)
476
+ - [ ] Saved as pickled or JSONL `Datum` payloads ready for `forward_backward_async`
477
+
478
+ ### Coverage Matrix
479
+
480
+ Generate scenarios to ensure balanced coverage:
481
+
482
+ | Category | Target % | Min Examples | With Distractor | Multi-Label |
483
+ |----------|----------|--------------|-----------------|-------------|
484
+ | company.brand_core | 10% | 100 | 30 | 20 |
485
+ | company.strategic_signatures | 8% | 80 | 25 | 15 |
486
+ | company.knowledge_artifacts | 8% | 80 | 25 | 15 |
487
+ | company.business_priorities | 10% | 100 | 40 | 30 |
488
+ | company.tools_config | 7% | 70 | 20 | 10 |
489
+ | company.performance_context | 9% | 90 | 30 | 20 |
490
+ | user.communication_style | 10% | 100 | 30 | 25 |
491
+ | user.strategic_approach | 9% | 90 | 25 | 20 |
492
+ | user.role_context | 7% | 70 | 20 | 15 |
493
+ | user.workflow_patterns | 8% | 80 | 25 | 20 |
494
+ | user.session_history | 6% | 60 | 15 | 10 |
495
+ | user.interaction_preferences | 8% | 80 | 25 | 20 |
496
+ | none | 10% | 100 | 50 | 5 |
497
+
498
+ **Total Target:** 1,100-1,200 examples minimum for SFT
499
+
500
+ ### Quality Validation Checklist
501
+
502
+ For each generated example, validate:
503
+
504
+ **Structural:**
505
+ - [ ] Valid JSON format
506
+ - [ ] All required fields present
507
+ - [ ] 4-10 turns in conversation
508
+ - [ ] Alternating roles (mostly)
509
+ - [ ] Categories are from valid taxonomy
510
+
511
+ **Content:**
512
+ - [ ] Natural language flow (not robotic)
513
+ - [ ] Specific details present (not generic)
514
+ - [ ] Clear signal for each labeled category
515
+ - [ ] Distractor is truly off-topic (if present)
516
+ - [ ] Persistence horizon matches category definition
517
+
518
+ **Label Quality:**
519
+ - [ ] Rationale explains category choice
520
+ - [ ] Multi-label examples have signals for all categories
521
+ - [ ] "none" examples have no memorable information
522
+ - [ ] Memory scope matches categories (company.* → company)
523
+
524
+ ### Batch Generation Process
525
+
526
+ 1. **Define Coverage Plan**
527
+ - Decide on total dataset size (1,500-2,000 recommended)
528
+ - Allocate examples per category per coverage matrix
529
+ - Generate scenario specifications for each category
530
+
531
+ 2. **Generate Conversations**
532
+ - Process scenarios in batches of 50-100
533
+ - Use GPT-5 or Claude Opus for generation
534
+ - Temperature: 0.7 for diversity
535
+ - Validate each batch before proceeding
536
+
537
+ 3. **Quality Review**
538
+ - Sample 100 random examples for human review
539
+ - Check for common failure modes:
540
+ * Generic statements ("our brand is innovative")
541
+ * Unclear signals (ambiguous category)
542
+ * Unrealistic dialogue (too formal/robotic)
543
+ * Missing distractors where planned
544
+ - Iterate prompts if quality issues found
545
+
546
+ 4. **Teacher Labeling**
547
+ - Run ALL examples through teacher labeling prompt
548
+ - Compare teacher labels to synthetic labels
549
+ - Agreement threshold: >95%
550
+ - If disagreement, review and regenerate
551
+
552
+ 5. **Final Dataset Assembly**
553
+ - Split train/test (80/20)
554
+ - Stratify by category to ensure test coverage
555
+ - Save as `train.jsonl` and `test.jsonl`
556
+ - Document metadata (generation date, model used, prompts)
557
+
558
+ ---
559
+
560
+ ## Teacher Labeling Prompt
561
+
562
+ Use this prompt to generate gold labels for any conversation (including real production data later):
563
+
564
+ ```
565
+ You are a memory routing classifier for a marketing AI system. Your job is to analyze conversations and determine what information should be stored in long-term memory and in which categories.
566
+
567
+ MEMORY TAXONOMY:
568
+
569
+ COMPANY MEMORY (about the organization):
570
+ 1. company.brand_core - Voice, values, positioning, identity anchors [Long-term: >1y]
571
+ 2. company.strategic_signatures - Decision frameworks, strategic heuristics [Long-term: >1y]
572
+ 3. company.knowledge_artifacts - Documents, style guides, playbooks [Long-term: >1y]
573
+ 4. company.business_priorities - Quarterly/seasonal goals, active campaigns [Short-term: <3m]
574
+ 5. company.tools_config - Integrations, API keys, workflow settings [Medium-term: ~6m]
575
+ 6. company.performance_context - Campaign metrics, retrospectives, learnings [Rolling: ~6m]
576
+
577
+ USER MEMORY (about the individual):
578
+ 7. user.communication_style - Tone, verbosity, format expectations [Long-term: >1y]
579
+ 8. user.strategic_approach - Personal priorities, success definitions [Long-term: >1y]
580
+ 9. user.role_context - Title, scope, decision authority [Medium-term: ~1y]
581
+ 10. user.workflow_patterns - Review cadence, collaboration norms [Medium-term: ~1y]
582
+ 11. user.session_history - Immediate context, recent asks [Short-term: <2w]
583
+ 12. user.interaction_preferences - Coaching style, feedback expectations [Evolving]
584
+
585
+ SPECIAL:
586
+ 13. none - Irrelevant, vague, or transactional content (use when nothing is worth remembering)
587
+
588
+ ROUTING RULES:
589
+ 1. Distinguish company.* (organization-level) from user.* (individual-level)
590
+ 2. Match persistence horizon to information lifetime
591
+ 3. Predict ≤3 categories unless strictly necessary
592
+ 4. Prefer "none" unless there's CONCRETE, DURABLE information
593
+ 5. For company.* categories: look for "we", "our", organizational facts
594
+ 6. For user.* categories: look for "I", "my", personal preferences
595
+ 7. Vague statements like "we should be innovative" → none (too generic)
596
+ 8. Specific statements like "our brand voice uses contractions" → company.brand_core
597
+
598
+ CONVERSATION TO ANALYZE:
599
+ {conversation}
600
+
601
+ OUTPUT FORMAT (JSON):
602
+ {
603
+ "categories": ["category1", "category2"],
604
+ "persistence_horizon": "long|medium|short",
605
+ "memory_scope": "company|user|mixed|none",
606
+ "rationale": "Brief explanation of why these categories were selected",
607
+ "confidence": "high|medium|low",
608
+ "extractable_facts": [
609
+ "List 1-3 specific facts that would be stored in memory"
610
+ ]
611
+ }
612
+
613
+ CRITICAL RULES:
614
+ - Be conservative with labeling - when in doubt, use "none"
615
+ - Only label what's EXPLICITLY stated, not implied
616
+ - Multi-label only when multiple distinct types of information are present
617
+ - If conversation is small talk or transactional → ["none"]
618
+
619
+ Analyze the conversation and provide your classification:
620
+ ```
621
+
622
+ ---
623
+
624
+ ## Validation & Metrics
625
+
626
+ ### Data Quality Metrics to Track
627
+
628
+ **Coverage Metrics:**
629
+ - Category distribution (should match target ±5%)
630
+ - Persistence distribution (long: 35%, medium: 30%, short: 25%, mixed: 10%)
631
+ - Memory scope distribution (company: 45%, user: 45%, mixed: 5%, none: 5%)
632
+ - Multi-label frequency (target: 20-25% of non-none examples)
633
+
634
+ **Quality Metrics:**
635
+ - Teacher agreement rate (target: >95%)
636
+ - Average turn length (target: 20-150 tokens)
637
+ - Conversation length distribution (target: mean 6.5 ± 1.5 turns)
638
+ - "none" precision via human review (target: >90%)
639
+
640
+ **Signal Metrics:**
641
+ - Average signals per conversation (target: 2-3)
642
+ - Signal diversity (unique signal types / total signals: target >0.7)
643
+ - Distractor effectiveness (human annotators can identify: target >85%)
644
+
645
+ ### Automated Validation Script
646
+
647
+ ```python
648
+ import json
649
+ from collections import Counter
650
+ from typing import Dict, List
651
+
652
+ def validate_synthetic_data(filepath: str) -> Dict[str, any]:
653
+ """Validate synthetic data quality"""
654
+
655
+ with open(filepath, 'r') as f:
656
+ data = [json.loads(line) for line in f]
657
+
658
+ # Category distribution
659
+ all_categories = []
660
+ for item in data:
661
+ all_categories.extend(item['labels']['categories'])
662
+ category_dist = Counter(all_categories)
663
+
664
+ # Multi-label frequency
665
+ multi_label_count = sum(1 for item in data if len(item['labels']['categories']) > 1)
666
+ multi_label_freq = multi_label_count / len(data)
667
+
668
+ # Turn count distribution
669
+ turn_counts = [item['metadata']['turn_count'] for item in data]
670
+ avg_turns = sum(turn_counts) / len(turn_counts)
671
+
672
+ # Persistence distribution
673
+ persistence_dist = Counter(item['labels']['persistence_horizon'] for item in data)
674
+
675
+ # Memory scope distribution
676
+ scope_dist = Counter(item['labels']['memory_scope'] for item in data)
677
+
678
+ return {
679
+ 'total_examples': len(data),
680
+ 'category_distribution': dict(category_dist),
681
+ 'multi_label_frequency': multi_label_freq,
682
+ 'avg_turns_per_conversation': avg_turns,
683
+ 'persistence_distribution': dict(persistence_dist),
684
+ 'scope_distribution': dict(scope_dist),
685
+ 'warnings': _generate_warnings(category_dist, multi_label_freq, avg_turns)
686
+ }
687
+
688
+ def _generate_warnings(cat_dist, ml_freq, avg_turns):
689
+ warnings = []
690
+
691
+ # Check for imbalanced categories
692
+ total = sum(cat_dist.values())
693
+ for cat, count in cat_dist.items():
694
+ if count / total < 0.05:
695
+ warnings.append(f"Category '{cat}' underrepresented: {count/total:.1%}")
696
+
697
+ # Check multi-label frequency
698
+ if ml_freq < 0.15:
699
+ warnings.append(f"Low multi-label frequency: {ml_freq:.1%} (target: 20-25%)")
700
+
701
+ # Check turn length
702
+ if avg_turns < 5 or avg_turns > 8:
703
+ warnings.append(f"Average turns out of range: {avg_turns:.1f} (target: 6.5±1.5)")
704
+
705
+ return warnings
706
+
707
+ # Usage
708
+ metrics = validate_synthetic_data('train.jsonl')
709
+ print(json.dumps(metrics, indent=2))
710
+ ```
711
+
712
+ ---
713
+
714
+ ## Prompt Engineering Tips
715
+
716
+ ### For Better Diversity:
717
+ 1. Raise the `temperature` parameter (e.g., to 1.0) to encourage more varied outputs
718
+ 2. Add "Generate a DIFFERENT conversation that..." to avoid repetition
719
+ 3. Provide counter-examples: "Don't make it like this: [generic example]"
720
+ 4. Use different starting phrases: "Create", "Generate", "Produce", "Design"
721
+
722
+ ### For Better Quality:
723
+ 1. Include GOOD and BAD examples in prompt
724
+ 2. Specify "Be specific, not generic" multiple times
725
+ 3. Add negative instructions: "Avoid phrases like 'let's think about', 'going forward'"
726
+ 4. Request concrete numbers, names, specific tools/platforms
727
+
728
+ ### For Better Label Accuracy:
729
+ 1. Show the model 3-5 labeled examples before asking for labels
730
+ 2. Use chain-of-thought: "First, identify the durable facts. Then, assign categories."
731
+ 3. Add calibration: "Be conservative - most conversations should have 1-2 categories, not 4+"
732
+ 4. Include edge cases in few-shot examples (generic vs specific, multi-label)
733
+
734
+ ---
735
+
736
+ ## Common Pitfalls & How to Avoid Them
737
+
738
+ ### Pitfall 1: Generic, Vague Content
739
+ **Bad Example:** "We value innovation and customer focus."
740
+ **Good Example:** "We use contractions in all customer-facing copy to sound conversational, and we never say 'utilize' when 'use' works fine."
741
+
742
+ **Fix:** Add to prompt: "Include SPECIFIC details like exact phrases, numbers, tool names, or concrete examples."
743
+
744
+ ### Pitfall 2: Over-labeling
745
+ **Bad:** Every conversation gets 3-4 categories
746
+ **Good:** Most conversations get 1-2 categories, some get 0 (none)
747
+
748
+ **Fix:** Emphasize in teacher prompt: "Be conservative. Most conversations are noise or context-specific."
749
+
750
+ ### Pitfall 3: Unrealistic Dialogue
751
+ **Bad:** "Hello, I would like to discuss our brand positioning strategy and establish core value propositions."
752
+ **Good:** "Hey, quick question about our brand voice - are we still doing the contractions thing in emails?"
753
+
754
+ **Fix:** Add natural language examples and specify: "Make it sound like a real conversation, including filler words, casual language, and natural transitions."
755
+
756
+ ### Pitfall 4: Missing Edge Cases
757
+ **Bad:** Every example is clean and obvious
758
+ **Good:** 20% have ambiguity, distractors, or edge cases
759
+
760
+ **Fix:** Explicitly generate "hard negative" scenarios: near-misses, multi-label, heavy distractors
761
+
762
+ ### Pitfall 5: Persistence Mismatch
763
+ **Bad:** "Our Q1 campaign goal" labeled as long-term
764
+ **Good:** "Our Q1 campaign goal" labeled as short-term (company.business_priorities)
765
+
766
+ **Fix:** Include persistence definitions in EVERY prompt and validate programmatically
767
+
768
+ ---
769
+
770
+ ## Summary Checklist
771
+
772
+ Before finalizing your synthetic dataset:
773
+
774
+ - [ ] 1,500-2,000 total examples generated
775
+ - [ ] All 13 categories represented, each meeting its coverage-matrix minimum (60-100 examples per category)
776
+ - [ ] 20-25% multi-label examples
777
+ - [ ] 10-15% "none" examples
778
+ - [ ] 30% with intentional distractors
779
+ - [ ] Teacher labeling agreement >95%
780
+ - [ ] Average 6.5 ± 1.5 turns per conversation
781
+ - [ ] Persistence distribution: ~35% long, ~30% medium, ~25% short, ~10% mixed
782
+ - [ ] Human review of 100 random samples (quality check)
783
+ - [ ] Train/test split (80/20, stratified by category)
784
+ - [ ] Documentation of generation process and prompts saved
docs/tinker_docs.md ADDED
The diff for this file is too large to render. See raw diff
 
huggingface/README.md ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ tags:
6
+ - memory-routing
7
+ - marketing
8
+ - classification
9
+ - llama
10
+ - lora
11
+ - tinker
12
+ base_model: meta-llama/Llama-3.1-8B
13
+ datasets:
14
+ - muratcankoylan/memory-routing-marketing
15
+ metrics:
16
+ - f1
17
+ - accuracy
18
+ pipeline_tag: text-classification
19
+ ---
20
+
21
+ # Memory Routing Agent (Llama-8B + LoRA)
22
+
23
+ A specialized 8B parameter model that **outperforms 104B models** on marketing conversation classification.
24
+
25
+ ## Key Results
26
+
27
+ | Model | Size | Avg F1 | Exact Match |
28
+ |-------|------|--------|-------------|
29
+ | **This Model** | 8B | **0.68** | **60%** |
30
+ | Cohere Command-R-Plus | 104B | 0.61 | 26% |
31
+
32
+ **11.1% higher F1** than the 104B teacher model that generated its training data.
33
+
34
+ ## Model Description
35
+
36
+ The Memory Routing Agent classifies marketing conversations into 13 memory categories:
37
+
38
+ ### Company Categories
39
+ - `company.brand_core` - Voice, values, positioning
40
+ - `company.strategic_signatures` - Decision frameworks
41
+ - `company.knowledge_artifacts` - Docs, style guides
42
+ - `company.business_priorities` - Quarterly goals
43
+ - `company.tools_config` - Integrations, APIs
44
+ - `company.performance_context` - Campaign metrics
45
+
46
+ ### User Categories
47
+ - `user.communication_style` - Tone, format preferences
48
+ - `user.strategic_approach` - Personal priorities
49
+ - `user.role_context` - Title, scope
50
+ - `user.workflow_patterns` - Review cadence
51
+ - `user.session_history` - Immediate context
52
+ - `user.interaction_preferences` - Coaching style
53
+
54
+ ### Special
55
+ - `none` - Transactional or irrelevant content
56
+
57
+ ## Training
58
+
59
+ - **Base Model**: meta-llama/Llama-3.1-8B
60
+ - **Method**: LoRA (rank 32) + SFT + RL
61
+ - **Platform**: Tinker (Thinking Machines)
62
+ - **Dataset**: 2,001 synthetic marketing conversations
63
+ - **Teacher**: Cohere Command-R-Plus (104B)
64
+
65
+ ### Training Pipeline
66
+
67
+ 1. **SFT Phase**: 100 steps, batch size 128, cross-entropy loss
68
+ 2. **RL Phase**: 12 iterations, importance sampling policy gradient
69
+ 3. **Reward**: 0.6×F1 + 0.2×temporal + 0.1×parity + 0.1×efficiency
70
+
71
+ ## Usage
72
+
73
+ ```python
74
+ # Note: This model was trained on Tinker platform
75
+ # The checkpoint is: tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012
76
+
77
+ import tinker
78
+ from tinker import types
79
+ from tinker_cookbook import renderers
80
+ from tinker_cookbook.tokenizer_utils import get_tokenizer
81
+
82
+ service_client = tinker.ServiceClient()
83
+ checkpoint = "tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012"
84
+ sampling_client = service_client.create_sampling_client(model_path=checkpoint)
85
+
86
+ tokenizer = get_tokenizer("meta-llama/Llama-3.1-8B")
87
+ renderer = renderers.get_renderer(name="llama3", tokenizer=tokenizer)
88
+
89
+ conversation = """
90
+ USER: Our brand voice is professional but approachable.
91
+ ASSISTANT: So authoritative content with a conversational tone?
92
+ USER: Exactly. We never use jargon without explaining it first.
93
+ """
94
+
95
+ messages = [
96
+ {"role": "system", "content": "You route marketing conversations into structured memory categories..."},
97
+ {"role": "user", "content": f"Analyze this conversation:\n\n{conversation}"}
98
+ ]
99
+
100
+ prompt = renderer.build_generation_prompt(messages)
101
+ params = types.SamplingParams(max_tokens=100, temperature=0.1, stop=renderer.get_stop_sequences())
102
+ result = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1).result()
103
+
104
+ response, _ = renderer.parse_response(result.sequences[0].tokens)
105
+ print(f"Categories: {response['content']}")
106
+ # Output: company.brand_core
107
+ ```
108
+
109
+ ## Benchmark
110
+
111
+ 50 challenging marketing scenarios across 7 domains:
112
+
113
+ | Difficulty | Our Model | Cohere (104B) |
114
+ |------------|-----------|---------------|
115
+ | Easy | 0.86 F1 | 0.48 F1 |
116
+ | Medium | 0.65 F1 | 0.64 F1 |
117
+ | Hard | 0.50 F1 | 0.72 F1 |
118
+
119
+ ## Limitations
120
+
121
+ - Under-predicts multi-label scenarios
122
+ - Sometimes confuses company vs user categories
123
+ - Marketing-specific; not tested on other domains
124
+
125
+ ## Citation
126
+
127
+ ```bibtex
128
+ @misc{memory-routing-agent-2024,
129
+ title={Memory Routing Agent: Prompt Distillation for Marketing AI},
130
+ author={Muratcan Koylan},
131
+ year={2024},
132
+ howpublished={\url{https://github.com/muratcankoylan/memory-routing-agent}},
133
+ }
134
+ ```
135
+
136
+ ## Links
137
+
138
+ - **GitHub**: [muratcankoylan/memory-routing-agent](https://github.com/muratcankoylan/memory-routing-agent)
139
+ - **Training Platform**: [Tinker by Thinking Machines](https://thinkingmachines.ai/)
140
+
huggingface/upload_to_hf.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload Memory Routing Agent to HuggingFace Hub
3
+
4
+ This script uploads:
5
+ 1. Model card (README.md)
6
+ 2. Training dataset
7
+ 3. Benchmark dataset
8
+ 4. Training configuration
9
+ """
10
+
11
+ import os
12
+ import json
13
+ from dotenv import load_dotenv
14
+ from huggingface_hub import HfApi, create_repo, upload_file, upload_folder, login
15
+
16
load_dotenv()

# Configuration
HF_TOKEN = os.getenv("HF_TOKEN")
REPO_ID = "muratcankoylan/memory-routing-agent"
DATASET_REPO_ID = "muratcankoylan/memory-routing-marketing"

# Login first
if HF_TOKEN:
    # Only a short prefix is echoed so the secret never lands in logs in full.
    print(f"Logging in with token (first 10 chars): {HF_TOKEN[:10]}...")
    login(token=HF_TOKEN)
else:
    print("ERROR: HF_TOKEN not found in .env file")
    # exit() is an interactive/site-module helper and may be absent under
    # `python -S` or in frozen builds; SystemExit is the reliable script exit.
    raise SystemExit(1)
30
+
31
def upload_model():
    """Upload the model card, benchmark file, and training config to the HF model repo.

    Relies on the module-level HF_TOKEN / REPO_ID globals and expects to be run
    from the repository root (all file paths below are relative to it).
    Prints progress as each artifact is uploaded.
    """
    # Create model repo; exist_ok makes this idempotent across reruns.
    try:
        create_repo(repo_id=REPO_ID, token=HF_TOKEN, exist_ok=True)
        print(f"Created/verified repo: {REPO_ID}")
    except Exception as e:
        print(f"Repo creation note: {e}")

    # Upload README (model card)
    upload_file(
        path_or_fileobj="huggingface/README.md",
        path_in_repo="README.md",
        repo_id=REPO_ID,
        token=HF_TOKEN,
    )
    print("Uploaded model card")

    # Upload benchmark
    upload_file(
        path_or_fileobj="training/benchmarks/marketing_routing_benchmark.json",
        path_in_repo="benchmark/marketing_routing_benchmark.json",
        repo_id=REPO_ID,
        token=HF_TOKEN,
    )
    print("Uploaded benchmark")

    # Training configuration snapshot; kept in sync with the actual run settings.
    config = {
        "base_model": "meta-llama/Llama-3.1-8B",
        "lora_rank": 32,
        "sft_steps": 100,
        "sft_batch_size": 128,
        "sft_learning_rate": 2.86e-4,
        "rl_iterations": 12,
        "rl_groups_per_batch": 64,
        "rl_group_size": 32,
        "rl_learning_rate": 2e-5,
        "tinker_checkpoint": "tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012",
        "reward_weights": {
            "f1": 0.6,
            "temporal": 0.2,
            "parity": 0.1,
            "efficiency": 0.1
        }
    }

    # Materialize the config locally before upload so it is versioned on disk too.
    with open("huggingface/config.json", "w") as f:
        json.dump(config, f, indent=2)

    upload_file(
        path_or_fileobj="huggingface/config.json",
        path_in_repo="config.json",
        repo_id=REPO_ID,
        token=HF_TOKEN,
    )
    print("Uploaded config")

    print(f"\nModel uploaded to: https://huggingface.co/{REPO_ID}")
92
+
93
+
94
def upload_dataset():
    """Upload the training dataset and its README to a HF dataset repo.

    Relies on the module-level HF_TOKEN / DATASET_REPO_ID globals. Writes the
    dataset card to huggingface/dataset_readme.md locally, then pushes it and
    the merged training JSONL. Prints progress as each artifact is uploaded.
    """
    # Create dataset repo; exist_ok makes this idempotent across reruns.
    try:
        create_repo(repo_id=DATASET_REPO_ID, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
        print(f"Created/verified dataset repo: {DATASET_REPO_ID}")
    except Exception as e:
        print(f"Dataset repo creation note: {e}")

    # Dataset card (YAML front matter + markdown) written verbatim to the repo.
    dataset_readme = """---
license: apache-2.0
language:
- en
tags:
- memory-routing
- marketing
- classification
- synthetic
size_categories:
- 1K<n<10K
---

# Memory Routing Marketing Dataset

2,001 synthetic marketing conversations for training memory routing classifiers.

## Dataset Description

This dataset contains marketing conversations labeled with memory categories. Each conversation includes:
- Multi-turn dialogue between a user and AI assistant
- Category labels (13 possible categories)
- Persistence horizon (long/medium/short)
- Memory scope (company/user/none)

## Categories

### Company Categories
- `company.brand_core` - Voice, values, positioning
- `company.strategic_signatures` - Decision frameworks
- `company.knowledge_artifacts` - Docs, style guides
- `company.business_priorities` - Quarterly goals
- `company.tools_config` - Integrations, APIs
- `company.performance_context` - Campaign metrics

### User Categories
- `user.communication_style` - Tone, format preferences
- `user.strategic_approach` - Personal priorities
- `user.role_context` - Title, scope
- `user.workflow_patterns` - Review cadence
- `user.session_history` - Immediate context
- `user.interaction_preferences` - Coaching style

### Special
- `none` - Transactional or irrelevant content

## Generation

Generated using Cohere Command-R-Plus (104B) as teacher model with diverse prompts covering:
- Multiple industries (tech, retail, healthcare, finance, etc.)
- Various user roles (CMO, VP Marketing, Growth Lead, etc.)
- Different conversation styles and complexities

## Usage

```python
from datasets import load_dataset

dataset = load_dataset("muratcankoylan/memory-routing-marketing")
```

## Citation

```bibtex
@misc{memory-routing-agent-2024,
  title={Memory Routing Agent: Prompt Distillation for Marketing AI},
  author={Muratcan Koylan},
  year={2024},
  howpublished={\\url{https://github.com/muratcankoylan/memory-routing-agent}},
}
```
"""

    with open("huggingface/dataset_readme.md", "w") as f:
        f.write(dataset_readme)

    upload_file(
        path_or_fileobj="huggingface/dataset_readme.md",
        path_in_repo="README.md",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print("Uploaded dataset README")

    # Upload training data
    upload_file(
        path_or_fileobj="synthetic_data/merged_training_dataset_2001.jsonl",
        path_in_repo="data/train.jsonl",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    print("Uploaded training data")

    print(f"\nDataset uploaded to: https://huggingface.co/datasets/{DATASET_REPO_ID}")
202
+
203
+
204
if __name__ == "__main__":
    # Script entry point: push model artifacts first, then the dataset.
    banner = "=" * 60
    print(banner)
    print("Uploading Memory Routing Agent to HuggingFace")
    print(banner)

    print("\n1. Uploading model...")
    upload_model()

    print("\n2. Uploading dataset...")
    upload_dataset()

    print("\n" + banner)
    print("UPLOAD COMPLETE")
    print(banner)
    print(f"Model: https://huggingface.co/{REPO_ID}")
    print(f"Dataset: https://huggingface.co/datasets/{DATASET_REPO_ID}")
220
+
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ tinker-toolkit>=0.1.0
3
+ python-dotenv>=1.0.0
4
+ cohere>=5.0.0
5
+
6
+ # ML/Data processing
7
+ torch>=2.0.0
8
+ numpy>=1.24.0
9
+ transformers>=4.35.0
10
+ huggingface-hub>=0.19.0
11
+
12
+ # Utilities
13
+ tqdm>=4.65.0
14
+
synthetic_data/README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Synthetic Data Generation Pipeline
2
+
3
+ This directory contains the tools for generating and validating synthetic training data using Cohere's `command-a-reasoning-08-2025` model.
4
+
5
+ ## Setup
6
+
7
+ 1. **Install Dependencies**:
8
+ ```bash
9
+ python3 -m venv venv
10
+ source venv/bin/activate
11
+ pip install cohere python-dotenv tinker tinker-cookbook
12
+ ```
13
+
14
+ 2. **Environment Variables**:
15
+ Ensure your `.env` file contains your Cohere API key:
16
+ ```
17
+ COHERE_API_KEY=your_api_key_here
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ ### 1. Generate Data
23
+ Use the `SyntheticDataPipeline` class to generate data batches.
24
+
25
+ ```python
26
+ from synthetic_data.pipeline import SyntheticDataPipeline
27
+
28
+ pipeline = SyntheticDataPipeline()
29
+ # Generate 10 examples for a specific category
30
+ results = pipeline.run_batch(count=10, category="company.brand_core")
31
+ ```
32
+
33
+ You can also run the sample generator script:
34
+ ```bash
35
+ python3 synthetic_data/generate_sample.py
36
+ ```
37
+
38
+ ### 2. Validate Data
39
+ Run the validation script on any generated JSON or JSONL file to check compliance with the schema and distribution targets.
40
+
41
+ ```bash
42
+ python3 synthetic_data/validate.py synthetic_data/sample_batch.json
43
+ ```
44
+
45
+ The validator checks:
46
+ * JSON structure and required fields
47
+ * Category distribution
48
+ * Multi-label frequency
49
+ * Conversation length
50
+ * Persistence and scope consistency
51
+
52
+ ## Pipeline Components
53
+
54
+ * `pipeline.py`: Core logic for 2-stage generation (Scenario -> Conversation) using Cohere.
55
+ * `validate.py`: Quality assurance script implementing checks from `docs/synthetic_data.md`.
56
+ * `test_pipeline.py`: Unit tests for the pipeline structure.
57
+ * `generate_sample.py`: Helper script to produce a quick sample batch.
58
+
synthetic_data/all_generated_data_1000.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/balanced_async_log.txt ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ======================================================================
2
+ BALANCED CONCURRENT DATASET GENERATION
3
+ ======================================================================
4
+ Target per category: 77
5
+ Total categories: 13
6
+ Expected total: 1001
7
+ Batch size: 10 concurrent requests
8
+
9
+
10
+ [Batch 1] Generating 10 items...
11
+ Success: 10/10 | Total: 10/1001
12
+
13
+ [Batch 2] Generating 10 items...
14
+ Success: 10/10 | Total: 20/1001
15
+
16
+ [Batch 3] Generating 10 items...
17
+ Success: 10/10 | Total: 30/1001
18
+
19
+ [Batch 4] Generating 10 items...
20
+ Success: 10/10 | Total: 40/1001
21
+
22
+ [Batch 5] Generating 10 items...
23
+ Success: 10/10 | Total: 50/1001
24
+
25
+ [Batch 6] Generating 10 items...
26
+ Success: 10/10 | Total: 60/1001
27
+
28
+ [Batch 7] Generating 10 items...
29
+ Success: 10/10 | Total: 70/1001
30
+
31
+ [Batch 8] Generating 10 items...
32
+ Success: 10/10 | Total: 80/1001
33
+
34
+ [Batch 9] Generating 10 items...
35
+ Success: 10/10 | Total: 90/1001
36
+
37
+ [Batch 10] Generating 10 items...
38
+ Success: 10/10 | Total: 100/1001
39
+
40
+ Category Progress:
41
+ company.brand_core [█████░░░░░░░░░░░░░░░] 20/77
42
+ company.business_priorities [█████░░░░░░░░░░░░░░░] 20/77
43
+ company.knowledge_artifacts [█████░░░░░░░░░░░░░░░] 20/77
44
+ company.performance_context [░░░░░░░░░░░░░░░░░░░░] 0/77
45
+ company.strategic_signatures [█████░░░░░░░░░░░░░░░] 20/77
46
+ company.tools_config [█████░░░░░░░░░░░░░░░] 20/77
47
+ none [░░░░░░░░░░░░░░░░░░░░] 0/77
48
+ user.communication_style [░░░░░░░░░░░░░░░░░░░░] 0/77
49
+ user.interaction_preferences [░░░░░░░░░░░░░░░░░░░░] 0/77
50
+ user.role_context [░░░░░░░░░░░░░░░░░░░░] 0/77
51
+ user.session_history [░░░░░░░░░░░░░░░░░░░░] 0/77
52
+ user.strategic_approach [░░░░░░░░░░░░░░░░░░░░] 0/77
53
+ user.workflow_patterns [░░░░░░░░░░░░░░░░░░░░] 0/77
54
+
55
+ [Batch 11] Generating 10 items...
56
+ Success: 10/10 | Total: 110/1001
57
+
58
+ [Batch 12] Generating 10 items...
59
+ Success: 10/10 | Total: 120/1001
60
+
61
+ [Batch 13] Generating 10 items...
62
+ Success: 10/10 | Total: 130/1001
63
+
64
+ [Batch 14] Generating 10 items...
65
+ Success: 10/10 | Total: 140/1001
66
+
67
+ [Batch 15] Generating 10 items...
synthetic_data/balanced_generation_log.txt ADDED
File without changes
synthetic_data/balanced_generation_log_20251124_184530.txt ADDED
File without changes
synthetic_data/clean_batch.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+
4
def clean_batch(filepath):
    """Remove contradictory 'none' labels from a JSONL batch file.

    Reads `filepath` line by line, drops the 'none' category from any item
    that also carries real categories, and writes every item (fixed or not)
    to a sibling `*_cleaned.jsonl` file. Fixed items are flagged via
    metadata['cleaned_none_mix'] so downstream tooling can audit the change.
    """
    print(f"Cleaning {filepath}...")
    cleaned_data = []
    fixed_count = 0

    with open(filepath, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            item = json.loads(line)
            cats = item['labels']['categories']

            # 'none' alongside real categories is contradictory; keep the real ones.
            if 'none' in cats and len(cats) > 1:
                print(f"Fixing mixed 'none' in {item['scenario_id']}: {cats}")
                cats.remove('none')
                item['labels']['categories'] = cats
                # setdefault: not every generated item is guaranteed to carry
                # a metadata dict; the plain index would KeyError on those.
                item.setdefault('metadata', {})['cleaned_none_mix'] = True
                fixed_count += 1

            cleaned_data.append(item)

    # str.replace is a no-op when '.jsonl' is absent, which previously made
    # output_path == filepath and silently overwrote the input file.
    if filepath.endswith('.jsonl'):
        output_path = filepath[:-len('.jsonl')] + '_cleaned.jsonl'
    else:
        output_path = filepath + '_cleaned.jsonl'
    with open(output_path, 'w') as f:
        for item in cleaned_data:
            f.write(json.dumps(item) + '\n')

    print(f"Cleaned {len(cleaned_data)} items. Fixed {fixed_count} issues.")
    print(f"Saved to {output_path}")
32
+
33
if __name__ == "__main__":
    # CLI entry point: a single JSONL path is required.
    args = sys.argv[1:]
    if not args:
        print("Usage: python3 clean_batch.py <jsonl_file>")
        sys.exit(1)
    clean_batch(args[0])
38
+
synthetic_data/clean_data.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+ from typing import List, Dict, Any
4
+
5
def clean_datum(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Clean a single data item (mutated in place and also returned):
    1. Deduplicate categories, preserving first-seen order.
    2. Remove 'none' if other categories are present.

    Items without a labels.categories structure are returned unchanged.
    """
    if "labels" not in item or "categories" not in item["labels"]:
        return item

    cats = item["labels"]["categories"]
    # Deduplicate with a stable order. list(set(...)) would make the label
    # ordering non-deterministic across runs, which breaks reproducibility
    # of the generated training files.
    cats = list(dict.fromkeys(cats))

    # Remove 'none' if other categories exist
    if len(cats) > 1 and "none" in cats:
        cats.remove("none")

    # Update the item
    item["labels"]["categories"] = cats
    return item
26
+
27
def clean_file(input_path: str, output_path: str):
    """Clean every item in a JSON or JSONL file and write the result as JSONL.

    Accepts either a single JSON array or JSON-lines input, runs each item
    through clean_datum, and counts how many items actually changed
    (deduplicated categories or a dropped 'none').
    """
    print(f"Cleaning {input_path} -> {output_path}")
    cleaned_count = 0
    data = []

    # Read input
    with open(input_path, 'r') as f:
        content = f.read().strip()
        if not content:
            print("Empty file")
            return

    # Handle JSONL or list of JSON
    if content.startswith('[') and content.endswith(']'):
        raw_data = json.loads(content)
    else:
        raw_data = [json.loads(line) for line in content.split('\n') if line.strip()]

    # Process
    for item in raw_data:
        original_cats = item.get("labels", {}).get("categories", [])
        cleaned_item = clean_datum(item)
        # clean_datum passes malformed items (no labels/categories) through
        # untouched, so read defensively instead of indexing — the plain
        # cleaned_item["labels"]["categories"] access raised KeyError here.
        new_cats = cleaned_item.get("labels", {}).get("categories", [])

        if set(original_cats) != set(new_cats):
            cleaned_count += 1

        data.append(cleaned_item)

    # Write output (always as JSONL for training)
    with open(output_path, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

    print(f"Processed {len(data)} items. Cleaned {cleaned_count} items (removed 'none' or duplicates).")
62
+
63
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python clean_data.py input_file [output_file]")
        sys.exit(1)

    input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    else:
        # Derive '<stem>_cleaned.jsonl' from the input name. The previous
        # chained str.replace was buggy: '.json' is a substring of '.jsonl',
        # so 'x.jsonl' became 'x_cleaned_cleaned.jsonll'.
        stem = input_file
        for ext in ('.jsonl', '.json'):
            if stem.endswith(ext):
                stem = stem[:-len(ext)]
                break
        output_file = stem + '_cleaned.jsonl'

    clean_file(input_file, output_file)
72
+
synthetic_data/debug_key.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv
import cohere

load_dotenv()

key = os.getenv("CO_API_KEY")

# Smoke-test the Cohere API key: report presence, then fire one minimal chat
# request to confirm the key is actually accepted by the API.
if not key:
    print("Key NOT found in environment.")
else:
    print(f"Key found. Length: {len(key)}")
    print(f"Prefix: {key[:4]}...")

    try:
        client = cohere.ClientV2(api_key=key)
        print("Client initialized.")
        print("Testing simple chat...")
        hello_turn = [{"role": "user", "content": "Hello"}]
        response = client.chat(
            model="command-r-plus",  # Use a cheaper/standard model for quick test
            messages=hello_turn,
        )
        print("Response received!")
        print(response)
    except Exception as e:
        print(f"Error: {e}")
28
+
synthetic_data/debug_key_raw.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv

load_dotenv()

key = os.getenv("CO_API_KEY")

# Debug helper: dump the raw key so whitespace/quoting problems in .env are
# visible. NOTE: this prints the secret verbatim — never commit its output.
if not key:
    print("Key NOT found.")
else:
    print(f"Key raw: '{key}'")
    print(f"Length: {len(key)}")
13
+
synthetic_data/diverse_dataset_20251124_192207.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/diverse_generation_log.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ======================================================================
2
+ WORLD-CLASS DATASET GENERATION
3
+ ======================================================================
4
+ Temperature: 0.95 | Max creative freedom
5
+ Target: 77 per category x 13 = 1001 total
6
+ Output: synthetic_data/diverse_dataset_20251124_190248.jsonl
7
+ ======================================================================
8
+
9
+ [Batch 1] 10 items | Categories: {'none', 'company.business_priorities', 'company.performance_context', 'user.session_history', 'company.brand_core', 'user.interaction_preferences', 'company.strategic_signatures', 'user.strategic_approach'}
synthetic_data/diverse_log.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/generate_sample.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from synthetic_data.pipeline import SyntheticDataPipeline
3
+
4
def generate_sample():
    """Generate a tiny 2-item batch for company.brand_core and save it as JSON."""
    data_pipeline = SyntheticDataPipeline()
    print("Generating sample batch...")
    batch = data_pipeline.run_batch(count=2, category="company.brand_core")

    with open("synthetic_data/sample_batch.json", "w") as out:
        json.dump(batch, out, indent=2)
    print(f"Saved {len(batch)} examples to synthetic_data/sample_batch.json")
12
+
13
if __name__ == "__main__":
    # Script entry point: run one quick sample-batch generation.
    generate_sample()
15
+
synthetic_data/generation_log_100.txt ADDED
@@ -0,0 +1,702 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Starting generation of 100 items in 10 batches (Size: 10)...
2
+
3
+ === Processing Batch 1/10 ===
4
+ Generating item 1/10 (Category: user.strategic_approach)...
5
+ Generated: company.strategic_signatures_compliance_curriculum_142
6
+ Sleeping for 10s to avoid rate limits...
7
+ Generating item 2/10 (Category: company.business_priorities)...
8
+ Generated: company.business_priorities_descriptive_456
9
+ Sleeping for 10s to avoid rate limits...
10
+ Generating item 3/10 (Category: user.workflow_patterns)...
11
+ Generated: company.strategic_signatures_descriptive_742
12
+ Sleeping for 10s to avoid rate limits...
13
+ Generating item 4/10 (Category: user.session_history)...
14
+ Generated: company.tools_config_workflow_157
15
+ Sleeping for 10s to avoid rate limits...
16
+ Generating item 5/10 (Category: none)...
17
+ Generated: company.business_priorities_descriptive_456
18
+ Sleeping for 10s to avoid rate limits...
19
+ Generating item 6/10 (Category: company.knowledge_artifacts)...
20
+ Scenario generation failed (attempt 1/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': 'f32c28401e57e5a48a02968561190d5c', 'date': 'Fri, 21 Nov 2025 21:02:43 GMT', 'x-envoy-upstream-service-time': '9', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': '16af4d33-1859-4a1e-b2c5-bab9a414a063', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
21
+ Retrying in 5s...
22
+ Generated: company.business_priorities_CPG_seasonal_launch_742
23
+ Sleeping for 10s to avoid rate limits...
24
+ Generating item 7/10 (Category: user.strategic_approach)...
25
+ Generated: company.strategic_signatures_retention_742
26
+ Sleeping for 10s to avoid rate limits...
27
+ Generating item 8/10 (Category: user.workflow_patterns)...
28
+ Generated: company_business_priorities_456
29
+ Sleeping for 10s to avoid rate limits...
30
+ Generating item 9/10 (Category: company.tools_config)...
31
+ Generated: company.tools_config_fundraising_123
32
+ Sleeping for 10s to avoid rate limits...
33
+ Generating item 10/10 (Category: user.role_context)...
34
+ Generated: company.business_priorities_retention_strategy_123
35
+ Sleeping for 10s to avoid rate limits...
36
+ Saved batch to synthetic_data/batch_01.json
37
+ Validating batch...
38
+ {
39
+ "total_examples": 10,
40
+ "category_distribution": {
41
+ "company.strategic_signatures": 5,
42
+ "company.business_priorities": 9,
43
+ "user.strategic_approach": 5,
44
+ "user.workflow_patterns": 3,
45
+ "company.performance_context": 5,
46
+ "user.role_context": 7,
47
+ "company.knowledge_artifacts": 2,
48
+ "company.tools_config": 2,
49
+ "user.session_history": 1,
50
+ "user.interaction_preferences": 1,
51
+ "none": 1
52
+ },
53
+ "multi_label_frequency": 1.0,
54
+ "avg_turns_per_conversation": 6.4,
55
+ "persistence_distribution": {
56
+ "mixed": 7,
57
+ "medium": 2,
58
+ "short": 1
59
+ },
60
+ "scope_distribution": {
61
+ "mixed": 10
62
+ },
63
+ "warnings": []
64
+ }
65
+
66
+ === Processing Batch 2/10 ===
67
+ Generating item 1/10 (Category: company.business_priorities)...
68
+ Generated: company.business_priorities_sponsorship_742
69
+ Sleeping for 10s to avoid rate limits...
70
+ Generating item 2/10 (Category: company.tools_config)...
71
+ Generated: company.tools_config_collaborative_configuration_142
72
+ Sleeping for 10s to avoid rate limits...
73
+ Generating item 3/10 (Category: user.session_history)...
74
+ Generated: user_session_history_456
75
+ Sleeping for 10s to avoid rate limits...
76
+ Generating item 4/10 (Category: user.interaction_preferences)...
77
+ Generated: company.knowledge_artifacts_EdTech_strategy_refinement_451
78
+ Sleeping for 10s to avoid rate limits...
79
+ Generating item 5/10 (Category: user.strategic_approach)...
80
+ Generated: company_strategic_signatures_123
81
+ Sleeping for 10s to avoid rate limits...
82
+ Generating item 6/10 (Category: user.communication_style)...
83
+ Generated: company.brand_core_collaborative_messaging_142
84
+ Sleeping for 10s to avoid rate limits...
85
+ Generating item 7/10 (Category: user.communication_style)...
86
+ Generated: company.strategic_signatures_donor_messaging_742
87
+ Sleeping for 10s to avoid rate limits...
88
+ Generating item 8/10 (Category: company.knowledge_artifacts)...
89
+ Generated: company.knowledge_artifacts_onboarding_742
90
+ Sleeping for 10s to avoid rate limits...
91
+ Generating item 9/10 (Category: user.workflow_patterns)...
92
+ Generated: company_tools_config_742
93
+ Sleeping for 10s to avoid rate limits...
94
+ Generating item 10/10 (Category: company.knowledge_artifacts)...
95
+ Generated: company.knowledge_artifacts_retail_452
96
+ Sleeping for 10s to avoid rate limits...
97
+ Saved batch to synthetic_data/batch_02.json
98
+ Validating batch...
99
+ {
100
+ "total_examples": 10,
101
+ "category_distribution": {
102
+ "company.business_priorities": 5,
103
+ "company.knowledge_artifacts": 7,
104
+ "company.strategic_signatures": 4,
105
+ "user.strategic_approach": 6,
106
+ "company.tools_config": 3,
107
+ "company.performance_context": 2,
108
+ "user.role_context": 3,
109
+ "user.session_history": 2,
110
+ "company.brand_core": 3,
111
+ "none": 4,
112
+ "user.interaction_preferences": 1,
113
+ "user.workflow_patterns": 3,
114
+ "user.communication_style": 2
115
+ },
116
+ "multi_label_frequency": 1.0,
117
+ "avg_turns_per_conversation": 7.6,
118
+ "persistence_distribution": {
119
+ "mixed": 8,
120
+ "medium": 1,
121
+ "long": 1
122
+ },
123
+ "scope_distribution": {
124
+ "mixed": 9,
125
+ "company": 1
126
+ },
127
+ "warnings": []
128
+ }
129
+
130
+ === Processing Batch 3/10 ===
131
+ Generating item 1/10 (Category: user.session_history)...
132
+ Generated: company.business_priorities_execution_729
133
+ Sleeping for 10s to avoid rate limits...
134
+ Generating item 2/10 (Category: company.business_priorities)...
135
+ Generated: company.business_priorities_planning_753
136
+ Sleeping for 10s to avoid rate limits...
137
+ Generating item 3/10 (Category: user.role_context)...
138
+ Generated: company.business_priorities_compliance_update_142
139
+ Sleeping for 10s to avoid rate limits...
140
+ Generating item 4/10 (Category: user.communication_style)...
141
+ Generated: user.strategic_approach_negotiation_742
142
+ Sleeping for 10s to avoid rate limits...
143
+ Generating item 5/10 (Category: none)...
144
+ Generated: company.business_priorities_coordination_742
145
+ Sleeping for 10s to avoid rate limits...
146
+ Generating item 6/10 (Category: user.workflow_patterns)...
147
+ Generated: company.performance_context_b2b_saas_456
148
+ Sleeping for 10s to avoid rate limits...
149
+ Generating item 7/10 (Category: user.session_history)...
150
+ Generated: company_business_priorities_campaign_adjustment_789
151
+ Sleeping for 10s to avoid rate limits...
152
+ Generating item 8/10 (Category: user.interaction_preferences)...
153
+ Generated: user_interaction_preferences_AI_Collaboration_456
154
+ Sleeping for 10s to avoid rate limits...
155
+ Generating item 9/10 (Category: company.performance_context)...
156
+ Generated: company.performance_context_quarterly_review_742
157
+ Sleeping for 10s to avoid rate limits...
158
+ Generating item 10/10 (Category: company.business_priorities)...
159
+ Generated: company.business_priorities_healthcare_123
160
+ Sleeping for 10s to avoid rate limits...
161
+ Saved batch to synthetic_data/batch_03.json
162
+ Validating batch...
163
+ {
164
+ "total_examples": 10,
165
+ "category_distribution": {
166
+ "company.business_priorities": 9,
167
+ "user.role_context": 7,
168
+ "company.performance_context": 4,
169
+ "none": 3,
170
+ "company.knowledge_artifacts": 5,
171
+ "user.session_history": 1,
172
+ "user.strategic_approach": 4,
173
+ "user.communication_style": 1,
174
+ "company.strategic_signatures": 2,
175
+ "user.workflow_patterns": 2,
176
+ "company.tools_config": 3,
177
+ "user.interaction_preferences": 1
178
+ },
179
+ "multi_label_frequency": 1.0,
180
+ "avg_turns_per_conversation": 7.6,
181
+ "persistence_distribution": {
182
+ "short": 3,
183
+ "mixed": 4,
184
+ "medium": 3
185
+ },
186
+ "scope_distribution": {
187
+ "mixed": 10
188
+ },
189
+ "warnings": []
190
+ }
191
+
192
+ === Processing Batch 4/10 ===
193
+ Generating item 1/10 (Category: user.strategic_approach)...
194
+ Generated: company.strategic_signatures_personalization_147
195
+ Sleeping for 10s to avoid rate limits...
196
+ Generating item 2/10 (Category: user.session_history)...
197
+ Generated: company.performance_context_procurement_742
198
+ Sleeping for 10s to avoid rate limits...
199
+ Generating item 3/10 (Category: company.business_priorities)...
200
+ Generated: company_business_priorities_123
201
+ Sleeping for 10s to avoid rate limits...
202
+ Generating item 4/10 (Category: none)...
203
+ Generated: company_performance_context_742
204
+ Sleeping for 10s to avoid rate limits...
205
+ Generating item 5/10 (Category: company.business_priorities)...
206
+ Generated: company.strategic_signatures_personalization_742
207
+ Sleeping for 10s to avoid rate limits...
208
+ Generating item 6/10 (Category: user.session_history)...
209
+ Scenario generation failed (attempt 1/3): Server disconnected without sending a response.
210
+ Retrying in 5s...
211
+ Generated: company.business_priorities_healthcare_campaign_742
212
+ Sleeping for 10s to avoid rate limits...
213
+ Generating item 7/10 (Category: user.communication_style)...
214
+ Generated: user.workflow_patterns_collaboration_142
215
+ Sleeping for 10s to avoid rate limits...
216
+ Generating item 8/10 (Category: user.interaction_preferences)...
217
+ Generated: company.tools_config_workflow_adjustment_742
218
+ Sleeping for 10s to avoid rate limits...
219
+ Generating item 9/10 (Category: company.brand_core)...
220
+ Generated: company_strategic_signatures_742
221
+ Sleeping for 10s to avoid rate limits...
222
+ Generating item 10/10 (Category: company.strategic_signatures)...
223
+ Generated: company.knowledge_artifacts_compliance_142
224
+ Sleeping for 10s to avoid rate limits...
225
+ Saved batch to synthetic_data/batch_04.json
226
+ Validating batch...
227
+ {
228
+ "total_examples": 10,
229
+ "category_distribution": {
230
+ "company.strategic_signatures": 5,
231
+ "company.performance_context": 5,
232
+ "company.tools_config": 4,
233
+ "user.strategic_approach": 5,
234
+ "user.role_context": 5,
235
+ "company.business_priorities": 7,
236
+ "none": 2,
237
+ "user.session_history": 2,
238
+ "company.brand_core": 2,
239
+ "user.communication_style": 2,
240
+ "user.workflow_patterns": 3,
241
+ "company.knowledge_artifacts": 2
242
+ },
243
+ "multi_label_frequency": 1.0,
244
+ "avg_turns_per_conversation": 7.8,
245
+ "persistence_distribution": {
246
+ "mixed": 8,
247
+ "medium": 1,
248
+ "long": 1
249
+ },
250
+ "scope_distribution": {
251
+ "mixed": 10
252
+ },
253
+ "warnings": []
254
+ }
255
+
256
+ === Processing Batch 5/10 ===
257
+ Generating item 1/10 (Category: company.strategic_signatures)...
258
+ Generated: company.strategic_signatures_pilot_142
259
+ Sleeping for 10s to avoid rate limits...
260
+ Generating item 2/10 (Category: user.communication_style)...
261
+ Generated: company_knowledge_artifacts_742
262
+ Sleeping for 10s to avoid rate limits...
263
+ Generating item 3/10 (Category: user.role_context)...
264
+ Generated: company.business_priorities_campaign_142
265
+ Sleeping for 10s to avoid rate limits...
266
+ Generating item 4/10 (Category: user.communication_style)...
267
+ Generated: user.communication_style_retention_strategy_147
268
+ Sleeping for 10s to avoid rate limits...
269
+ Generating item 5/10 (Category: user.strategic_approach)...
270
+ Generated: company.business_priorities_compliance_training_123
271
+ Sleeping for 10s to avoid rate limits...
272
+ Generating item 6/10 (Category: company.brand_core)...
273
+ Generated: company.business_priorities_demand_response_123
274
+ Sleeping for 10s to avoid rate limits...
275
+ Generating item 7/10 (Category: company.strategic_signatures)...
276
+ Generated: company.business_priorities_negotiation_147
277
+ Sleeping for 10s to avoid rate limits...
278
+ Generating item 8/10 (Category: company.brand_core)...
279
+ Generated: company.brand_core_proposal_standardization_742
280
+ Sleeping for 10s to avoid rate limits...
281
+ Generating item 9/10 (Category: company.business_priorities)...
282
+ Generated: company.business_priorities_PLG_789
283
+ Sleeping for 10s to avoid rate limits...
284
+ Generating item 10/10 (Category: user.strategic_approach)...
285
+ Generated: company.strategic_signatures_planning_123
286
+ Sleeping for 10s to avoid rate limits...
287
+ Saved batch to synthetic_data/batch_05.json
288
+ Validating batch...
289
+ {
290
+ "total_examples": 10,
291
+ "category_distribution": {
292
+ "company.strategic_signatures": 5,
293
+ "company.brand_core": 3,
294
+ "company.performance_context": 7,
295
+ "company.business_priorities": 8,
296
+ "user.role_context": 4,
297
+ "user.strategic_approach": 7,
298
+ "company.knowledge_artifacts": 4,
299
+ "user.communication_style": 2,
300
+ "user.workflow_patterns": 2,
301
+ "company.tools_config": 2,
302
+ "user.session_history": 2,
303
+ "none": 1
304
+ },
305
+ "multi_label_frequency": 1.0,
306
+ "avg_turns_per_conversation": 8.9,
307
+ "persistence_distribution": {
308
+ "mixed": 8,
309
+ "short": 1,
310
+ "long": 1
311
+ },
312
+ "scope_distribution": {
313
+ "mixed": 10
314
+ },
315
+ "warnings": []
316
+ }
317
+
318
+ === Processing Batch 6/10 ===
319
+ Generating item 1/10 (Category: company.knowledge_artifacts)...
320
+ Generated: company_performance_context_742
321
+ Sleeping for 10s to avoid rate limits...
322
+ Generating item 2/10 (Category: user.role_context)...
323
+ Generated: company.business_priorities_event_launch_coordination_452
324
+ Sleeping for 10s to avoid rate limits...
325
+ Generating item 3/10 (Category: user.role_context)...
326
+ Generated: company.business_priorities_developer_tools_142
327
+ Sleeping for 10s to avoid rate limits...
328
+ Generating item 4/10 (Category: company.brand_core)...
329
+ Generated: company_brand_core_742
330
+ Sleeping for 10s to avoid rate limits...
331
+ Generating item 5/10 (Category: company.business_priorities)...
332
+ Generated: company.strategic_signatures_fintech_147
333
+ Sleeping for 10s to avoid rate limits...
334
+ Generating item 6/10 (Category: company.strategic_signatures)...
335
+ Generated: company.business_priorities_partner_strategy_review_457
336
+ Sleeping for 10s to avoid rate limits...
337
+ Generating item 7/10 (Category: company.business_priorities)...
338
+ Generated: company.business_priorities_healthcare_compliance_123
339
+ Sleeping for 10s to avoid rate limits...
340
+ Generating item 8/10 (Category: none)...
341
+ Generated: company.strategic_signatures_hospitality_724
342
+ Sleeping for 10s to avoid rate limits...
343
+ Generating item 9/10 (Category: user.workflow_patterns)...
344
+ Conversation generation failed (attempt 1/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': '6b121e3dd2897f71e3b41c24deb7f246', 'date': 'Fri, 21 Nov 2025 21:34:38 GMT', 'x-envoy-upstream-service-time': '3', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': 'd6f09c3c-7117-4fde-81ae-070998fc069a', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
345
+ Retrying in 5s...
346
+ Conversation generation failed (attempt 2/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': '0347bae8c3cc9541daddc86a581ba123', 'date': 'Fri, 21 Nov 2025 21:34:43 GMT', 'x-envoy-upstream-service-time': '5', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': '76cde387-275f-4942-9c12-46096a26ef59', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
347
+ Retrying in 10s...
348
+ Conversation generation failed (attempt 3/3): headers: {'access-control-expose-headers': 'X-Debug-Trace-ID', 'cache-control': 'no-cache, no-store, no-transform, must-revalidate, private, max-age=0', 'content-encoding': 'gzip', 'content-type': 'application/json', 'expires': 'Thu, 01 Jan 1970 00:00:00 GMT', 'pragma': 'no-cache', 'vary': 'Origin,Accept-Encoding', 'x-accel-expires': '0', 'x-debug-trace-id': '06d09c4265780b1648480b4022c7b208', 'date': 'Fri, 21 Nov 2025 21:34:53 GMT', 'x-envoy-upstream-service-time': '6', 'server': 'envoy', 'via': '1.1 google', 'alt-svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000', 'transfer-encoding': 'chunked'}, status_code: 429, body: {'id': 'a8cbb61a-75f3-4faa-a11d-86640fe742cc', 'message': 'You are past the per-minute request limit for this model, please wait and try again later.'}
349
+ Failed to generate conversation for user.workflow_patterns
350
+ Sleeping for 10s to avoid rate limits...
351
+ Generating item 10/10 (Category: company.business_priorities)...
352
+ Generated: company.business_priorities_pilot_execution_142
353
+ Sleeping for 10s to avoid rate limits...
354
+ Saved batch to synthetic_data/batch_06.json
355
+ Validating batch...
356
+ {
357
+ "total_examples": 9,
358
+ "category_distribution": {
359
+ "company.performance_context": 3,
360
+ "company.knowledge_artifacts": 2,
361
+ "user.role_context": 9,
362
+ "user.strategic_approach": 2,
363
+ "company.business_priorities": 7,
364
+ "user.workflow_patterns": 4,
365
+ "company.brand_core": 1,
366
+ "company.strategic_signatures": 3,
367
+ "company.tools_config": 2
368
+ },
369
+ "multi_label_frequency": 1.0,
370
+ "avg_turns_per_conversation": 6.111111111111111,
371
+ "persistence_distribution": {
372
+ "mixed": 6,
373
+ "short": 3
374
+ },
375
+ "scope_distribution": {
376
+ "mixed": 9
377
+ },
378
+ "warnings": []
379
+ }
380
+
381
+ === Processing Batch 7/10 ===
382
+ Generating item 1/10 (Category: company.strategic_signatures)...
383
+ Generated: company_strategic_signatures_123
384
+ Sleeping for 10s to avoid rate limits...
385
+ Generating item 2/10 (Category: user.workflow_patterns)...
386
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
387
+ Retrying in 5s...
388
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
389
+ Retrying in 10s...
390
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
391
+ Failed to generate scenario for user.workflow_patterns
392
+ Generating item 3/10 (Category: none)...
393
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
394
+ Retrying in 5s...
395
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
396
+ Retrying in 10s...
397
+ Scenario generation failed (attempt 3/3): The read operation timed out
398
+ Failed to generate scenario for none
399
+ Generating item 4/10 (Category: none)...
400
+ Conversation generation failed (attempt 1/3): The read operation timed out
401
+ Retrying in 5s...
402
+ Conversation generation failed (attempt 2/3): The read operation timed out
403
+ Retrying in 10s...
404
+ Conversation generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
405
+ Failed to generate conversation for none
406
+ Sleeping for 10s to avoid rate limits...
407
+ Generating item 5/10 (Category: user.workflow_patterns)...
408
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
409
+ Retrying in 5s...
410
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
411
+ Retrying in 10s...
412
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
413
+ Failed to generate scenario for user.workflow_patterns
414
+ Generating item 6/10 (Category: company.strategic_signatures)...
415
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
416
+ Retrying in 5s...
417
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
418
+ Retrying in 10s...
419
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
420
+ Failed to generate scenario for company.strategic_signatures
421
+ Generating item 7/10 (Category: none)...
422
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
423
+ Retrying in 5s...
424
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
425
+ Retrying in 10s...
426
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
427
+ Failed to generate scenario for none
428
+ Generating item 8/10 (Category: user.strategic_approach)...
429
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
430
+ Retrying in 5s...
431
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
432
+ Retrying in 10s...
433
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
434
+ Failed to generate scenario for user.strategic_approach
435
+ Generating item 9/10 (Category: user.workflow_patterns)...
436
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
437
+ Retrying in 5s...
438
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
439
+ Retrying in 10s...
440
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
441
+ Failed to generate scenario for user.workflow_patterns
442
+ Generating item 10/10 (Category: user.strategic_approach)...
443
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
444
+ Retrying in 5s...
445
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
446
+ Retrying in 10s...
447
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
448
+ Failed to generate scenario for user.strategic_approach
449
+ Saved batch to synthetic_data/batch_07.json
450
+ Validating batch...
451
+ {
452
+ "total_examples": 1,
453
+ "category_distribution": {
454
+ "company.strategic_signatures": 2,
455
+ "company.business_priorities": 3,
456
+ "none": 1,
457
+ "user.workflow_patterns": 1
458
+ },
459
+ "multi_label_frequency": 1.0,
460
+ "avg_turns_per_conversation": 7.0,
461
+ "persistence_distribution": {
462
+ "mixed": 1
463
+ },
464
+ "scope_distribution": {
465
+ "mixed": 1
466
+ },
467
+ "warnings": []
468
+ }
469
+
470
+ === Processing Batch 8/10 ===
471
+ Generating item 1/10 (Category: none)...
472
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
473
+ Retrying in 5s...
474
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
475
+ Retrying in 10s...
476
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
477
+ Failed to generate scenario for none
478
+ Generating item 2/10 (Category: company.knowledge_artifacts)...
479
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
480
+ Retrying in 5s...
481
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
482
+ Retrying in 10s...
483
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
484
+ Failed to generate scenario for company.knowledge_artifacts
485
+ Generating item 3/10 (Category: user.session_history)...
486
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
487
+ Retrying in 5s...
488
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
489
+ Retrying in 10s...
490
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
491
+ Failed to generate scenario for user.session_history
492
+ Generating item 4/10 (Category: user.workflow_patterns)...
493
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
494
+ Retrying in 5s...
495
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
496
+ Retrying in 10s...
497
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
498
+ Failed to generate scenario for user.workflow_patterns
499
+ Generating item 5/10 (Category: company.knowledge_artifacts)...
500
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
501
+ Retrying in 5s...
502
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
503
+ Retrying in 10s...
504
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
505
+ Failed to generate scenario for company.knowledge_artifacts
506
+ Generating item 6/10 (Category: none)...
507
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
508
+ Retrying in 5s...
509
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
510
+ Retrying in 10s...
511
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
512
+ Failed to generate scenario for none
513
+ Generating item 7/10 (Category: company.knowledge_artifacts)...
514
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
515
+ Retrying in 5s...
516
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
517
+ Retrying in 10s...
518
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
519
+ Failed to generate scenario for company.knowledge_artifacts
520
+ Generating item 8/10 (Category: user.role_context)...
521
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
522
+ Retrying in 5s...
523
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
524
+ Retrying in 10s...
525
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
526
+ Failed to generate scenario for user.role_context
527
+ Generating item 9/10 (Category: user.session_history)...
528
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
529
+ Retrying in 5s...
530
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
531
+ Retrying in 10s...
532
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
533
+ Failed to generate scenario for user.session_history
534
+ Generating item 10/10 (Category: user.role_context)...
535
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
536
+ Retrying in 5s...
537
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
538
+ Retrying in 10s...
539
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
540
+ Failed to generate scenario for user.role_context
541
+ Saved batch to synthetic_data/batch_08.json
542
+ Validating batch...
543
+ {
544
+ "error": "Empty dataset"
545
+ }
546
+
547
+ === Processing Batch 9/10 ===
548
+ Generating item 1/10 (Category: company.business_priorities)...
549
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
550
+ Retrying in 5s...
551
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
552
+ Retrying in 10s...
553
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
554
+ Failed to generate scenario for company.business_priorities
555
+ Generating item 2/10 (Category: company.brand_core)...
556
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
557
+ Retrying in 5s...
558
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
559
+ Retrying in 10s...
560
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
561
+ Failed to generate scenario for company.brand_core
562
+ Generating item 3/10 (Category: user.communication_style)...
563
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
564
+ Retrying in 5s...
565
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
566
+ Retrying in 10s...
567
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
568
+ Failed to generate scenario for user.communication_style
569
+ Generating item 4/10 (Category: company.knowledge_artifacts)...
570
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
571
+ Retrying in 5s...
572
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
573
+ Retrying in 10s...
574
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
575
+ Failed to generate scenario for company.knowledge_artifacts
576
+ Generating item 5/10 (Category: user.communication_style)...
577
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
578
+ Retrying in 5s...
579
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
580
+ Retrying in 10s...
581
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
582
+ Failed to generate scenario for user.communication_style
583
+ Generating item 6/10 (Category: company.tools_config)...
584
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
585
+ Retrying in 5s...
586
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
587
+ Retrying in 10s...
588
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
589
+ Failed to generate scenario for company.tools_config
590
+ Generating item 7/10 (Category: none)...
591
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
592
+ Retrying in 5s...
593
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
594
+ Retrying in 10s...
595
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
596
+ Failed to generate scenario for none
597
+ Generating item 8/10 (Category: user.interaction_preferences)...
598
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
599
+ Retrying in 5s...
600
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
601
+ Retrying in 10s...
602
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
603
+ Failed to generate scenario for user.interaction_preferences
604
+ Generating item 9/10 (Category: none)...
605
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
606
+ Retrying in 5s...
607
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
608
+ Retrying in 10s...
609
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
610
+ Failed to generate scenario for none
611
+ Generating item 10/10 (Category: none)...
612
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
613
+ Retrying in 5s...
614
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
615
+ Retrying in 10s...
616
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
617
+ Failed to generate scenario for none
618
+ Saved batch to synthetic_data/batch_09.json
619
+ Validating batch...
620
+ {
621
+ "error": "Empty dataset"
622
+ }
623
+
624
+ === Processing Batch 10/10 ===
625
+ Generating item 1/10 (Category: none)...
626
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
627
+ Retrying in 5s...
628
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
629
+ Retrying in 10s...
630
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
631
+ Failed to generate scenario for none
632
+ Generating item 2/10 (Category: company.business_priorities)...
633
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
634
+ Retrying in 5s...
635
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
636
+ Retrying in 10s...
637
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
638
+ Failed to generate scenario for company.business_priorities
639
+ Generating item 3/10 (Category: user.strategic_approach)...
640
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
641
+ Retrying in 5s...
642
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
643
+ Retrying in 10s...
644
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
645
+ Failed to generate scenario for user.strategic_approach
646
+ Generating item 4/10 (Category: company.performance_context)...
647
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
648
+ Retrying in 5s...
649
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
650
+ Retrying in 10s...
651
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
652
+ Failed to generate scenario for company.performance_context
653
+ Generating item 5/10 (Category: company.strategic_signatures)...
654
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
655
+ Retrying in 5s...
656
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
657
+ Retrying in 10s...
658
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
659
+ Failed to generate scenario for company.strategic_signatures
660
+ Generating item 6/10 (Category: company.brand_core)...
661
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
662
+ Retrying in 5s...
663
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
664
+ Retrying in 10s...
665
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
666
+ Failed to generate scenario for company.brand_core
667
+ Generating item 7/10 (Category: company.strategic_signatures)...
668
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
669
+ Retrying in 5s...
670
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
671
+ Retrying in 10s...
672
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
673
+ Failed to generate scenario for company.strategic_signatures
674
+ Generating item 8/10 (Category: user.strategic_approach)...
675
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
676
+ Retrying in 5s...
677
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
678
+ Retrying in 10s...
679
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
680
+ Failed to generate scenario for user.strategic_approach
681
+ Generating item 9/10 (Category: company.brand_core)...
682
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
683
+ Retrying in 5s...
684
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
685
+ Retrying in 10s...
686
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
687
+ Failed to generate scenario for company.brand_core
688
+ Generating item 10/10 (Category: none)...
689
+ Scenario generation failed (attempt 1/3): [Errno 8] nodename nor servname provided, or not known
690
+ Retrying in 5s...
691
+ Scenario generation failed (attempt 2/3): [Errno 8] nodename nor servname provided, or not known
692
+ Retrying in 10s...
693
+ Scenario generation failed (attempt 3/3): [Errno 8] nodename nor servname provided, or not known
694
+ Failed to generate scenario for none
695
+ Saved batch to synthetic_data/batch_10.json
696
+ Validating batch...
697
+ {
698
+ "error": "Empty dataset"
699
+ }
700
+
701
+ Completed. Total items generated: 60
702
+ Full dataset saved to synthetic_data/all_generated_data_100.json
synthetic_data/generation_log_100_v2.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/generation_log_async.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/generation_log_final.txt ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/merged_training_dataset_2001.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/pipeline.py ADDED
@@ -0,0 +1,311 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import random
4
+ import asyncio
5
+ import time
6
+ import cohere
7
+ from typing import List, Dict, Any, Optional
8
+ from dotenv import load_dotenv
9
+
10
+ load_dotenv()
11
+
12
+ DOMAIN_CONTEXTS = [
13
+ "B2B SaaS workflow automation for enterprise teams",
14
+ "Consumer fintech budgeting assistant rolling out in LATAM",
15
+ "Healthcare patient engagement platform coordinating compliance content",
16
+ "Retail omnichannel loyalty program for a fashion brand",
17
+ "EdTech company designing AI tutoring playbooks",
18
+ "Hospitality chain redefining guest personalization across regions",
19
+ "Developer tools startup improving product-led growth motions",
20
+ "Sports media network negotiating sponsorship activations",
21
+ "Gaming studio planning live-ops launches",
22
+ "Non-profit fundraising platform balancing donor messaging",
23
+ "Enterprise cybersecurity firm running incident response playbooks",
24
+ "Supply-chain analytics platform optimizing vendor collaboration",
25
+ "CPG beverage brand planning seasonal launches with agencies",
26
+ "Real-estate marketplace coordinating broker enablement",
27
+ "Mobility/ride-hailing service planning driver communications",
28
+ "Streaming media company managing international content drops",
29
+ "Insurance carrier modernizing agent training workflows",
30
+ "Energy provider coordinating demand-response campaigns",
31
+ "Professional services firm standardizing proposal playbooks",
32
+ "AI infrastructure startup refining go-to-market with partners",
33
+ "Luxury beauty brand orchestrating influencer activations",
34
+ "Food delivery platform improving courier retention messaging",
35
+ "Corporate learning company updating compliance curricula",
36
+ "Outdoor gear company rolling out omnichannel retail pilots"
37
+ ]
38
+
39
+ class SyntheticDataPipeline:
40
+ def __init__(self, api_key: Optional[str] = None, max_retries: int = 5):
41
+ self.api_key = api_key or os.getenv("COHERE_API_KEY")
42
+ if not self.api_key:
43
+ raise ValueError("COHERE_API_KEY not found in environment variables")
44
+ self.client = cohere.ClientV2(api_key=self.api_key)
45
+ # Switched to command-r-plus-08-2024 due to rate limits on reasoning model
46
+ self.model = "command-r-plus-08-2024"
47
+ self.max_retries = max_retries
48
+
49
+ def _sample_domain_context(self) -> str:
50
+ return random.choice(DOMAIN_CONTEXTS)
51
+
52
+ @staticmethod
53
+ def _extract_text(response) -> Optional[str]:
54
+ """Extract the first text block from a Cohere response."""
55
+ if not response or not getattr(response, "message", None):
56
+ return None
57
+ blocks = getattr(response.message, "content", []) or []
58
+ for block in blocks:
59
+ text = getattr(block, "text", None)
60
+ if isinstance(text, str) and text.strip():
61
+ return text
62
+ return None
63
+
64
+ def generate_scenario_spec(self, category: str, distractor: Optional[str] = None,
65
+ persistence: str = "long", tone: str = "neutral",
66
+ turns: int = 6, special_reqs: str = "") -> Dict[str, Any]:
67
+ """Stage 1: Generate a scenario specification."""
68
+ domain_context = self._sample_domain_context()
69
+ midstream_note = "Conversation should start mid-thread (no greetings) and refer back to earlier collaboration."
70
+ diversity_note = "Keep subject matter aligned with the given domain context; avoid repeating eco/climate themes unless category demands it."
71
+ combined_reqs = " | ".join(filter(None, [special_reqs, midstream_note, diversity_note]))
72
+
73
+ if category == "none":
74
+ prompt = f"""Generate a JSON scenario specification for a conversation that has NO long-term memory value (Category: none).
75
+ The conversation should be strictly transactional, vague, or temporary.
76
+ Examples: checking status, scheduling a meeting, asking a clarification, greeting, small talk, or discussing weather/lunch.
77
+
78
+ CONTEXT: General professional setting. Do NOT include any strategic projects, specific brand details, or user preferences that would trigger memory storage.
79
+
80
+ Requirements:
81
+ - Primary Category: none
82
+ - Distractor Category: {distractor if distractor else "None"}
83
+ - Persistence Level: short
84
+ - Turn Count: {turns}
85
+ - Special Requirements: {combined_reqs}
86
+
87
+ Return a JSON object with:
88
+ {{
89
+ "scenario_description": "Brief narrative setup (2-3 sentences) - MUST BE NON-MEMORABLE",
90
+ "user_profile": "User role",
91
+ "key_signals_to_include": ["List of 2-4 signals that are specifically IRRELEVANT or TEMPORARY"],
92
+ "distractor_signals": ["Optional list of signals"],
93
+ "suggested_turn_breakdown": "Flow of conversation"
94
+ }}
95
+ """
96
+ else:
97
+ prompt = f"""You are designing training scenarios for an AI memory system in marketing context. Generate a scenario specification tailored to this business setting: {domain_context}.
98
+
99
+ Requirements:
100
+ - Primary Category: {category}
101
+ - Distractor Category: {distractor if distractor else "None"}
102
+ - Persistence Level: {persistence}
103
+ - Emotional Tone: {tone}
104
+ - Turn Count: {turns}
105
+ - Special Requirements: {combined_reqs}
106
+
107
+ Return a JSON object with:
108
+ {{
109
+ "scenario_description": "Brief narrative setup (2-3 sentences)",
110
+ "user_profile": "User role and context",
111
+ "key_signals_to_include": ["List of 2-4 specific memory-worthy signals"],
112
+ "distractor_signals": ["Optional list of noise/irrelevant info"],
113
+ "suggested_turn_breakdown": "How the conversation should flow"
114
+ }}
115
+ """
116
+
117
+ for attempt in range(self.max_retries + 1):
118
+ try:
119
+ response = self.client.chat(
120
+ messages=[{"role": "user", "content": prompt}],
121
+ temperature=0.7,
122
+ model=self.model,
123
+ response_format={"type": "json_object"}
124
+ )
125
+ content = self._extract_text(response)
126
+ if not content:
127
+ raise ValueError("No text content found in scenario response")
128
+
129
+ if content.startswith("```json"):
130
+ content = content[7:]
131
+ if content.endswith("```"):
132
+ content = content[:-3]
133
+ return json.loads(content.strip())
134
+ except Exception as e:
135
+ print(f"Scenario generation failed (attempt {attempt+1}/{self.max_retries+1}): {e}")
136
+ if attempt < self.max_retries:
137
+ sleep_time = 10 * (2 ** attempt)
138
+ print(f"Retrying in {sleep_time}s...")
139
+ time.sleep(sleep_time)
140
+ return {}
141
+
142
+ def generate_conversation(self, scenario_spec: Dict[str, Any], turn_count: int = 6, category: Optional[str] = None) -> Dict[str, Any]:
143
+ """Stage 2: Generate conversation based on scenario spec."""
144
+
145
+ domain_context = self._sample_domain_context()
146
+
147
+ # Detect if this is a NONE category scenario
148
+ is_none = category == "none" or (category is None and "none" in str(scenario_spec).lower())
149
+
150
+ if is_none:
151
+ prompt = f"""You are generating a realistic conversation between a user and an AI assistant.
152
+ The conversation should be transactional, casual, or vague. IT SHOULD NOT contain any significant long-term memory value for a marketing context.
153
+
154
+ CONTEXT: General professional setting.
155
+ SCENARIO SPECIFICATION:
156
+ {json.dumps(scenario_spec, indent=2)}
157
+
158
+ GENERATION RULES:
159
+ 1. Make it natural and fluid.
160
+ 2. DO NOT include detailed strategic plans, brand values, or user preferences.
161
+ 3. Focus on immediate tasks (scheduling, clarifications, small talk).
162
+ 4. Length: {turn_count} turns.
163
+ 5. Avoid opening pleasantries like "Hi" - start mid-thread if appropriate, or just dive in.
164
+
165
+ OUTPUT FORMAT:
166
+ Return a JSON object with:
167
+ {{
168
+ "scenario_id": "none_transactional_{{random_3_digit_number}}",
169
+ "conversation": [
170
+ {{"role": "user", "content": "..."}},
171
+ {{"role": "assistant", "content": "..."}}
172
+ ],
173
+ "labels": {{
174
+ "categories": ["none"],
175
+ "persistence_horizon": "short",
176
+ "memory_scope": "none",
177
+ "rationale": "Explanation why this is not memory-worthy"
178
+ }},
179
+ "metadata": {{
180
+ "scenario_type": "negative_example",
181
+ "primary_category": "none",
182
+ "distractor_present": false,
183
+ "turn_count": {turn_count},
184
+ "signals_present": []
185
+ }}
186
+ }}
187
+
188
+ CRITICAL: Respond with ONLY the JSON object.
189
+ """
190
+ else:
191
+ prompt = f"""You are generating realistic marketing conversations between a user and an AI marketing assistant. Generate natural dialogue that contains specific information worth storing in long-term memory. The conversation should start mid-thread (no greetings) and reference the ongoing initiative described below.
192
+
193
+ CONTEXT:
194
+ You will create a conversation that exemplifies certain memory categories while maintaining realism and natural flow. Assume this is part of {domain_context}.
195
+
196
+ SCENARIO SPECIFICATION:
197
+ {json.dumps(scenario_spec, indent=2)}
198
+
199
+ MEMORY TAXONOMY (for reference):
200
+ COMPANY MEMORY:
201
+ - company.brand_core: Voice, values, positioning, identity anchors (Persistence: Long >1y)
202
+ - company.strategic_signatures: Decision frameworks, strategic heuristics (Persistence: Long >1y)
203
+ - company.knowledge_artifacts: Docs, style guides, playbooks (Persistence: Long >1y)
204
+ - company.business_priorities: Quarterly/seasonal goals, active campaigns (Persistence: Short <3m)
205
+ - company.tools_config: Integrations, API keys, workflow settings (Persistence: Medium ~6m)
206
+ - company.performance_context: Campaign metrics, retrospectives, learnings (Persistence: Rolling ~6m)
207
+
208
+ USER MEMORY:
209
+ - user.communication_style: Tone, verbosity, format expectations (Persistence: Long >1y)
210
+ - user.strategic_approach: Personal priorities, success definitions (Persistence: Long >1y)
211
+ - user.role_context: Title, scope, decision authority (Persistence: Medium ~1y)
212
+ - user.workflow_patterns: Review cadence, collaboration norms (Persistence: Medium ~1y)
213
+ - user.session_history: Immediate context, recent asks (Persistence: Short <2w)
214
+ - user.interaction_preferences: Coaching style, feedback expectations (Persistence: Evolving)
215
+
216
+ SPECIAL:
217
+ - none: Irrelevant, vague, or transactional content
218
+
219
+ GENERATION RULES:
220
+ 1. Make conversations feel natural - include some filler, transitions, acknowledgments
221
+ 2. Embed memory-worthy information organically (don't make it too obvious)
222
+ 3. Include 1-2 utterances that should map to "none" for realism
223
+ 4. If multi-label scenario, ensure signals for both categories are present
224
+ 5. Length: {turn_count} turns (alternating user/assistant)
225
+ 6. Include specific, concrete details (not generic statements)
226
+ 7. For company.* categories: use "we", "our company", "our brand"
227
+ 8. For user.* categories: use "I prefer", "my approach", "I typically"
228
+ 9. Avoid opening pleasantries like "Hi" or "Hello"—jump straight into the ongoing topic.
229
+ 10. **CRITICAL CONSTRAINT**: Limit output to 1-3 categories maximum.
230
+ 11. **EXCLUSIVE NONE**: If "none" is in the categories list, it MUST be the ONLY category. NEVER mix "none" with other categories. If valid signals exist, do NOT include "none".
231
+
232
+ OUTPUT FORMAT:
233
+ Return a JSON object with:
234
+ {{
235
+ "scenario_id": "{{primary_category}}_{{scenario_type}}_{{random_3_digit_number}}",
236
+ "conversation": [
237
+ {{"role": "user", "content": "..."}},
238
+ {{"role": "assistant", "content": "..."}},
239
+ ...
240
+ ],
241
+ "labels": {{
242
+ "categories": ["array of applicable categories"],
243
+ "persistence_horizon": "long|medium|short",
244
+ "memory_scope": "company|user|mixed|none",
245
+ "rationale": "1-2 sentence explanation of category choices"
246
+ }},
247
+ "metadata": {{
248
+ "scenario_type": "descriptive_label",
249
+ "primary_category": "main_category",
250
+ "distractor_present": true|false,
251
+ "turn_count": integer,
252
+ "signals_present": ["list of specific signals included"]
253
+ }}
254
+ }}
255
+
256
+ CRITICAL: Respond with ONLY the JSON object. No markdown formatting, no explanation, no preamble.
257
+
258
+ Generate the conversation now."""
259
+
260
+ for attempt in range(self.max_retries + 1):
261
+ try:
262
+ response = self.client.chat(
263
+ messages=[{"role": "user", "content": prompt}],
264
+ temperature=0.7,
265
+ model=self.model,
266
+ response_format={"type": "json_object"}
267
+ )
268
+ content = self._extract_text(response)
269
+ if not content:
270
+ raise ValueError("No text content found in conversation response")
271
+
272
+ if content.startswith("```json"):
273
+ content = content[7:]
274
+ if content.endswith("```"):
275
+ content = content[:-3]
276
+ return json.loads(content.strip())
277
+ except Exception as e:
278
+ print(f"Conversation generation failed (attempt {attempt+1}/{self.max_retries+1}): {e}")
279
+ if attempt < self.max_retries:
280
+ sleep_time = 10 * (2 ** attempt)
281
+ print(f"Retrying in {sleep_time}s...")
282
+ time.sleep(sleep_time)
283
+ return {}
284
+
285
+ def run_batch(self, count: int = 1, category: str = "company.brand_core") -> List[Dict[str, Any]]:
286
+ """Run a batch generation."""
287
+ results = []
288
+ print(f"Starting batch generation for {count} examples of {category}...")
289
+
290
+ for i in range(count):
291
+ print(f"Generating example {i+1}/{count}...")
292
+ scenario = self.generate_scenario_spec(category=category)
293
+ if not scenario:
294
+ print("Skipping due to scenario generation failure")
295
+ continue
296
+
297
+ conversation = self.generate_conversation(scenario)
298
+ if conversation:
299
+ results.append(conversation)
300
+ print(f"Successfully generated conversation: {conversation.get('scenario_id', 'unknown')}")
301
+ else:
302
+ print("Failed to generate conversation")
303
+
304
+ return results
305
+
306
+ if __name__ == "__main__":
307
+ # Simple test run
308
+ pipeline = SyntheticDataPipeline()
309
+ results = pipeline.run_batch(count=1)
310
+ print(json.dumps(results, indent=2))
311
+
synthetic_data/quick_test_diverse.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quick test of diverse generation."""
2
+ import json
3
+ import random
4
+ import os
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+
8
+ import cohere
9
+
10
+ client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))
11
+
12
+ # Test one generation
13
+ category = "company.tools_config"
14
+ industry = "Series A fintech building a neobank"
15
+ persona = "a growth lead obsessed with metrics"
16
+ situation = "debugging why a campaign tanked"
17
+ tone = "frustrated"
18
+
19
+ prompt = f"""You are a world-class creative writer generating training data for an AI memory routing system.
20
+
21
+ Create a completely unique, realistic conversation between {persona} at a {industry} and their AI marketing assistant.
22
+
23
+ Context: They are {situation}. The tone is {tone}.
24
+
25
+ CATEGORY TO DEMONSTRATE: {category}
26
+ The conversation should involve tool setup, integrations, APIs, or workflow automation.
27
+
28
+ CREATIVE FREEDOM:
29
+ - Invent specific, realistic details (names, numbers, dates, products)
30
+ - The conversation can start anywhere - mid-thought, mid-project, mid-crisis
31
+ - Vary structure dramatically
32
+ - Include natural speech patterns
33
+ - Make it feel like eavesdropping on a real conversation
34
+
35
+ The ONLY hard requirement: the conversation must clearly demonstrate {category}.
36
+
37
+ Output as JSON:
38
+ {{"scenario_id": "unique_id", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["{category}"]}}, "metadata": {{"primary_category": "{category}", "industry": "{industry}"}}}}"""
39
+
40
+ print("Sending request...")
41
+ response = client.chat(
42
+ messages=[{"role": "user", "content": prompt}],
43
+ temperature=0.95,
44
+ model="command-r-plus-08-2024",
45
+ response_format={"type": "json_object"}
46
+ )
47
+
48
+ content = response.message.content[0].text
49
+ print("\n=== RAW RESPONSE ===")
50
+ print(content[:500])
51
+
52
+ data = json.loads(content)
53
+ print("\n=== PARSED ===")
54
+ print(f"Categories: {data.get('labels', {}).get('categories', [])}")
55
+ conv = data.get("conversation", [])
56
+ if conv:
57
+ for i, turn in enumerate(conv[:4]):
58
+ if isinstance(turn, dict):
59
+ print(f"\n[{turn.get('role', 'unknown')}]: {turn.get('content', '')[:150]}...")
60
+ else:
61
+ print(f"\n[turn {i}]: {str(turn)[:150]}...")
62
+
synthetic_data/run_balanced_async.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Balanced Dataset Generation with Concurrent API Calls
3
+
4
+ Generates 10 items simultaneously per batch for faster generation.
5
+ """
6
+
7
+ import json
8
+ import random
9
+ import time
10
+ import sys
11
+ import asyncio
12
+ import os
13
+ from typing import List, Dict, Any, Optional
14
+ from datetime import datetime
15
+ from concurrent.futures import ThreadPoolExecutor
16
+ import cohere
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
# Target counts per category (balanced)
# 13 categories x 77 examples each = 1001 examples overall.
CATEGORY_TARGETS = {
    "company.brand_core": 77,
    "company.strategic_signatures": 77,
    "company.knowledge_artifacts": 77,
    "company.business_priorities": 77,
    "company.tools_config": 77,
    "company.performance_context": 77,
    "user.communication_style": 77,
    "user.strategic_approach": 77,
    "user.role_context": 77,
    "user.workflow_patterns": 77,
    "user.session_history": 77,
    "user.interaction_preferences": 77,
    "none": 77,
}

# Short signal phrases per category; the generator embeds up to four of these
# in each prompt to steer the model toward the target label.
CATEGORY_EXAMPLES = {
    "company.brand_core": {
        "signals": ["brand voice is warm", "primary color is #2563EB", "never use jargon", "tagline is..."],
    },
    "company.strategic_signatures": {
        "signals": ["always prioritize retention", "80/20 rule", "never launch without testing"],
    },
    "company.knowledge_artifacts": {
        "signals": ["style guide says", "playbook recommends", "SOP for launches", "template includes"],
    },
    "company.business_priorities": {
        "signals": ["Q4 focus is", "this quarter's target", "holiday campaign", "prioritizing APAC"],
    },
    "company.tools_config": {
        "signals": ["Slack webhook URL", "HubSpot sync", "API key is", "Zapier integration"],
    },
    "company.performance_context": {
        "signals": ["24% open rate", "CTR improved by", "retrospective showed", "conversion dropped"],
    },
    "user.communication_style": {
        "signals": ["prefer bullet points", "keep it under 200 words", "casual tone", "data-driven"],
    },
    "user.strategic_approach": {
        "signals": ["prioritize speed over perfection", "test fast fail fast", "customer feedback"],
    },
    "user.role_context": {
        "signals": ["As VP of Marketing", "report to CMO", "budget authority up to", "manage team of"],
    },
    "user.workflow_patterns": {
        "signals": ["review drafts Monday", "don't send Friday", "async via Slack", "weekly sync Tuesday"],
    },
    "user.session_history": {
        "signals": ["as we discussed yesterday", "continuing from last", "proposal we started"],
    },
    "user.interaction_preferences": {
        "signals": ["push back on my ideas", "give me options", "be direct", "ask clarifying questions"],
    },
    "none": {
        "signals": ["what time is meeting", "checking status", "confirming receipt", "quick question"],
    },
}
79
+
80
+
81
class BalancedAsyncGenerator:
    """Generates balanced synthetic conversations via concurrent Cohere calls.

    Blocking SDK calls are fanned out onto a thread pool; ``generate_batch``
    awaits them from the event loop so up to 10 requests run in flight.
    """

    def __init__(self):
        # API key comes from the environment (loaded via dotenv at import time).
        self.api_key = os.getenv("COHERE_API_KEY")
        if not self.api_key:
            raise ValueError("COHERE_API_KEY not found")
        self.client = cohere.ClientV2(api_key=self.api_key)
        self.model = "command-r-plus-08-2024"
        # NOTE(review): the executor is never shut down explicitly — fine for
        # this one-shot script, but call .shutdown() if the class is reused.
        self.executor = ThreadPoolExecutor(max_workers=10)

    def _extract_text(self, response) -> Optional[str]:
        """Return the first non-blank text block of a chat response, or None."""
        if not response or not getattr(response, "message", None):
            return None
        blocks = getattr(response.message, "content", []) or []
        for block in blocks:
            text = getattr(block, "text", None)
            if isinstance(text, str) and text.strip():
                return text
        return None

    def _generate_sync(self, category: str) -> Optional[Dict]:
        """Synchronous generation for a single category.

        Returns the parsed example dict, or None on any failure: API error,
        unparsable JSON, or the target category missing from the labels.
        """
        signals = CATEGORY_EXAMPLES.get(category, {}).get("signals", [])
        signals_text = "\n".join(f"- {s}" for s in signals[:4])

        if category == "none":
            prompt = f"""Generate a marketing conversation with NO long-term memory value.
Transactional, vague, or temporary only. Examples: status check, scheduling, confirming.
4-6 turns, no greetings, start mid-conversation.

OUTPUT (JSON only):
{{"scenario_id": "none_{random.randint(100,999)}", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["none"], "persistence_horizon": "short", "memory_scope": "none", "rationale": "..."}}, "metadata": {{"primary_category": "none", "turn_count": 4}}}}"""
        else:
            prompt = f"""Generate a marketing conversation demonstrating: {category}

SIGNALS FOR THIS CATEGORY:
{signals_text}

REQUIREMENTS:
1. MUST contain clear signals for {category}
2. 4-6 turns, no greetings, start mid-conversation
3. Include specific details (names, numbers, dates)

CRITICAL: categories array MUST include "{category}"

OUTPUT (JSON only):
{{"scenario_id": "{category.replace('.', '_')}_{random.randint(100,999)}", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["{category}"], "persistence_horizon": "long", "memory_scope": "company", "rationale": "..."}}, "metadata": {{"primary_category": "{category}", "turn_count": 4}}}}"""

        try:
            response = self.client.chat(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                model=self.model,
                response_format={"type": "json_object"}
            )

            content = self._extract_text(response)
            if not content:
                return None

            # Strip markdown code fences the model sometimes adds.
            if content.startswith("```json"):
                content = content[7:]
            if content.endswith("```"):
                content = content[:-3]

            data = json.loads(content.strip())

            # Validate target category is present (case-insensitive).
            categories = data.get("labels", {}).get("categories", [])
            if category.lower() not in [c.lower() for c in categories]:
                return None

            # Clean: Remove "none" if other categories exist ("none" must be
            # exclusive per the labeling rules).
            if len(categories) > 1 and "none" in [c.lower() for c in categories]:
                data["labels"]["categories"] = [c for c in categories if c.lower() != "none"]

            return data

        except Exception:
            # Best-effort worker: any failure is reported as None and the
            # category is retried in a later batch. (The previously bound
            # exception variable was unused.)
            return None

    async def generate_batch(self, categories: List[str]) -> List[Dict]:
        """Generate a batch of items concurrently; failures are dropped."""
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()
        tasks = [
            loop.run_in_executor(self.executor, self._generate_sync, cat)
            for cat in categories
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Keep only successful dict results; exceptions and Nones are dropped.
        return [r for r in results if isinstance(r, dict)]
170
+
171
+
172
async def run_balanced_generation_async():
    """Run balanced generation with concurrent batches.

    Loops until every category in CATEGORY_TARGETS reaches its target count,
    generating up to 10 items per batch, appending accepted examples to a
    timestamped JSONL file as they arrive. Returns the output file path.
    """

    generator = BalancedAsyncGenerator()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"synthetic_data/balanced_dataset_{timestamp}.jsonl"

    # Track progress per category
    category_counts = {cat: 0 for cat in CATEGORY_TARGETS}
    all_data = []

    print("=" * 70, flush=True)
    print("BALANCED CONCURRENT DATASET GENERATION", flush=True)
    print("=" * 70, flush=True)
    # NOTE(review): the "77" in this banner is hard-coded; it should stay in
    # sync with the values in CATEGORY_TARGETS.
    print(f"Target per category: 77", flush=True)
    print(f"Total categories: {len(CATEGORY_TARGETS)}", flush=True)
    print(f"Expected total: {77 * len(CATEGORY_TARGETS)}", flush=True)
    print(f"Batch size: 10 concurrent requests", flush=True)
    print(flush=True)

    batch_num = 0

    while True:
        # Find categories that still need examples
        needed = []
        for cat, target in CATEGORY_TARGETS.items():
            remaining = target - category_counts[cat]
            needed.extend([cat] * min(remaining, 2))  # Up to 2 per category per batch

        if not needed:
            # Every category hit its target — generation is complete.
            break

        # Take up to 10 for this batch
        batch_categories = needed[:10]
        batch_num += 1

        print(f"\n[Batch {batch_num}] Generating {len(batch_categories)} items...", flush=True)

        results = await generator.generate_batch(batch_categories)

        # Process results
        for result in results:
            if result:
                # Prefer metadata's primary_category; fall back to the first label.
                primary = result.get("metadata", {}).get("primary_category") or \
                          result.get("labels", {}).get("categories", ["unknown"])[0]

                # NOTE(review): results whose primary category is not a known
                # target are silently discarded (not saved or counted).
                if primary in category_counts:
                    category_counts[primary] += 1
                    all_data.append(result)

                    # Save incrementally
                    with open(output_file, "a") as f:
                        f.write(json.dumps(result) + "\n")

        # Progress report
        total_done = sum(category_counts.values())
        total_target = sum(CATEGORY_TARGETS.values())
        print(f" Success: {len(results)}/{len(batch_categories)} | Total: {total_done}/{total_target}", flush=True)

        # Show category progress every 10 batches
        if batch_num % 10 == 0:
            print("\n Category Progress:", flush=True)
            for cat, count in sorted(category_counts.items()):
                target = CATEGORY_TARGETS[cat]
                # 20-character progress bar, filled proportionally to count/target.
                bar = "█" * (count * 20 // target) + "░" * (20 - count * 20 // target)
                print(f" {cat:<35} [{bar}] {count}/{target}", flush=True)

        # Rate limit: wait 3 seconds between batches
        await asyncio.sleep(3)

    # Final summary
    print("\n" + "=" * 70, flush=True)
    print("GENERATION COMPLETE", flush=True)
    print("=" * 70, flush=True)
    print(f"\nFinal Distribution:", flush=True)
    # Sort categories by descending count for the summary table.
    for cat, count in sorted(category_counts.items(), key=lambda x: -x[1]):
        pct = count / len(all_data) * 100 if all_data else 0
        print(f" {cat:<40} {count:>4} ({pct:.1f}%)", flush=True)

    print(f"\nTotal examples: {len(all_data)}", flush=True)
    print(f"Output file: {output_file}", flush=True)

    return output_file
255
+
256
+
257
+ if __name__ == "__main__":
258
+ asyncio.run(run_balanced_generation_async())
259
+
synthetic_data/run_balanced_generation.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Balanced Dataset Generation Script
3
+
4
+ This script generates a balanced training dataset with:
5
+ 1. STRICT category enforcement - the model MUST output the target category
6
+ 2. Equal distribution across all categories
7
+ 3. Improved prompts for underrepresented categories
8
+ """
9
+
10
+ import json
11
+ import random
12
+ import time
13
+ import sys
14
+ import asyncio
15
+ import os
16
+ from typing import List, Dict, Any, Optional
17
+ from datetime import datetime
18
+ import cohere
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+
23
# BALANCED DISTRIBUTION - Equal weight for all categories
# 13 categories x 80 examples each = 1040 target examples.
BALANCED_DISTRIBUTION = {
    "company.brand_core": 80,
    "company.strategic_signatures": 80,
    "company.knowledge_artifacts": 80,
    "company.business_priorities": 80,
    "company.tools_config": 80,
    "company.performance_context": 80,
    "user.communication_style": 80,
    "user.strategic_approach": 80,
    "user.role_context": 80,
    "user.workflow_patterns": 80,
    "user.session_history": 80,
    "user.interaction_preferences": 80,
    "none": 80,
}

# Category-specific examples and signals for better generation.
# Each entry carries a human-readable description, signal phrases to embed in
# prompts, and one example user utterance illustrating the category.
CATEGORY_EXAMPLES = {
    "company.brand_core": {
        "description": "Brand voice, values, positioning, visual identity, tone guidelines",
        "example_signals": [
            "Our brand voice is warm and conversational",
            "We always use sentence case for headlines",
            "Our primary color is #2563EB",
            "We never use corporate jargon",
            "Our tagline is 'Simplify Everything'"
        ],
        "example_conversation": "USER: Remember, our brand personality is 'friendly expert' - knowledgeable but approachable."
    },
    "company.strategic_signatures": {
        "description": "Decision frameworks, strategic heuristics, recurring patterns in how the company operates",
        "example_signals": [
            "We always prioritize retention over acquisition",
            "Our 80/20 rule: 80% proven tactics, 20% experiments",
            "We never launch without A/B testing",
            "Customer lifetime value drives all decisions"
        ],
        "example_conversation": "USER: Our strategic principle is 'land and expand' - start small with enterprises then grow."
    },
    "company.knowledge_artifacts": {
        "description": "Style guides, playbooks, SOPs, documented processes, templates",
        "example_signals": [
            "Here's our content style guide",
            "The campaign playbook says...",
            "According to our SOP for launches",
            "Our template for proposals includes..."
        ],
        "example_conversation": "USER: I'm attaching our updated brand guidelines PDF. Make sure all content follows section 3.2."
    },
    "company.business_priorities": {
        "description": "Quarterly goals, seasonal campaigns, current OKRs, active initiatives",
        "example_signals": [
            "Q4 focus is enterprise expansion",
            "This quarter's target is 500 MQLs",
            "Holiday campaign launches December 1st",
            "We're prioritizing APAC market this quarter"
        ],
        "example_conversation": "USER: For Q1, we're shifting focus entirely to the SMB segment. All campaigns should target companies under 100 employees."
    },
    "company.tools_config": {
        "description": "Integrations, API keys, workflow settings, tool configurations",
        "example_signals": [
            "The Slack webhook URL is...",
            "Configure HubSpot to sync with...",
            "The API key for analytics is...",
            "Set up the Zapier integration to..."
        ],
        "example_conversation": "USER: Here's the API key for our analytics dashboard: sk-xxx-123. Make sure it syncs every 6 hours."
    },
    "company.performance_context": {
        "description": "Campaign metrics, retrospectives, learnings, performance data",
        "example_signals": [
            "Last campaign had 24% open rate",
            "CTR improved by 15% after the redesign",
            "The retrospective showed we need more testing",
            "Conversion rate dropped after the price change"
        ],
        "example_conversation": "USER: The email campaign results are in: 28% open rate, 4.2% CTR. That's our best performance this year."
    },
    "user.communication_style": {
        "description": "Preferred tone, verbosity, format expectations, writing style",
        "example_signals": [
            "I prefer bullet points over paragraphs",
            "Keep responses under 200 words",
            "Use casual, friendly tone with me",
            "I like data-driven explanations"
        ],
        "example_conversation": "USER: Just so you know, I prefer concise bullet points. No need for lengthy explanations with me."
    },
    "user.strategic_approach": {
        "description": "Personal priorities, success definitions, decision-making style",
        "example_signals": [
            "I always prioritize speed over perfection",
            "My philosophy is test fast, fail fast",
            "I measure success by customer feedback",
            "I believe in data-driven decisions only"
        ],
        "example_conversation": "USER: My approach is always 'done is better than perfect'. I'd rather ship and iterate."
    },
    "user.role_context": {
        "description": "Title, scope, decision authority, reporting structure",
        "example_signals": [
            "As VP of Marketing, I approve all campaigns",
            "I report directly to the CMO",
            "My budget authority is up to $50k",
            "I manage a team of 12 marketers"
        ],
        "example_conversation": "USER: Just for context, I'm the Director of Growth and I have final say on all acquisition campaigns."
    },
    "user.workflow_patterns": {
        "description": "Review cadence, collaboration norms, meeting schedules",
        "example_signals": [
            "I review drafts every Monday morning",
            "Don't send me anything on Fridays",
            "I prefer async communication via Slack",
            "Weekly sync is Tuesdays at 2pm"
        ],
        "example_conversation": "USER: My review schedule is Monday mornings only. Anything sent Friday won't be seen until next week."
    },
    "user.session_history": {
        "description": "Immediate context, recent asks, current working session",
        "example_signals": [
            "As we discussed yesterday...",
            "Continuing from our last conversation",
            "The proposal we started earlier",
            "Following up on the draft you sent"
        ],
        "example_conversation": "USER: Let's pick up where we left off yesterday on the Johnson account proposal."
    },
    "user.interaction_preferences": {
        "description": "Coaching style, feedback expectations, collaboration preferences",
        "example_signals": [
            "I want you to push back on my ideas",
            "Give me options, not just one answer",
            "Be direct with feedback, don't sugarcoat",
            "I prefer you ask clarifying questions"
        ],
        "example_conversation": "USER: I want you to challenge my assumptions. If you think I'm wrong, tell me directly."
    },
    "none": {
        "description": "Transactional, vague, or temporary content with no memory value",
        "example_signals": [
            "What time is the meeting?",
            "Can you check the status?",
            "Just confirming receipt",
            "Quick question about the attachment"
        ],
        "example_conversation": "USER: Hey, what's the status on that thing we discussed? Just checking in."
    }
}
174
+
175
class BalancedDataGenerator:
    """Generates labeled synthetic conversations via the Cohere chat API.

    Each call to ``generate_for_category`` produces one conversation that is
    validated to actually contain the requested memory category before it is
    returned.
    """

    def __init__(self, api_key: Optional[str] = None):
        # Fall back to the environment so the script works unattended.
        self.api_key = api_key or os.getenv("COHERE_API_KEY")
        if not self.api_key:
            raise ValueError("COHERE_API_KEY not found")
        self.client = cohere.ClientV2(api_key=self.api_key)
        self.model = "command-r-plus-08-2024"

    def _extract_text(self, response) -> Optional[str]:
        """Return the first non-empty text block from a chat response, else None."""
        if not response or not getattr(response, "message", None):
            return None
        blocks = getattr(response.message, "content", []) or []
        for block in blocks:
            text = getattr(block, "text", None)
            if isinstance(text, str) and text.strip():
                return text
        return None

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Remove a surrounding Markdown code fence (``` or ```json) if present.

        Fix: the previous inline cleanup only removed a literal "```json"
        prefix, so a bare "```" opening fence leaked into json.loads and
        caused a parse failure (wasting a retry).
        """
        content = content.strip()
        if content.startswith("```"):
            # Drop the opening fence line, with or without a language tag.
            content = content.split("\n", 1)[1] if "\n" in content else ""
        if content.endswith("```"):
            content = content[:-3]
        return content.strip()

    def generate_for_category(self, category: str, max_retries: int = 3) -> Optional[Dict]:
        """Generate a conversation that MUST contain the specified category.

        Retries up to ``max_retries`` times when the API call fails, the
        output is not valid JSON, or the model's labels omit the target
        category. Returns the parsed dict on success, else None.
        """
        cat_info = CATEGORY_EXAMPLES.get(category, {})
        description = cat_info.get("description", category)
        example_signals = cat_info.get("example_signals", [])
        example_conv = cat_info.get("example_conversation", "")

        # Build a very specific prompt; "none" gets its own template because
        # it must demonstrate the *absence* of memory-worthy content.
        if category == "none":
            prompt = f"""Generate a realistic marketing conversation that has NO long-term memory value.

The conversation should be:
- Transactional (checking status, scheduling, confirming)
- Vague or generic (no specific details worth remembering)
- Temporary (only relevant for this moment)

Examples of "none" conversations:
- "What time is the meeting tomorrow?"
- "Just confirming you received the file"
- "Quick status check on the project"
- "Can you resend that link?"

Generate a 4-6 turn conversation between USER and ASSISTANT.
Start mid-conversation (no greetings).

OUTPUT FORMAT (JSON only):
{{
"scenario_id": "none_{random.randint(100,999)}",
"conversation": [
{{"role": "user", "content": "..."}},
{{"role": "assistant", "content": "..."}}
],
"labels": {{
"categories": ["none"],
"persistence_horizon": "short",
"memory_scope": "none",
"rationale": "This conversation is transactional/temporary with no memory value"
}},
"metadata": {{
"primary_category": "none",
"turn_count": 4
}}
}}"""
        else:
            prompt = f"""Generate a marketing conversation that clearly demonstrates the category: {category}

CATEGORY DEFINITION:
{description}

SIGNALS THAT INDICATE THIS CATEGORY:
{chr(10).join(f"- {s}" for s in example_signals[:4])}

EXAMPLE UTTERANCE:
{example_conv}

REQUIREMENTS:
1. The conversation MUST contain clear signals for {category}
2. The USER should explicitly state information that maps to this category
3. Make it natural and realistic - embed the signals organically
4. 4-6 turns, start mid-conversation (no greetings)
5. Include specific, concrete details (names, numbers, dates)

CRITICAL: The output categories array MUST include "{category}" as the primary category.
You may include 1 additional category if naturally present, but {category} MUST be there.

OUTPUT FORMAT (JSON only):
{{
"scenario_id": "{category.replace('.', '_')}_{random.randint(100,999)}",
"conversation": [
{{"role": "user", "content": "..."}},
{{"role": "assistant", "content": "..."}}
],
"labels": {{
"categories": ["{category}"],
"persistence_horizon": "long|medium|short",
"memory_scope": "company|user",
"rationale": "Explanation of why {category} applies"
}},
"metadata": {{
"primary_category": "{category}",
"turn_count": 4
}}
}}"""

        for attempt in range(max_retries):
            try:
                response = self.client.chat(
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.7,
                    model=self.model,
                    response_format={"type": "json_object"}
                )

                content = self._extract_text(response)
                if not content:
                    continue

                # Clean any surrounding Markdown fence before parsing.
                data = json.loads(self._strip_code_fence(content))

                # VALIDATE: ensure the target category is actually present.
                categories = data.get("labels", {}).get("categories", [])
                if category.lower() not in [c.lower() for c in categories]:
                    print(f" Warning: Target {category} not in output {categories}. Retrying...")
                    continue

                # Clean: "none" is mutually exclusive with real categories.
                if len(categories) > 1 and "none" in [c.lower() for c in categories]:
                    data["labels"]["categories"] = [c for c in categories if c.lower() != "none"]

                return data

            except Exception as e:
                # Linear backoff between retries to stay under rate limits.
                print(f" Attempt {attempt+1} failed: {e}")
                time.sleep(5 * (attempt + 1))

        return None
316
+
317
+
318
async def generate_balanced_dataset(output_dir: str = "synthetic_data", target_per_category: int = 80):
    """Generate a balanced dataset with an equal number of examples per category.

    Accepted examples are appended to a timestamped JSONL file as they arrive,
    and progress is mirrored into a plain-text log. Returns the output path.
    """
    os.makedirs(output_dir, exist_ok=True)
    gen = BalancedDataGenerator()

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"{output_dir}/balanced_dataset_{stamp}.jsonl"
    log_file = f"{output_dir}/balanced_generation_log_{stamp}.txt"

    records = []
    counts = dict.fromkeys(BALANCED_DISTRIBUTION, 0)

    banner = "=" * 70
    print(banner, flush=True)
    print("BALANCED DATASET GENERATION", flush=True)
    print(banner, flush=True)
    print(f"Target per category: {target_per_category}", flush=True)
    print(f"Total categories: {len(BALANCED_DISTRIBUTION)}", flush=True)
    print(f"Expected total: {target_per_category * len(BALANCED_DISTRIBUTION)}", flush=True)
    print(flush=True)

    with open(log_file, "w") as log:
        log.write(f"Balanced Generation Started: {stamp}\n")
        log.write(f"Target per category: {target_per_category}\n\n")

        for category in BALANCED_DISTRIBUTION:
            print(f"\n--- Generating {target_per_category} examples for: {category} ---", flush=True)
            log.write(f"\n=== {category} ===\n")
            log.flush()

            for idx in range(1, target_per_category + 1):
                item = gen.generate_for_category(category)

                if item is not None:
                    records.append(item)
                    counts[category] += 1

                    # Append immediately so a crash never loses finished work.
                    with open(output_file, "a") as out:
                        out.write(json.dumps(item) + "\n")

                    if idx % 10 == 0:
                        print(f" Progress: {idx}/{target_per_category}", flush=True)
                        log.write(f" {idx}/{target_per_category} complete\n")
                        log.flush()
                else:
                    print(f" Failed: {idx}", flush=True)
                    log.write(f" Failed to generate example {idx}\n")
                    log.flush()

                # Gentle pacing between API calls.
                await asyncio.sleep(0.5)

            print(f" Completed: {counts[category]}/{target_per_category}", flush=True)

    # Final summary, most-populated categories first.
    print("\n" + banner)
    print("GENERATION COMPLETE")
    print(banner)
    print(f"\nCategory Distribution:")
    for cat, n in sorted(counts.items(), key=lambda kv: -kv[1]):
        share = n / len(records) * 100 if records else 0
        print(f" {cat:<40} {n:>4} ({share:.1f}%)")

    print(f"\nTotal examples: {len(records)}")
    print(f"Output file: {output_file}")

    return output_file
386
+
387
+
388
if __name__ == "__main__":
    # Optional CLI argument: examples per category (default 80).
    args = sys.argv[1:]
    per_category = int(args[0]) if args else 80
    asyncio.run(generate_balanced_dataset(target_per_category=per_category))
391
+
synthetic_data/run_batch.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ import time
4
+ import sys
5
+ from typing import List, Dict, Any
6
+ from synthetic_data.pipeline import SyntheticDataPipeline
7
+ from synthetic_data.validate import validate_synthetic_data
8
+
9
# Sampling weights used with random.choices when picking a category per item.
# NOTE(review): these values sum to 1.10, not 1.00 — random.choices only uses
# them relatively, so sampling still works, but the numbers read as
# percentages; confirm the intended distribution.
CATEGORY_DISTRIBUTION = {
    "company.brand_core": 0.10,
    "company.strategic_signatures": 0.08,
    "company.knowledge_artifacts": 0.08,
    "company.business_priorities": 0.10,
    "company.tools_config": 0.07,
    "company.performance_context": 0.09,
    "user.communication_style": 0.10,
    "user.strategic_approach": 0.09,
    "user.role_context": 0.07,
    "user.workflow_patterns": 0.08,
    "user.session_history": 0.06,
    "user.interaction_preferences": 0.08,
    "none": 0.10
}
24
+
25
def run_pipeline_batches(total_items: int = 100, batch_size: int = 10):
    """Generate ``total_items`` conversations in validated, checkpointed batches.

    Categories are sampled from CATEGORY_DISTRIBUTION; each batch is saved to
    its own JSON file and validated before moving on.

    Fix over the previous version: ``total_items`` is now honored exactly.
    Floor division used to drop the remainder (105/10 produced only 100
    items) and over-generate when total_items < batch_size (5/10 produced 10).
    """
    pipeline = SyntheticDataPipeline()
    categories = list(CATEGORY_DISTRIBUTION.keys())
    weights = list(CATEGORY_DISTRIBUTION.values())

    all_data = []
    # Ceiling division so the remainder gets its own (smaller) final batch.
    num_batches = max(1, -(-total_items // batch_size))

    print(f"Starting generation of {total_items} items in {num_batches} batches (Size: {batch_size})...")

    for batch_num in range(1, num_batches + 1):
        print(f"\n=== Processing Batch {batch_num}/{num_batches} ===")
        batch_data = []
        # The last batch may be short when total_items % batch_size != 0.
        current_batch_size = min(batch_size, total_items - len(all_data))

        while len(batch_data) < current_batch_size:
            category = random.choices(categories, weights=weights, k=1)[0]
            current_count = len(batch_data) + 1
            print(f" Generating item {current_count}/{current_batch_size} (Category: {category})...")

            # Determine if we should add a distractor (30% chance).
            distractor = None
            if random.random() < 0.30 and category != "none":
                possible_distractors = [c for c in categories if c != category and c != "none"]
                if possible_distractors:
                    distractor = random.choice(possible_distractors)

            persistence = _get_persistence_for_category(category)
            turns = random.randint(4, 10)

            scenario = pipeline.generate_scenario_spec(
                category=category,
                distractor=distractor,
                persistence=persistence,
                turns=turns
            )

            if not scenario:
                # Back off before retrying this slot (rate-limit friendly).
                print(f" Failed to generate scenario for {category}. Retrying...")
                time.sleep(20)
                continue

            conversation = pipeline.generate_conversation(scenario, turn_count=turns)

            if conversation:
                batch_data.append(conversation)
                print(f" Generated: {conversation.get('scenario_id', 'Unknown ID')}")
            else:
                print(f" Failed to generate conversation for {category}. Retrying...")
                time.sleep(20)
                continue

            print(" Sleeping for 15s to avoid rate limits...")
            time.sleep(15)

        # Save batch checkpoint.
        batch_filename = f"synthetic_data/batch_{batch_num:02d}.json"
        with open(batch_filename, "w") as f:
            json.dump(batch_data, f, indent=2)
        print(f" Saved batch to {batch_filename}")

        # Validate batch before accepting it into the full dataset.
        print(" Validating batch...")
        metrics = validate_synthetic_data(batch_filename)
        print(json.dumps(metrics, indent=2))

        all_data.extend(batch_data)

    # Save the full dataset.
    with open("synthetic_data/all_generated_data_100.json", "w") as f:
        json.dump(all_data, f, indent=2)
    print(f"\nCompleted. Total items generated: {len(all_data)}")
    print("Full dataset saved to synthetic_data/all_generated_data_100.json")
97
+
98
+ def _get_persistence_for_category(category: str) -> str:
99
+ if "brand_core" in category or "strategic_signatures" in category or "knowledge_artifacts" in category or "communication_style" in category or "strategic_approach" in category:
100
+ return "long"
101
+ elif "tools_config" in category or "role_context" in category or "workflow_patterns" in category:
102
+ return "medium"
103
+ elif "business_priorities" in category or "session_history" in category:
104
+ return "short"
105
+ elif "performance_context" in category:
106
+ return "rolling"
107
+ elif "interaction_preferences" in category:
108
+ return "evolving"
109
+ elif "none" in category:
110
+ return "short"
111
+ return "medium"
112
+
113
if __name__ == "__main__":
    # CLI: [total_items] [batch_size], defaulting to 100 and 10.
    cli = sys.argv[1:]
    total = int(cli[0]) if len(cli) > 0 else 100
    batch = int(cli[1]) if len(cli) > 1 else 10
    run_pipeline_batches(total, batch)
synthetic_data/run_batch_async.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import json
import os
import random
import sys
import time
from typing import Any, Dict, List, Optional

from synthetic_data.pipeline import SyntheticDataPipeline
from synthetic_data.validate import validate_synthetic_data
from synthetic_data.clean_data import clean_datum
11
+
12
# Sampling weights for category selection (sum to 1.00). Grouped by tier:
# the most frequent categories first, then mid-tier, then the rarest.
CATEGORY_DISTRIBUTION = {
    "none": 0.15,
    "user.interaction_preferences": 0.12,
    "user.session_history": 0.10,
    "company.brand_core": 0.10,

    "company.strategic_signatures": 0.07,
    "company.knowledge_artifacts": 0.07,
    "user.communication_style": 0.07,
    "user.strategic_approach": 0.07,
    "user.workflow_patterns": 0.07,

    "company.tools_config": 0.05,
    "company.performance_context": 0.05,
    "company.business_priorities": 0.04,
    "user.role_context": 0.04
}
29
+
30
async def generate_single_item(pipeline: SyntheticDataPipeline, category: str, item_num: int) -> Optional[Dict[str, Any]]:
    """Generate one conversation item for ``category`` without blocking the loop.

    The pipeline's blocking API calls are dispatched to the default executor
    so several items can be generated concurrently. Returns the cleaned
    conversation dict, or None when scenario/conversation generation fails
    (hence the Optional return annotation — the old one claimed it always
    returned a dict).
    """
    print(f" Starting item {item_num} (Target: {category})...")

    # 30% of non-"none" items get a distractor category to make labels harder.
    categories = list(CATEGORY_DISTRIBUTION.keys())
    distractor = None
    if random.random() < 0.30 and category != "none":
        possible_distractors = [c for c in categories if c != category and c != "none"]
        if possible_distractors:
            distractor = random.choice(possible_distractors)

    persistence = _get_persistence_for_category(category)
    turns = random.randint(4, 10)

    # Fix: get_running_loop() is the correct call inside a coroutine;
    # get_event_loop() is deprecated in this context since Python 3.10.
    loop = asyncio.get_running_loop()
    scenario = await loop.run_in_executor(
        None,
        pipeline.generate_scenario_spec,
        category,
        distractor,
        persistence,
        "neutral",
        turns,
        ""
    )

    if not scenario:
        print(f" Failed item {item_num}: scenario generation failed")
        return None

    conversation = await loop.run_in_executor(
        None,
        pipeline.generate_conversation,
        scenario,
        turns,
        category
    )

    if not conversation:
        print(f" Failed item {item_num}: conversation generation failed")
        return None

    # Clean immediately so every saved item is already in its final form.
    cleaned_conversation = clean_datum(conversation)
    print(f" Completed item {item_num}: {cleaned_conversation.get('scenario_id', 'Unknown')}")
    return cleaned_conversation
79
+
80
async def generate_batch_concurrent(pipeline: SyntheticDataPipeline, batch_size: int, batch_num: int) -> List[Dict[str, Any]]:
    """Fill one batch by fanning out concurrent generation tasks.

    Keeps relaunching tasks for the shortfall until exactly ``batch_size``
    items have been produced, then returns them.
    """
    print(f"\n=== Processing Batch {batch_num} (Concurrent) ===")

    names = list(CATEGORY_DISTRIBUTION.keys())
    probs = list(CATEGORY_DISTRIBUTION.values())

    collected: List[Dict[str, Any]] = []
    shortfall = batch_size

    while shortfall > 0:
        # Draw a weighted category for every missing slot.
        picks = random.choices(names, weights=probs, k=shortfall)

        jobs = [
            generate_single_item(pipeline, cat, len(collected) + offset + 1)
            for offset, cat in enumerate(picks)
        ]

        print(f" Launch {shortfall} concurrent tasks...")
        outcomes = await asyncio.gather(*jobs, return_exceptions=True)

        # Keep successful items; report exceptions and drop failed (None) slots.
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                print(f" Task exception: {outcome}")
            elif outcome is not None:
                collected.append(outcome)

        shortfall = batch_size - len(collected)
        if shortfall > 0:
            print(f" Batch incomplete ({len(collected)}/{batch_size}). Retrying {shortfall} items in 5s...")
            await asyncio.sleep(5)

    print(f"Batch {batch_num} complete: {len(collected)}/{batch_size} items generated")
    return collected
119
+
120
async def run_pipeline_batches_async(total_items: int = 100, batch_size: int = 10):
    """Run the full pipeline with concurrent batch processing.

    Resumable: batches already present on disk are loaded instead of
    regenerated.

    Fix over the previous version: ``total_items`` is now honored exactly.
    Floor division used to drop the remainder (105/10 produced only 100
    items) and over-generate when total_items < batch_size (5/10 produced 10).
    """
    pipeline = SyntheticDataPipeline(max_retries=5)

    all_data = []
    # Ceiling division so the final partial batch is still generated.
    num_batches = max(1, -(-total_items // batch_size))

    print(f"Starting CONCURRENT generation of {total_items} items in {num_batches} batches...")
    print(f"Batch size: {batch_size} items (generated in parallel)")

    for batch_num in range(1, num_batches + 1):
        # Resume support: skip batches that were already generated.
        batch_filename = f"synthetic_data/batch_{batch_num:02d}.jsonl"
        if os.path.exists(batch_filename):
            print(f"Batch {batch_num} already exists ({batch_filename}). Skipping generation...")
            try:
                with open(batch_filename, 'r') as f:
                    for line in f:
                        if line.strip():
                            all_data.append(json.loads(line))
                print(f"Loaded {len(all_data)} items so far.")
                continue
            except Exception as e:
                # A corrupt checkpoint falls through to regeneration.
                print(f"Error reading existing batch {batch_num}: {e}. Regenerating...")

        # The last batch may be smaller than batch_size.
        current_batch_size = min(batch_size, total_items - (batch_num - 1) * batch_size)

        # Generate the entire batch concurrently.
        batch_data = await generate_batch_concurrent(pipeline, current_batch_size, batch_num)

        # Save batch as JSONL.
        with open(batch_filename, "w") as f:
            for item in batch_data:
                f.write(json.dumps(item) + "\n")
        print(f"Saved batch to {batch_filename}")

        # Validate batch before accepting it into the full dataset.
        print("Validating batch...")
        metrics = validate_synthetic_data(batch_filename)
        print(json.dumps(metrics, indent=2))

        all_data.extend(batch_data)

        # Brief pause between batches to stay under rate limits.
        if batch_num < num_batches:
            print("Waiting 5 seconds before next batch...")
            await asyncio.sleep(5)

    # Save the full dataset.
    output_file = f"synthetic_data/all_generated_data_{total_items}.jsonl"
    with open(output_file, "w") as f:
        for item in all_data:
            f.write(json.dumps(item) + "\n")

    print(f"\n{'='*60}")
    print(f"COMPLETED: {len(all_data)} items generated")
    print(f"Full dataset saved to {output_file}")
    print(f"{'='*60}")
177
+
178
+ def _get_persistence_for_category(category: str) -> str:
179
+ """Map category to its expected persistence level."""
180
+ if "brand_core" in category or "strategic_signatures" in category or "knowledge_artifacts" in category or "communication_style" in category or "strategic_approach" in category:
181
+ return "long"
182
+ elif "tools_config" in category or "role_context" in category or "workflow_patterns" in category:
183
+ return "medium"
184
+ elif "business_priorities" in category or "session_history" in category:
185
+ return "short"
186
+ elif "performance_context" in category:
187
+ return "rolling"
188
+ elif "interaction_preferences" in category:
189
+ return "evolving"
190
+ elif "none" in category:
191
+ return "short"
192
+ return "medium"
193
+
194
if __name__ == "__main__":
    # CLI: [total_items] [batch_size]; defaults are 100 and 10.
    argv = sys.argv[1:]
    total = int(argv[0]) if len(argv) > 0 else 100
    batch = int(argv[1]) if len(argv) > 1 else 10
    asyncio.run(run_pipeline_batches_async(total, batch))
synthetic_data/run_diverse_generation.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ World-Class Diverse Dataset Generation - 20 Concurrent API Calls per Batch
3
+
4
+ Key features:
5
+ - 20 API calls simultaneously per batch
6
+ - Wait for batch to complete, then next batch
7
+ - Temperature 0.95 for maximum diversity
8
+ - No templates, maximum creative freedom
9
+ """
10
+
11
+ import json
12
+ import random
13
+ import os
14
+ import asyncio
15
+ from typing import List, Dict, Optional
16
+ from datetime import datetime
17
+ from concurrent.futures import ThreadPoolExecutor
18
+ import cohere
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+
23
# Per-category generation quotas: 13 categories x 77 examples = 1001 total.
CATEGORY_TARGETS = {
    "company.brand_core": 77,
    "company.strategic_signatures": 77,
    "company.knowledge_artifacts": 77,
    "company.business_priorities": 77,
    "company.tools_config": 77,
    "company.performance_context": 77,
    "user.communication_style": 77,
    "user.strategic_approach": 77,
    "user.role_context": 77,
    "user.workflow_patterns": 77,
    "user.session_history": 77,
    "user.interaction_preferences": 77,
    "none": 77,
}

# Business-context seeds: one is sampled per prompt to force scenario variety.
INDUSTRIES = [
    "Series A fintech building a neobank", "hospital network digitizing patient intake",
    "DTC sneaker brand scaling to retail", "industrial valve manufacturer going digital",
    "K-12 tutoring platform expanding to Asia", "commercial real estate analytics startup",
    "ghost kitchen aggregator in NYC", "enterprise zero-trust security vendor",
    "luxury cruise line post-pandemic", "connected fitness hardware company",
    "immigration law firm automating visas", "recruiting platform for nurses",
    "pet insurance disruptor", "last-mile drone delivery startup",
    "indie game studio with a viral hit", "podcast network monetizing premium content",
    "EV charging network operator", "solar panel installer franchise",
    "modular home construction startup", "veterinary telehealth platform",
    "wine subscription service", "corporate wellness SaaS",
    "NFT marketplace pivoting to digital art", "AI code review tool for enterprises",
    "climate risk analytics for insurers", "restaurant POS system provider",
    "online therapy platform", "B2B payments infrastructure",
    "influencer marketing agency", "smart home security company"
]

# Speaker archetypes: one is sampled per prompt as the "user" voice.
PERSONAS = [
    "a stressed CMO preparing for board review",
    "a junior marketing coordinator on their first campaign",
    "a VP who just joined from a competitor",
    "a founder wearing multiple hats",
    "a seasoned brand director with 20 years experience",
    "a growth lead obsessed with metrics",
    "a creative director frustrated with process",
    "a demand gen manager under pressure to hit pipeline",
    "a content strategist building a new team",
    "a marketing ops person drowning in tools",
    "a product marketer launching next week",
    "an email specialist optimizing deliverability",
    "a social media manager handling a PR crisis",
    "a field marketer planning regional events",
    "a partner marketing lead negotiating co-marketing",
    "an analyst presenting attribution findings"
]

# Situational framing: gives each conversation a concrete moment in time.
SITUATIONS = [
    "in the middle of a heated planning session",
    "wrapping up a long day before vacation",
    "preparing for a last-minute executive ask",
    "debugging why a campaign tanked",
    "celebrating a successful launch",
    "onboarding after joining last week",
    "dealing with budget cuts",
    "scaling something that unexpectedly worked",
    "cleaning up a predecessor's mess",
    "trying to align with a difficult stakeholder"
]

# Emotional register sampled per conversation.
TONES = ["urgent", "casual", "frustrated", "excited", "methodical", "skeptical", "collaborative", "directive"]

# Per-category steering text appended to the generation prompt; tells the
# model what signal the conversation must surface for that label.
CATEGORY_HINTS = {
    "company.brand_core": "The conversation should naturally surface brand identity elements - could be voice, visuals, values, positioning, or personality.",
    "company.strategic_signatures": "The conversation should reveal how this company makes decisions - their frameworks, principles, or recurring patterns.",
    "company.knowledge_artifacts": "The conversation should reference internal documentation - guides, playbooks, templates, or SOPs.",
    "company.business_priorities": "The conversation should touch on current goals, quarterly targets, or active initiatives.",
    "company.tools_config": "The conversation should involve tool setup, integrations, APIs, or workflow automation.",
    "company.performance_context": "The conversation should discuss metrics, campaign results, or performance learnings.",
    "user.communication_style": "The user should express how they prefer to receive information - format, length, tone, or style.",
    "user.strategic_approach": "The user should reveal their personal philosophy, priorities, or decision-making style.",
    "user.role_context": "The user should mention their role, responsibilities, authority, or team structure.",
    "user.workflow_patterns": "The user should describe their schedule, review process, or collaboration preferences.",
    "user.session_history": "The conversation should reference recent context, ongoing work, or previous discussions.",
    "user.interaction_preferences": "The user should express how they want the AI to behave - proactivity, feedback style, or coaching level.",
    "none": "The conversation should be purely transactional with nothing worth remembering long-term."
}
106
+
107
+
108
class ConcurrentGenerator:
    """Generates diverse synthetic conversations with up to 20 concurrent calls.

    Blocking Cohere API calls are dispatched onto a thread pool so a whole
    batch of prompts runs in parallel from a single asyncio coroutine.
    """

    def __init__(self):
        self.api_key = os.getenv("COHERE_API_KEY")
        if not self.api_key:
            raise ValueError("COHERE_API_KEY not found")
        self.client = cohere.ClientV2(api_key=self.api_key)
        self.model = "command-r-plus-08-2024"
        # One worker per concurrent request in a batch.
        self.executor = ThreadPoolExecutor(max_workers=20)

    def _extract_text(self, response) -> Optional[str]:
        """Return the first non-empty text block from a chat response, else None."""
        if not response or not getattr(response, "message", None):
            return None
        blocks = getattr(response.message, "content", []) or []
        for block in blocks:
            text = getattr(block, "text", None)
            if isinstance(text, str) and text.strip():
                return text
        return None

    def _generate_one(self, category: str) -> Optional[Dict]:
        """Generate a single example with maximum creativity.

        Runs synchronously (called from the thread pool). Returns the parsed
        example dict, or None when the API call fails, the output is not
        valid JSON, or the target category is missing from the labels.
        """
        # Random scenario seeds keep the dataset diverse across calls.
        industry = random.choice(INDUSTRIES)
        persona = random.choice(PERSONAS)
        situation = random.choice(SITUATIONS)
        tone = random.choice(TONES)
        turns = random.randint(3, 10)
        hint = CATEGORY_HINTS.get(category, "")

        if category == "none":
            prompt = f"""You are a creative writer generating training data for an AI memory system.

Create a completely realistic conversation between {persona} at a {industry} and their AI marketing assistant.

Context: They are {situation}. The tone is {tone}.

THIS CONVERSATION MUST BE FORGETTABLE - nothing worth storing in long-term memory:
- Quick status checks, scheduling, or confirmations
- Vague questions without actionable details
- Chitchat or temporary context that expires immediately

Be creative. Make it feel real. No templates. Surprise me.

Output as JSON with this structure:
{{"scenario_id": "unique_id", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["none"], "persistence_horizon": "short", "memory_scope": "none", "rationale": "why this is unmemorable"}}, "metadata": {{"primary_category": "none", "turn_count": {turns}, "industry": "{industry}"}}}}"""

        else:
            prompt = f"""You are a world-class creative writer generating training data for an AI memory routing system.

Create a completely unique, realistic conversation between {persona} at a {industry} and their AI marketing assistant.

Context: They are {situation}. The tone is {tone}.

CATEGORY TO DEMONSTRATE: {category}
{hint}

CREATIVE FREEDOM:
- Invent specific, realistic details (names, numbers, dates, products)
- The conversation can start anywhere - mid-thought, mid-project, mid-crisis
- Vary structure dramatically - could be rapid-fire, could be detailed
- Include natural speech patterns, interruptions, tangents
- Make it feel like eavesdropping on a real conversation
- {turns} turns, but quality over quantity

The ONLY hard requirement: the conversation must clearly demonstrate {category}.

Output as JSON:
{{"scenario_id": "unique_id", "conversation": [{{"role": "user", "content": "..."}}, {{"role": "assistant", "content": "..."}}], "labels": {{"categories": ["{category}"], "persistence_horizon": "long/medium/short", "memory_scope": "{category.split('.')[0]}", "rationale": "why this fits {category}"}}, "metadata": {{"primary_category": "{category}", "turn_count": {turns}, "industry": "{industry}"}}}}"""

        try:
            response = self.client.chat(
                messages=[{"role": "user", "content": prompt}],
                temperature=0.95,
                model=self.model,
                response_format={"type": "json_object"}
            )

            content = self._extract_text(response)
            if not content:
                return None

            # Strip a surrounding Markdown code fence if the model added one.
            if content.startswith("```"):
                content = content.split("\n", 1)[1] if "\n" in content else content[3:]
            if content.endswith("```"):
                content = content[:-3]

            data = json.loads(content.strip())

            # Reject outputs that dropped the requested category.
            categories = data.get("labels", {}).get("categories", [])
            if category.lower() not in [c.lower() for c in categories]:
                return None

            # "none" is mutually exclusive with real categories.
            if len(categories) > 1 and "none" in [c.lower() for c in categories]:
                data["labels"]["categories"] = [c for c in categories if c.lower() != "none"]

            return data

        except Exception as e:
            # Fix: this previously swallowed every error silently, making API
            # failures indistinguishable from rejected outputs. Log briefly;
            # the caller treats None as a retryable miss either way.
            print(f" [{category}] generation error: {e}", flush=True)
            return None

    async def generate_batch_concurrent(self, categories: List[str]) -> List[Dict]:
        """Run one _generate_one call per category on the thread pool."""
        # Fix: get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in this context since Python 3.10.
        loop = asyncio.get_running_loop()
        tasks = [
            loop.run_in_executor(self.executor, self._generate_one, cat)
            for cat in categories
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        # Drop failures (None) and raised exceptions alike.
        return [r for r in results if isinstance(r, dict)]
217
+
218
+
219
async def run_generation():
    """Drive batched concurrent generation until every category hits its target.

    Each accepted example is appended to a timestamped JSONL file as soon as
    it arrives, so progress survives interruption. Returns the output path.
    """
    generator = ConcurrentGenerator()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"synthetic_data/diverse_dataset_{timestamp}.jsonl"

    category_counts = {cat: 0 for cat in CATEGORY_TARGETS}
    all_data = []

    print("=" * 70, flush=True)
    print("WORLD-CLASS DATASET GENERATION (20 Concurrent)", flush=True)
    print("=" * 70, flush=True)
    print(f"Batch size: 20 concurrent API calls", flush=True)
    print(f"Temperature: 0.95", flush=True)
    print(f"Target: 77 per category x 13 = 1001 total", flush=True)
    print(f"Output: {output_file}", flush=True)
    print("=" * 70, flush=True)

    batch_num = 0
    start_time = datetime.now()

    while True:
        # Build list of needed categories, capped at 3 slots per category per
        # batch so no single category monopolizes a batch.
        needed = []
        for cat, target in CATEGORY_TARGETS.items():
            remaining = target - category_counts[cat]
            if remaining > 0:
                needed.extend([cat] * min(remaining, 3))  # Up to 3 per category per batch

        # All quotas filled -> done.
        if not needed:
            break

        random.shuffle(needed)
        batch_categories = needed[:20]  # 20 concurrent
        batch_num += 1

        print(f"\n[Batch {batch_num}] Launching 20 concurrent requests...", flush=True)
        batch_start = datetime.now()

        results = await generator.generate_batch_concurrent(batch_categories)

        batch_time = (datetime.now() - batch_start).seconds

        for result in results:
            if result:
                # Prefer the declared primary category; fall back to the
                # first label if the metadata field is missing/empty.
                primary = result.get("metadata", {}).get("primary_category") or \
                          result.get("labels", {}).get("categories", ["unknown"])[0]

                # Only count and persist results with a recognized category.
                if primary in category_counts:
                    category_counts[primary] += 1
                    all_data.append(result)

                    # Append immediately so a crash never loses accepted work.
                    with open(output_file, "a") as f:
                        f.write(json.dumps(result) + "\n")

                    conv = result.get("conversation", [])
                    if conv and len(conv) > 0:
                        first_msg = conv[0].get("content", "") if isinstance(conv[0], dict) else str(conv[0])
                        print(f" [{primary}] {first_msg[:60]}...", flush=True)

        total_done = sum(category_counts.values())
        total_target = sum(CATEGORY_TARGETS.values())
        elapsed = (datetime.now() - start_time).seconds
        # rate is examples per minute, so eta below is in minutes; the
        # max() guards avoid division by zero early in the run.
        rate = total_done / max(elapsed, 1) * 60
        eta = (total_target - total_done) / max(rate, 0.1)

        print(f" Batch: {len(results)}/20 success in {batch_time}s | Total: {total_done}/{total_target} | Rate: {rate:.1f}/min | ETA: {eta:.0f}min", flush=True)

        # Progress breakdown every 10 batches.
        if batch_num % 10 == 0:
            print("\n === Category Breakdown ===", flush=True)
            for cat in sorted(category_counts.keys()):
                count = category_counts[cat]
                target = CATEGORY_TARGETS[cat]
                # 20-character progress bar filled proportionally to completion.
                bar = "█" * (count * 20 // target) + "░" * (20 - count * 20 // target)
                print(f" {cat:<35} [{bar}] {count:>3}/{target}", flush=True)
            print()

        # Wait 3 seconds between batches (rate-limit pacing).
        await asyncio.sleep(3)

    print("\n" + "=" * 70, flush=True)
    print("GENERATION COMPLETE", flush=True)
    print("=" * 70, flush=True)
    elapsed_total = (datetime.now() - start_time).seconds / 60
    print(f"Total: {len(all_data)} examples in {elapsed_total:.1f} minutes", flush=True)
    print(f"Output: {output_file}", flush=True)

    return output_file
307
+
308
+
309
if __name__ == "__main__":
    # Script entry point: run the full generation loop to completion.
    asyncio.run(run_generation())
synthetic_data/sample_batch.json ADDED
@@ -0,0 +1,841 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "scenario_id": "company.brand_core_strategy_789",
4
+ "conversation": [
5
+ {
6
+ "role": "user",
7
+ "content": "Building on our Q3 discussion about brand consistency\u2014how does the loyalty program reinforce 'Timeless Elegance' in digital touchpoints?"
8
+ },
9
+ {
10
+ "role": "assistant",
11
+ "content": "Previously, we aligned on using minimalist UI and exclusive content to reflect that. For loyalty, this could mean members-only digital lookbooks showcasing archival pieces. Absolutely, that's a key consideration when maintaining our brand's heritage digitally."
12
+ },
13
+ {
14
+ "role": "user",
15
+ "content": "How should we balance exclusivity with accessibility in rewards? Our urban professionals expect privilege but also practicality."
16
+ },
17
+ {
18
+ "role": "assistant",
19
+ "content": "A tiered rewards system would work here. Base tier offers practical perks like complimentary alterations and priority customer service. Meanwhile, top-tier members get exclusive experiences\u2014think private styling sessions or early access to limited-edition collaborations. That makes sense\u2014let me structure this."
20
+ },
21
+ {
22
+ "role": "user",
23
+ "content": "Will the AI style profiling integrate here? How do we avoid diluting the 'Modern Edge' promise?"
24
+ },
25
+ {
26
+ "role": "assistant",
27
+ "content": "Profile data could fuel hyper-personalized rewards\u2014for instance, offering avant-garde capsule collection access to members with edgier profiles. We'd track redemption metrics against brand alignment to ensure we're staying true to our identity. Also, we could include a feedback loop where members rate reward 'edginess' to refine future offerings."
28
+ }
29
+ ],
30
+ "labels": {
31
+ "categories": [
32
+ "company.brand_core",
33
+ "company.strategic_signatures",
34
+ "company.business_priorities",
35
+ "user.role_context",
36
+ "user.strategic_approach"
37
+ ],
38
+ "persistence_horizon": "mixed",
39
+ "memory_scope": "mixed",
40
+ "rationale": "Conversation centers on brand identity (company.brand_core) and strategic frameworks (company.strategic_signatures) for loyalty program design, while referencing active initiatives (business_priorities) and the user's strategic priorities (user.strategic_approach) and role scope (user.role_context)."
41
+ },
42
+ "metadata": {
43
+ "scenario_type": "loyalty_program_alignment",
44
+ "primary_category": "company.brand_core",
45
+ "distractor_present": false,
46
+ "turn_count": 6,
47
+ "signals_present": [
48
+ "Timeless Elegance with Modern Edge",
49
+ "Urban professionals (25-45) valuing curated exclusivity",
50
+ "Accessible Luxury Without Compromise",
51
+ "Personalization through AI-driven style profiling"
52
+ ]
53
+ }
54
+ },
55
+ {
56
+ "scenario_id": "company.brand_core_protocol_update_142",
57
+ "conversation": [
58
+ {
59
+ "role": "user",
60
+ "content": "Need to finalize the safety messaging section before the summit \u2013 should we integrate the new accident response protocols here? Legal\u2019s been pushing for clearer driver guidelines."
61
+ },
62
+ {
63
+ "role": "assistant",
64
+ "content": "Absolutely, building on our Project Compass foundation. The 'Safety First' pillar should absorb those protocols. Remember the crisis comms framework we locked in March? We can mirror that structure here for consistency. Just making sure we're not reinventing the wheel."
65
+ },
66
+ {
67
+ "role": "user",
68
+ "content": "Yes, but how does 'Community Trust' tie into driver conduct without stepping on HR\u2019s toes? Need a clear boundary there \u2013 don\u2019t want confusion between brand expectations and official policies."
69
+ },
70
+ {
71
+ "role": "assistant",
72
+ "content": "Understood. Per our May 12th brand guidelines review, 'Community Trust' specifically addresses passenger interaction norms \u2013 smiles, greetings, conflict de-escalation. I\u2019ll pull the approved driver conduct verbiage from that document. HR policies remain separate but we\u2019ll cross-reference them in footnotes."
73
+ },
74
+ {
75
+ "role": "user",
76
+ "content": "Good. And the newsletter template \u2013 legal wants all three pillars in every header. Confirm we\u2019re not just stuffing them in the footer anymore? Drivers need to see these front and center."
77
+ },
78
+ {
79
+ "role": "assistant",
80
+ "content": "Confirmed. Template v3.1, approved June 8th, embeds all pillars in the header with the 'Seamless Journeys' tagline. Footer contains compliance links. Bi-weekly revision cycle stays on track for July 15th summit distribution. Let me double-check that version for you."
81
+ }
82
+ ],
83
+ "labels": {
84
+ "categories": [
85
+ "company.brand_core",
86
+ "company.business_priorities",
87
+ "company.knowledge_artifacts",
88
+ "user.workflow_patterns"
89
+ ],
90
+ "persistence_horizon": "mixed",
91
+ "memory_scope": "mixed",
92
+ "rationale": "Brand pillars and Project Compass reference require long-term retention (brand_core). Summit deadline and template revisions are short-term priorities (business_priorities). Template versions and guidelines are knowledge artifacts (knowledge_artifacts). Bi-weekly cycles reflect user workflow patterns (user.workflow_patterns)."
93
+ },
94
+ "metadata": {
95
+ "scenario_type": "brand_protocol_update",
96
+ "primary_category": "company.brand_core",
97
+ "distractor_present": false,
98
+ "turn_count": 6,
99
+ "signals_present": [
100
+ "Three brand messaging pillars",
101
+ "Deadline for handbook update: Q3 2025 driver summit",
102
+ "Bi-weekly driver newsletter template revision",
103
+ "Project Compass branding initiative reference"
104
+ ]
105
+ }
106
+ },
107
+ {
108
+ "scenario_id": "company.brand_core_live_ops_strategy_142",
109
+ "conversation": [
110
+ {
111
+ "role": "user",
112
+ "content": "Building on what we discussed earlier about the teaser cadence\u2014I need to ensure the in-game dimensional rift narrative aligns with our brand voice when we drop the lore updates next week. Can't have the tone shifting between mysterious and... whatever the devs wrote last night."
113
+ },
114
+ {
115
+ "role": "assistant",
116
+ "content": "Absolutely. Remember how our 'community co-creation' principle during the Q3 campaign boosted engagement by 22%? That approachable authority tone with witty lore hooks worked perfectly. For this event, should we lean into the *playful challenge* aspect of our brand values while maintaining that witty, authoritative voice?"
117
+ },
118
+ {
119
+ "role": "user",
120
+ "content": "Yes, but the 'dimensional rift' concept needs to feel innovative\u2014our core value\u2014without alienating players. The narrative should invite exploration, not confusion. And the global audience means inclusivity is non-negotiable. Avoid cultural missteps in the lore metaphors."
121
+ },
122
+ {
123
+ "role": "assistant",
124
+ "content": "Understood. Since innovation is our north star here, we could frame the rift as a *collaborative discovery*\u2014using our 'player storytelling' mission\u2014rather than a threat. Tone-wise, think 'curious guide' instead of 'ominous warning'. I\u2019ll reference the brand voice guidelines from our knowledge artifacts for specific phrasing examples."
125
+ },
126
+ {
127
+ "role": "user",
128
+ "content": "Perfect. And make sure the tooltips use gender-neutral terms and avoid region-specific idioms. Oh, and the devs mentioned something about a 'rift meter' UI element\u2014need to ensure that terminology stays consistent with our style guide. Don\u2019t want another 'elixir of fortitude' fiasco."
129
+ },
130
+ {
131
+ "role": "assistant",
132
+ "content": "Noted. Here\u2019s your launch checklist synthesis: 1) Narrative framing as collaborative discovery (innovation + inclusivity), 2) Tone audit against brand voice guidelines, 3) UI terminology cross-check with style guide, 4) Player communication templates with witty/lore-rich hooks. Shall I route these to the content team via our playbook workflow?"
133
+ }
134
+ ],
135
+ "labels": {
136
+ "categories": [
137
+ "company.brand_core",
138
+ "company.performance_context",
139
+ "company.knowledge_artifacts",
140
+ "user.strategic_approach"
141
+ ],
142
+ "persistence_horizon": "mixed",
143
+ "memory_scope": "mixed",
144
+ "rationale": "Brand mission/values (company.brand_core) and past campaign metrics (company.performance_context) require long-term retention. Style guide references (company.knowledge_artifacts) are foundational. User's emphasis on inclusivity and global audience (user.strategic_approach) reflects personal priorities needing long persistence."
145
+ },
146
+ "metadata": {
147
+ "scenario_type": "brand_consistency_live_ops",
148
+ "primary_category": "company.brand_core",
149
+ "distractor_present": false,
150
+ "turn_count": 6,
151
+ "signals_present": [
152
+ "Studio mission",
153
+ "Core brand values",
154
+ "Brand voice guidelines",
155
+ "Q3 2025 campaign reference"
156
+ ]
157
+ }
158
+ },
159
+ {
160
+ "scenario_id": "company.strategic_signatures_ongoing_142",
161
+ "conversation": [
162
+ {
163
+ "role": "user",
164
+ "content": "Let\u2019s revisit the segmentation thresholds we discussed last quarter. The new data shows 20% of Gold members haven\u2019t engaged in 3 months. Should we adjust the recency threshold or introduce re-engagement campaigns?"
165
+ },
166
+ {
167
+ "role": "assistant",
168
+ "content": "Previously, we set Gold status at $500 annual spend with quarterly engagement. Given our 'accessible luxury' positioning, I\u2019d suggest targeted re-engagement first\u2014perhaps exclusive early access to capsule collections via app notifications. That reinforces premium value without diluting the tier\u2019s exclusivity."
169
+ },
170
+ {
171
+ "role": "user",
172
+ "content": "Right, but the retail team also wants to promote in-store pickups for online orders. Can we tie bonus points to that behavior? Maybe 100 points per pickup?"
173
+ },
174
+ {
175
+ "role": "assistant",
176
+ "content": "Strategically, that aligns with our omnichannel goals. We could frame it as 'Gold Concierge Pickup'\u2014offering bonus points *and* priority packaging. This maintains the premium experience while driving the behavior. Need to ensure our POS systems log these redemptions seamlessly alongside app tracking."
177
+ },
178
+ {
179
+ "role": "user",
180
+ "content": "Wait, how does this affect the points expiration policy we finalized in January? We can\u2019t have conflicting rules."
181
+ },
182
+ {
183
+ "role": "assistant",
184
+ "content": "The core 12-month expiration for base points remains, but we can exempt pickup-bonus points from expiration entirely. This incentivizes omnichannel engagement without penalizing active members. Remember, our segmentation model already weights recency heavily\u2014this keeps the system coherent."
185
+ }
186
+ ],
187
+ "labels": {
188
+ "categories": [
189
+ "company.brand_core",
190
+ "company.strategic_signatures",
191
+ "company.business_priorities",
192
+ "user.role_context",
193
+ "user.session_history"
194
+ ],
195
+ "persistence_horizon": "mixed",
196
+ "memory_scope": "mixed",
197
+ "rationale": "Brand positioning ('accessible luxury') and strategic frameworks (tiered thresholds) require long-term persistence, while expiration policies and campaign tactics are shorter-term priorities. User's role context (omnichannel integration) and reference to prior discussions reflect medium/long persistence."
198
+ },
199
+ "metadata": {
200
+ "scenario_type": "loyalty_program_optimization",
201
+ "primary_category": "company.strategic_signatures",
202
+ "distractor_present": false,
203
+ "turn_count": 6,
204
+ "signals_present": [
205
+ "Existing loyalty program's tiered rewards structure",
206
+ "Customer segmentation criteria",
207
+ "Omnichannel integration points",
208
+ "Brand-core value: 'Accessible luxury'"
209
+ ]
210
+ }
211
+ },
212
+ {
213
+ "scenario_id": "company.business_priorities_vendor_collaboration_742",
214
+ "conversation": [
215
+ {
216
+ "role": "user",
217
+ "content": "Building on the EcoVend pilot success\u2014how quickly can we scale real-time inventory APIs to top-tier partners? We need to lock in Q3 timelines."
218
+ },
219
+ {
220
+ "role": "assistant",
221
+ "content": "Absolutely, though we need to balance that with the Q3 deadline extension requests from partners like GreenLogistics. Remember, we pushed their integration to September 15th to address their ERP compatibility concerns. EcoVend's pilot showed promising uptime, but their feedback highlighted API latency spikes during peak hours."
222
+ },
223
+ {
224
+ "role": "user",
225
+ "content": "Right, what other feedback trends are we seeing from vendors? Any recurring pain points beyond latency?"
226
+ },
227
+ {
228
+ "role": "assistant",
229
+ "content": "EcoVend specifically flagged inconsistent data schema mappings causing 15-20% error rates in their warehouse module. Smaller vendors also mentioned documentation gaps in our developer portal. We've got the shared analytics dashboard prototype ready, but finance is still reviewing the $180K budget request for full development."
230
+ },
231
+ {
232
+ "role": "user",
233
+ "content": "Budget status is critical. Any movement there? And remind me\u2014what were the top action items from last quarter's summit that we need to address before scaling?"
234
+ },
235
+ {
236
+ "role": "assistant",
237
+ "content": "Budget remains pending, but we've got preliminary approval to use Q2 rollover funds for vendor testing. From the summit: 1) Establish vendor onboarding SLAs by July, 2) Implement quarterly API performance reviews, and 3) Finalize data-sharing compliance frameworks. Oh, and legal wants all new contracts to reference the dashboard proposal's IP terms."
238
+ },
239
+ {
240
+ "role": "user",
241
+ "content": "Got it. Does the dashboard proposal align with the contract terms we negotiated with Prime Logistics? Just making sure we're not missing any fine print here."
242
+ },
243
+ {
244
+ "role": "assistant",
245
+ "content": "Yes\u2014the proposal's data residency clauses match Prime's requirements, and the 99.5% uptime SLA is codified. We cross-referenced their contract during the prototype phase. Their CTO also signed off on the API rate limits last week."
246
+ },
247
+ {
248
+ "role": "user",
249
+ "content": "Perfect. Let's confirm adjusted timelines then. Push non-critical vendors to Q4 if needed, but GreenLogistics and EcoVend stay on track for September go-live?"
250
+ },
251
+ {
252
+ "role": "assistant",
253
+ "content": "Agreed. GreenLogistics' extended deadline is firm at 9/15, and EcoVend's full integration is set for 9/22. We'll need to prioritize their sandbox testing environments by mid-July. Oh, and the compliance team wants weekly syncs starting next month\u2014should we schedule that?"
254
+ },
255
+ {
256
+ "role": "user",
257
+ "content": "Yeah, put it on the calendar. And send me the summit action items doc again\u2014I\u2019ll review before next week\u2019s steering committee."
258
+ }
259
+ ],
260
+ "labels": {
261
+ "categories": [
262
+ "company.business_priorities",
263
+ "company.performance_context",
264
+ "company.knowledge_artifacts",
265
+ "company.tools_config",
266
+ "none"
267
+ ],
268
+ "persistence_horizon": "mixed",
269
+ "memory_scope": "mixed",
270
+ "rationale": "Business priorities (Q3 deadlines/budget), performance context (pilot feedback metrics), and knowledge artifacts (summit action items/contracts) dominate. Transactional elements (calendar requests) map to 'none'."
271
+ },
272
+ "metadata": {
273
+ "scenario_type": "strategic_planning",
274
+ "primary_category": "company.business_priorities",
275
+ "distractor_present": false,
276
+ "turn_count": 11,
277
+ "signals_present": [
278
+ "Q3 vendor integration deadline extension request from key partners",
279
+ "Pilot program feedback from EcoVend regarding real-time inventory APIs",
280
+ "Pending budget approval for shared analytics dashboard development",
281
+ "Action items from last quarter's vendor collaboration summit"
282
+ ]
283
+ }
284
+ },
285
+ {
286
+ "scenario_id": "company.knowledge_artifacts_collaborative_refinement_742",
287
+ "conversation": [
288
+ {
289
+ "role": "user",
290
+ "content": "Building on the brand voice guidelines we finalized last quarter, we need to adjust our financial services proposal templates. The current drafts feel too generic for this sector's compliance needs."
291
+ },
292
+ {
293
+ "role": "assistant",
294
+ "content": "Understood. Since we're refining sector-specific templates, could you clarify the client segmentation criteria? Previously, we discussed tiers based on asset size\u2014should these variations align with our strategic/growth/volume client definitions?"
295
+ },
296
+ {
297
+ "role": "user",
298
+ "content": "Exactly. Strategic clients (>$5B AUM) require deep customization, growth tier ($1B-$5B) needs balanced detail, and volume (<$1B) should prioritize efficiency. Also, all financial templates must embed our SEC compliance disclaimers by default."
299
+ },
300
+ {
301
+ "role": "assistant",
302
+ "content": "Got it. The segmentation logic matches our earlier brand architecture framework. I've drafted three template variations\u2014strategic version includes the full credentials matrix, growth streamlines case studies, and volume uses modular service snapshots. Shall I share these in Confluence?"
303
+ },
304
+ {
305
+ "role": "user",
306
+ "content": "Yes, but flag the methodology section in the strategic template. Our parent brand's 'integrated advisory approach' needs stronger emphasis there, while sub-brands should maintain their specialized positioning. That consistency is critical for global offices."
307
+ },
308
+ {
309
+ "role": "assistant",
310
+ "content": "Noted. I'll revise the methodology section to anchor in our core value proposition\u2014'Seamless expertise across regulatory landscapes'\u2014while preserving sub-brand flexibility. Next, I'll integrate the credential library with dynamic fields for compliance updates. Expect the refreshed drafts by EOD Friday?"
311
+ }
312
+ ],
313
+ "labels": {
314
+ "categories": [
315
+ "company.brand_core",
316
+ "company.knowledge_artifacts",
317
+ "company.strategic_signatures",
318
+ "user.communication_style",
319
+ "user.role_context"
320
+ ],
321
+ "persistence_horizon": "mixed",
322
+ "memory_scope": "mixed",
323
+ "rationale": "Company categories dominate (brand framework, knowledge artifacts, strategic compliance) with long-term persistence. User's role context and concise feedback style warrant medium-term storage."
324
+ },
325
+ "metadata": {
326
+ "scenario_type": "collaborative_refinement",
327
+ "primary_category": "company.knowledge_artifacts",
328
+ "distractor_present": true,
329
+ "turn_count": 6,
330
+ "signals_present": [
331
+ "Brand architecture framework (parent brand vs. sub-brands)",
332
+ "Core value proposition statements approved by leadership",
333
+ "Client segmentation criteria for template customization",
334
+ "Standardized boilerplate sections (e.g., methodology, credentials)"
335
+ ]
336
+ }
337
+ },
338
+ {
339
+ "scenario_id": "company.brand_core_brand_alignment_initiative_742",
340
+ "conversation": [
341
+ {
342
+ "role": "user",
343
+ "content": "Building on our earlier collaboration around brand voice guidelines, we need to ensure these principles permeate the AI's dialogue patterns in the revised playbooks. The team wants approachable expertise reflected in every interaction."
344
+ },
345
+ {
346
+ "role": "assistant",
347
+ "content": "Understood. We'll embed the voice guidelines into response templates and micro-interactions. Should we prioritize consistency over pedagogical flexibility in edge cases?"
348
+ },
349
+ {
350
+ "role": "user",
351
+ "content": "Our core value of 'Learner-Centric Adaptive Pathways' must drive content personalization. The AI needs to dynamically adjust difficulty based on engagement signals\u2014not just predefined metrics."
352
+ },
353
+ {
354
+ "role": "assistant",
355
+ "content": "Noted. Could you clarify how the 'Scaffolded Mastery Framework' operationalizes this? What specific implementation examples should we reference for incremental progression?"
356
+ },
357
+ {
358
+ "role": "user",
359
+ "content": "Right, the framework starts with foundational concept checks before introducing complexity. For example, in algebra modules, it begins with visual equation balancing before abstract variables. Each mastery gate requires 90% competency before advancing."
360
+ },
361
+ {
362
+ "role": "assistant",
363
+ "content": "Got it\u2014granular progression gates. How does this tie to our USP of 'personalized learning at scale without human tutor costs'? Should accessibility messaging emphasize cost efficiency?"
364
+ },
365
+ {
366
+ "role": "user",
367
+ "content": "Exactly. Position the adaptive pathways as democratizing high-quality education. Highlight that our AI delivers individualized pacing at a fraction of traditional tutoring expenses. That's our accessibility hook."
368
+ },
369
+ {
370
+ "role": "assistant",
371
+ "content": "Confirmed. All brand elements\u2014voice guidelines, adaptive principles, framework details, and USP positioning\u2014are now documented for persistent memory. Ready for playbook integration."
372
+ }
373
+ ],
374
+ "labels": {
375
+ "categories": [
376
+ "company.brand_core",
377
+ "company.strategic_signatures",
378
+ "company.knowledge_artifacts",
379
+ "user.role_context",
380
+ "user.strategic_approach"
381
+ ],
382
+ "persistence_horizon": "long",
383
+ "memory_scope": "mixed",
384
+ "rationale": "Brand values (company.brand_core), teaching methodology (company.strategic_signatures), and playbooks (company.knowledge_artifacts) require long-term storage. User's PM role (user.role_context) and focus on brand-pedagogy balance (user.strategic_approach) provide context for future interactions."
385
+ },
386
+ "metadata": {
387
+ "scenario_type": "brand_alignment_initiative",
388
+ "primary_category": "company.brand_core",
389
+ "distractor_present": false,
390
+ "turn_count": 6,
391
+ "signals_present": [
392
+ "Brand voice guidelines emphasizing approachable expertise",
393
+ "Core value: 'Learner-Centric Adaptive Pathways'",
394
+ "Proprietary teaching methodology: 'Scaffolded Mastery Framework'",
395
+ "Unique value proposition: 'Personalized learning at scale without human tutor costs'"
396
+ ]
397
+ }
398
+ },
399
+ {
400
+ "scenario_id": "company.brand_core_influencer_strategy_refinement_127",
401
+ "conversation": [
402
+ {
403
+ "role": "user",
404
+ "content": "I'm concerned our regional adaptations for Southeast Asia might dilute our heritage narrative. How do we scale 'Timeless Elegance' principles to markets like Tokyo without losing that three-generation formulation expertise story?"
405
+ },
406
+ {
407
+ "role": "assistant",
408
+ "content": "Absolutely, that tension is real. Remember how our Parisian activation maintained 98% visual adherence while incorporating local artisans? We could apply similar 'craftsmanship story modules'\u2014pre-approved heritage highlights paired with localized rituals. This kept our core identity intact while resonating regionally."
409
+ },
410
+ {
411
+ "role": "user",
412
+ "content": "For Tokyo specifically, what concrete vetting criteria should we use for micro-influencers? We need more than just follower counts."
413
+ },
414
+ {
415
+ "role": "assistant",
416
+ "content": "Agreed. Prioritize creators with \u226570% affluent beauty connoisseur audiences, verified by our partner platform. Require content proposals to weave in formulation history\u2014like highlighting our cold-process extraction methods. We\u2019ll also mandate our new visual compliance toolkit for 95% guideline adherence tracking."
417
+ },
418
+ {
419
+ "role": "user",
420
+ "content": "The compliance toolkit is non-negotiable. We\u2019ll need persistent tracking of that 95% threshold across all activations. Also, remind me\u2014where\u2019s the central repository for approved heritage assets?"
421
+ },
422
+ {
423
+ "role": "assistant",
424
+ "content": "Understood. We\u2019ll bake compliance dashboards into every campaign brief. Approved assets live in the Brand Legacy Vault under 'Formulation Archives'. To summarize our strategy: 1) Heritage modules for local adaptation, 2) Strict audience/content vetting, 3) Real-time compliance tracking\u2014all anchored in our 'uncompromising craftsmanship' pillar. Shall I document this in the playbook?"
425
+ }
426
+ ],
427
+ "labels": {
428
+ "categories": [
429
+ "company.brand_core",
430
+ "company.strategic_signatures",
431
+ "company.performance_context",
432
+ "company.business_priorities",
433
+ "company.knowledge_artifacts"
434
+ ],
435
+ "persistence_horizon": "mixed",
436
+ "memory_scope": "mixed",
437
+ "rationale": "Brand heritage principles (company.brand_core) and influencer criteria (company.strategic_signatures) require long-term persistence, while compliance metrics (company.performance_context) and campaign tools (company.business_priorities) are shorter-term. Playbook documentation (company.knowledge_artifacts) bridges both."
438
+ },
439
+ "metadata": {
440
+ "scenario_type": "influencer_strategy_refinement",
441
+ "primary_category": "company.brand_core",
442
+ "distractor_present": true,
443
+ "turn_count": 6,
444
+ "signals_present": [
445
+ "Brand pillar: 'Uncompromising artisanal craftsmanship'",
446
+ "Mandatory influencer criterion: 70%+ audience alignment with 'affluent beauty connoisseurs' segment",
447
+ "Campaign consistency metric: 95% adherence to visual identity guidelines",
448
+ "Heritage narrative: 'Three-generation family formulation expertise'"
449
+ ]
450
+ }
451
+ },
452
+ {
453
+ "scenario_id": "company_brand_core_742",
454
+ "conversation": [
455
+ {
456
+ "role": "user",
457
+ "content": "Pull up the Q3 vendor reliability data \u2013 I need to compare it against the Project Helix baseline from Q2. Specifically, how are on-time deliveries trending?"
458
+ },
459
+ {
460
+ "role": "assistant",
461
+ "content": "Against the Q2 Project Helix benchmarks, Q3 shows a 12% increase in delivery delays with Vendor Alpha. Their on-time rate dropped to 88% versus the 95% we achieved under Helix. Vendors Beta and Gamma are holding steady at 94% and 97% respectively. Want me to drill into Alpha's specific pain points?"
462
+ },
463
+ {
464
+ "role": "user",
465
+ "content": "Yes, but first \u2013 how does this 12-point gap directly impact our 'Reliability' brand pillar? And remind me of their transparency performance from our earlier discussions."
466
+ },
467
+ {
468
+ "role": "assistant",
469
+ "content": "Right, the delivery consistency feeds directly into customer trust for our demand-response programs. On transparency, Vendor Alpha scored 68% in Q2's communication benchmarks \u2013 remember the delayed shipment notifications we flagged? Their predictive ETAs were off by 3.2 days on average. That historical pattern suggests systemic issues beyond just Q3 logistics."
470
+ },
471
+ {
472
+ "role": "user",
473
+ "content": "Exactly. We need to revise the scoring model to weight predictive analytics heavier than historical averages. Let's emphasize real-time tracking data for Q4 \u2013 I'll draft a proposal. Also, confirm our next sync date."
474
+ },
475
+ {
476
+ "role": "assistant",
477
+ "content": "Got it. To recap: 1) Revise vendor scoring with 40% weight on predictive analytics, 2) Formalize real-time tracking requirements by Friday, 3) Maintain bi-weekly syncs \u2013 next on October 12th. This builds on our Q2 transparency benchmarks and aligns with the 'Reliability' pillar's 95% on-time target. Anything else before I distribute the summary?"
478
+ }
479
+ ],
480
+ "labels": {
481
+ "categories": [
482
+ "company.brand_core",
483
+ "company.knowledge_artifacts",
484
+ "company.performance_context",
485
+ "user.workflow_patterns"
486
+ ],
487
+ "persistence_horizon": "mixed",
488
+ "memory_scope": "mixed",
489
+ "rationale": "Brand pillars (company.brand_core) and Project Helix model (company.knowledge_artifacts) require long-term retention. Vendor metrics (company.performance_context) need rolling context. Bi-weekly syncs (user.workflow_patterns) reflect medium-term user patterns."
490
+ },
491
+ "metadata": {
492
+ "scenario_type": "collaborative_initiative",
493
+ "primary_category": "company.brand_core",
494
+ "distractor_present": false,
495
+ "turn_count": 6,
496
+ "signals_present": [
497
+ "Project Helix vendor scoring model",
498
+ "Reliability brand pillar tied to on-time delivery",
499
+ "Vendor transparency benchmarks from Q2",
500
+ "Bi-weekly sync commitment"
501
+ ]
502
+ }
503
+ },
504
+ {
505
+ "scenario_id": "company_strategic_signatures_742",
506
+ "conversation": [
507
+ {
508
+ "role": "user",
509
+ "content": "Double-check the Japan visual assets against Project Phoenix specs\u2014especially the hero banner gradients. Legal\u2019s nervous about cultural missteps."
510
+ },
511
+ {
512
+ "role": "assistant",
513
+ "content": "Confirmed compliant with Project Phoenix 2.1 guidelines, but the crimson-to-gold gradient in Frame 3 deviates from our approved palette by 12%. Should we flag this for revision?"
514
+ },
515
+ {
516
+ "role": "user",
517
+ "content": "Right, the Mexico campaign last quarter bent the rules on typography but killed it locally. Does Japan get similar flexibility under the Unified Brand Pillars?"
518
+ },
519
+ {
520
+ "role": "assistant",
521
+ "content": "Version 4.2 of the Pillars allows regional adaptations if metrics justify\u2014Mexico\u2019s engagement lift was +28%. For Japan, we\u2019d need comparable data to justify the gradient shift. I\u2019ll attach the flexibility clause matrix."
522
+ },
523
+ {
524
+ "role": "user",
525
+ "content": "Brazil team\u2019s proposed tagline tweak for Episode 4 feels\u2026 edgy. What\u2019s the escalation path if they push back on our feedback?"
526
+ },
527
+ {
528
+ "role": "assistant",
529
+ "content": "Trigger Crimson Protocol for high-stakes drops. I\u2019ll draft a memo to Legal and Regional Leads citing Phoenix parameters, then schedule a sync within 24 hours. We\u2019ve used this workflow for 3 prior drops this quarter."
530
+ }
531
+ ],
532
+ "labels": {
533
+ "categories": [
534
+ "company.brand_core",
535
+ "company.strategic_signatures",
536
+ "company.knowledge_artifacts",
537
+ "company.business_priorities",
538
+ "user.role_context",
539
+ "user.workflow_patterns"
540
+ ],
541
+ "persistence_horizon": "mixed",
542
+ "memory_scope": "mixed",
543
+ "rationale": "Project Phoenix/Unified Brand Pillars (long-term artifacts), Crimson Protocol (strategic signature), and regional adaptation rules (brand core) require persistent storage. Campaign metrics and escalation workflows reflect evolving priorities and user-specific patterns."
544
+ },
545
+ "metadata": {
546
+ "scenario_type": "brand_consistency_planning",
547
+ "primary_category": "company.strategic_signatures",
548
+ "distractor_present": false,
549
+ "turn_count": 6,
550
+ "signals_present": [
551
+ "Project Phoenix",
552
+ "Unified Brand Pillars",
553
+ "Crimson Protocol",
554
+ "cultural adaptation balance"
555
+ ]
556
+ }
557
+ },
558
+ {
559
+ "scenario_id": "company.brand_core_demand_response_456",
560
+ "conversation": [
561
+ {
562
+ "role": "user",
563
+ "content": "We need to reinforce our 'Reliability First' positioning in the new demand-response collateral for commercial clients. How can we thread that through the value prop without overshadowing cost savings?"
564
+ },
565
+ {
566
+ "role": "assistant",
567
+ "content": "Absolutely. Remember how we structured the 'PeakPartner' framework last quarter? That balanced operational benefits with financial incentives. We could adapt that approach here\u2014lead with grid stability as the foundational promise, then layer in cost efficiencies as a natural outcome of our reliability."
568
+ },
569
+ {
570
+ "role": "user",
571
+ "content": "Makes sense. But commercial audiences will want proof points. How do we quantify 'reliability' beyond just stating it as a value?"
572
+ },
573
+ {
574
+ "role": "assistant",
575
+ "content": "Our grid stability metrics are perfect for this. We've maintained 99.8% uptime across commercial nodes since 2022\u2014that's industry-leading. We should feature that prominently, maybe with a regional comparison chart to contextualize the advantage."
576
+ },
577
+ {
578
+ "role": "user",
579
+ "content": "Uptime stats are good, but competitors can claim similar numbers. What technical validation can we add to differentiate?"
580
+ },
581
+ {
582
+ "role": "assistant",
583
+ "content": "Our proprietary 'EnergyPulse' algorithm is key here. It's what enables predictive load balancing with 92% accuracy. We could highlight how this technology actively prevents disruptions\u2014tying directly back to the 'Reliability First' promise while showcasing our innovation edge."
584
+ }
585
+ ],
586
+ "labels": {
587
+ "categories": [
588
+ "company.brand_core",
589
+ "company.strategic_signatures",
590
+ "company.performance_context",
591
+ "company.knowledge_artifacts"
592
+ ],
593
+ "persistence_horizon": "mixed",
594
+ "memory_scope": "company",
595
+ "rationale": "Brand pillar ('Reliability First') and strategic framework ('PeakPartner') represent enduring company identity and decision heuristics. Grid metrics and proprietary tech are performance assets and knowledge artifacts with longer-term value."
596
+ },
597
+ "metadata": {
598
+ "scenario_type": "demand_response_campaign",
599
+ "primary_category": "company.brand_core",
600
+ "distractor_present": false,
601
+ "turn_count": 6,
602
+ "signals_present": [
603
+ "Company's 'Reliability First' brand pillar",
604
+ "Reference to 'PeakPartner' campaign",
605
+ "99.8% grid uptime metric",
606
+ "'EnergyPulse' forecasting algorithm"
607
+ ]
608
+ }
609
+ },
610
+ {
611
+ "scenario_id": "company.brand_core_collaboration_147",
612
+ "conversation": [
613
+ {
614
+ "role": "user",
615
+ "content": "Wait, before we dive into the onboarding flows, could you recap our core brand principles? I want to ensure the new campaign messaging stays consistent with what we established last quarter."
616
+ },
617
+ {
618
+ "role": "assistant",
619
+ "content": "Absolutely. Our brand mission remains 'Empower developers to build without boundaries'. The core value proposition centers on seamless integration reducing toolchain complexity by 40%, directly addressing developer frustration with fragmented debugging workflows. Our reliability pillar\u2014'Uncompromising reliability for high-stakes coding environments'\u2014should anchor all technical narratives."
620
+ },
621
+ {
622
+ "role": "user",
623
+ "content": "Right, that reliability piece feels crucial for the hackathon sponsorship we're considering. How can we embed these principles into that activation without feeling forced?"
624
+ },
625
+ {
626
+ "role": "assistant",
627
+ "content": "For the hackathon, we could structure challenges around 'building without boundaries'\u2014like cross-platform integration tasks. And reliability messaging should live in the swag: think high-quality hardware tools with 'Uncompromising reliability' engraved. That reinforces the pillar tangibly."
628
+ },
629
+ {
630
+ "role": "user",
631
+ "content": "That makes sense. Now for the onboarding flows\u2014show me how the brand voice would manifest in actual copy. Something developers would find authentic, not marketing-heavy."
632
+ },
633
+ {
634
+ "role": "assistant",
635
+ "content": "Here's a neutral-toned example for the integration setup step: 'Connect your existing toolchain in 3 steps. Our unified API layer handles the heavy lifting\u2014so you can focus on code, not configuration. Designed for 99.99% uptime in production environments.' It emphasizes seamlessness and reliability without hyperbole."
636
+ },
637
+ {
638
+ "role": "user",
639
+ "content": "Perfect. Let's lock that direction in. Oh, and I'll need the final copy by EOD Friday\u2014my team's reviewing everything before the sprint starts. Almost forgot, did you see the new competitor benchmarks? Their onboarding takes 45 minutes..."
640
+ }
641
+ ],
642
+ "labels": {
643
+ "categories": [
644
+ "company.brand_core",
645
+ "company.strategic_signatures",
646
+ "company.knowledge_artifacts",
647
+ "user.workflow_patterns",
648
+ "none"
649
+ ],
650
+ "persistence_horizon": "mixed",
651
+ "memory_scope": "mixed",
652
+ "rationale": "Brand mission/UVP/reliability pillar (company.brand_core) and hackathon strategy (company.strategic_signatures) require long-term storage. Onboarding copy examples (company.knowledge_artifacts) and user's review deadline (user.workflow_patterns) are medium/short-term. Competitor mention is irrelevant (none)."
653
+ },
654
+ "metadata": {
655
+ "scenario_type": "brand_strategy_collaboration",
656
+ "primary_category": "company.brand_core",
657
+ "distractor_present": false,
658
+ "turn_count": 7,
659
+ "signals_present": [
660
+ "Empower developers to build without boundaries",
661
+ "Seamless integration reduces toolchain complexity by 40%",
662
+ "Developer frustration with fragmented debugging workflows",
663
+ "Uncompromising reliability for high-stakes coding environments"
664
+ ]
665
+ }
666
+ },
667
+ {
668
+ "scenario_id": "company.brand_core_messaging_alignment_123",
669
+ "conversation": [
670
+ {
671
+ "role": "user",
672
+ "content": "So revisiting that 'skills gap' framing we discussed earlier\u2014I think we need to sharpen it further. The board loves the direction but wants to ensure we're not diluting our core message when we talk about underserved communities."
673
+ },
674
+ {
675
+ "role": "assistant",
676
+ "content": "Understood. Building on our previous emphasis on measurable outcomes, perhaps we could reframe around specific skill acquisition rates? For instance, 'Every $50 donation funds 20 hours of certified job training'\u2014this keeps us tied to practical empowerment while addressing the gap quantifiably."
677
+ },
678
+ {
679
+ "role": "user",
680
+ "content": "That's solid. What about this test slogan: 'Building futures, one green job at a time'? It feels modern and hopeful. Wait, does 'green' pose any issues?"
681
+ },
682
+ {
683
+ "role": "assistant",
684
+ "content": "Careful with 'green'\u2014it risks drifting toward environmental themes, which we agreed to avoid. Our brand pillar is 'empowerment through practical skills', not sustainability. Let's pivot to something like 'Building futures through skilled hands' to maintain vocational focus."
685
+ },
686
+ {
687
+ "role": "user",
688
+ "content": "Got it. How should we differentiate this for corporate CSR teams versus individual donors? They consume messaging so differently."
689
+ },
690
+ {
691
+ "role": "assistant",
692
+ "content": "For CSR departments, lead with workforce pipeline impact: 'Reduce local skills shortages by partnering with SkillBridge'. Individuals respond better to human-scale outcomes\u2014like our Q1 campaign highlighting how Maria's welding certification led to a $15k salary increase. Remember, our community impact metrics showed 40% higher engagement when we tied donations to specific skill outcomes."
693
+ },
694
+ {
695
+ "role": "user",
696
+ "content": "Perfect. Let's lock in that direction. And maybe include the metric about 92% job placement within 6 months\u2014it's our strongest proof point."
697
+ },
698
+ {
699
+ "role": "assistant",
700
+ "content": "Confirmed. Final messaging will emphasize practical skills empowerment, avoid environmental references, and segment messaging using proven impact metrics. I'll draft the revised brief accordingly."
701
+ }
702
+ ],
703
+ "labels": {
704
+ "categories": [
705
+ "company.brand_core",
706
+ "company.strategic_signatures",
707
+ "company.performance_context",
708
+ "user.strategic_approach",
709
+ "user.session_history"
710
+ ],
711
+ "persistence_horizon": "mixed",
712
+ "memory_scope": "mixed",
713
+ "rationale": "Brand core (long-term) and strategic signatures (long-term) are central to messaging alignment. Performance context (rolling) supports metric-based claims. User's strategic approach (long-term) and session history (short-term) reflect their focus on measurable outcomes and ongoing discussion continuity."
714
+ },
715
+ "metadata": {
716
+ "scenario_type": "messaging_refinement",
717
+ "primary_category": "company.brand_core",
718
+ "distractor_present": false,
719
+ "turn_count": 8,
720
+ "signals_present": [
721
+ "Core brand pillar: 'Empowerment through practical skills'",
722
+ "Target donor segments: individual philanthropists (ages 35-65) and CSR departments",
723
+ "Key messaging constraint: avoid referencing climate/environmental themes",
724
+ "Previous collaboration note: established 'community impact metrics' as primary success indicator"
725
+ ]
726
+ }
727
+ },
728
+ {
729
+ "scenario_id": "company.brand_core_campaign_456",
730
+ "conversation": [
731
+ {
732
+ "role": "user",
733
+ "content": "Wait, building on our earlier discussion about the 'community trust' pillar\u2014how specifically should we articulate safety in driver emails without sounding redundant? We need concrete actions, not just slogans."
734
+ },
735
+ {
736
+ "role": "assistant",
737
+ "content": "Understood. Since safety is non-negotiable for our brand, we could highlight initiatives like mandatory quarterly safety training refreshers and the new collision detection feature in the driver app. These demonstrate proactive investment in driver well-being, reinforcing our core value. Does that align with the playbook framework we approved last month?"
738
+ },
739
+ {
740
+ "role": "user",
741
+ "content": "Perfect. Now shifting to reliability messaging for high-volume drivers\u2014they're obsessed with consistent earnings. How do we tie that to brand values without overpromising?"
742
+ },
743
+ {
744
+ "role": "assistant",
745
+ "content": "For that segment, we should emphasize our dynamic pricing algorithms ensuring stable earnings during peak hours, plus the guaranteed hourly minimums rolling out in Q3. This connects reliability directly to their livelihood concerns. Remember, the phased rollout starts with Tier 1 cities in July, so we'll need geo-targeted email digests."
746
+ },
747
+ {
748
+ "role": "user",
749
+ "content": "Got it. But how do we operationalize 'community' without leaning on eco-initiatives? Tenured drivers especially might see that as superficial."
750
+ },
751
+ {
752
+ "role": "assistant",
753
+ "content": "Valid point. Instead, we're proposing peer-to-peer recognition programs where drivers nominate colleagues for 'Reliability Hero' awards, featured in hub materials. This builds organic community while reinforcing our pillars. The nomination criteria will live in the updated driver playbook."
754
+ },
755
+ {
756
+ "role": "user",
757
+ "content": "Right. And just to confirm\u2014we're still locked into the Q3 timeline? Creative teams are stressing about physical hub material production lead times."
758
+ },
759
+ {
760
+ "role": "assistant",
761
+ "content": "Yes, the phased schedule holds: in-app notifications launch July 15, emails follow August 1, and hub materials hit 20+ cities by September. We'll prioritize digital-first for tenured drivers since they engage more with the app, per our segmentation strategy."
762
+ }
763
+ ],
764
+ "labels": {
765
+ "categories": [
766
+ "company.brand_core",
767
+ "company.strategic_signatures",
768
+ "company.business_priorities",
769
+ "company.knowledge_artifacts",
770
+ "user.role_context"
771
+ ],
772
+ "persistence_horizon": "mixed",
773
+ "memory_scope": "mixed",
774
+ "rationale": "Brand pillars (safety/reliability) and strategic frameworks (campaign pillars) require long-term company memory. Q3 timelines and playbook updates are short-term business priorities. The user's focus on segmentation aligns with their role context."
775
+ },
776
+ "metadata": {
777
+ "scenario_type": "proposal_standardization",
778
+ "primary_category": "company.brand_core",
779
+ "distractor_present": false,
780
+ "turn_count": 8,
781
+ "signals_present": [
782
+ "safety/reliability pillars",
783
+ "Q3 phased rollout",
784
+ "driver segmentation",
785
+ "approved channels"
786
+ ]
787
+ }
788
+ },
789
+ {
790
+ "scenario_id": "company.brand_core_proposal_template_refinement_142",
791
+ "conversation": [
792
+ {
793
+ "role": "user",
794
+ "content": "Let's tighten the executive summary section. We need to emphasize our Trust pillar more explicitly\u2014reference the Q2 healthcare proposal's opening structure but align it with financial services regulations. Keep it concise."
795
+ },
796
+ {
797
+ "role": "assistant",
798
+ "content": "Understood. I'll mirror the healthcare proposal's three-sentence hook structure but reframe it around financial compliance. Our brand's Trust pillar language from the guidelines will anchor the messaging. Drafting now..."
799
+ },
800
+ {
801
+ "role": "user",
802
+ "content": "Good start, but the value proposition paragraph feels generic. Bake in our regulatory compliance expertise more boldly\u2014that's the client's primary pain point. And watch the tone; it's leaning too formal for 'confident yet approachable'."
803
+ },
804
+ {
805
+ "role": "assistant",
806
+ "content": "Adjusting tone to match our approved voice guidelines. I've sharpened the compliance angle: 'Navigating complex regulations requires more than expertise\u2014it demands proactive partnership.' Does that resonate?"
807
+ },
808
+ {
809
+ "role": "user",
810
+ "content": "Better. Now integrate the Partnership pillar into the methodology section without diluting the financial focus. Use bullet points like we did in the healthcare template's Phase 2 breakdown."
811
+ },
812
+ {
813
+ "role": "assistant",
814
+ "content": "Integrated Partnership pillar through co-creation workshops and governance frameworks. Maintained financial specificity with examples like 'joint regulatory strategy sessions'. Final draft attached\u2014ready for your approval queue?"
815
+ }
816
+ ],
817
+ "labels": {
818
+ "categories": [
819
+ "company.brand_core",
820
+ "company.knowledge_artifacts",
821
+ "company.business_priorities",
822
+ "user.communication_style"
823
+ ],
824
+ "persistence_horizon": "mixed",
825
+ "memory_scope": "mixed",
826
+ "rationale": "Brand pillars (Trust/Partnership) and voice guidelines fall under company.brand_core (long persistence). Proposal templates reference company.knowledge_artifacts (long). Client-specific compliance focus ties to company.business_priorities (short). User's tone feedback touches user.communication_style (long)."
827
+ },
828
+ "metadata": {
829
+ "scenario_type": "proposal_template_refinement",
830
+ "primary_category": "company.brand_core",
831
+ "distractor_present": false,
832
+ "turn_count": 6,
833
+ "signals_present": [
834
+ "Firm's three-pillar brand framework (Trust, Innovation, Partnership)",
835
+ "Client-specific value proposition: 'Regulatory compliance expertise'",
836
+ "Approved brand voice guideline: 'Confident yet approachable'",
837
+ "Previous collaboration reference: Q2 2025 healthcare proposal structure"
838
+ ]
839
+ }
840
+ }
841
+ ]
synthetic_data/test_balanced.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick test of balanced generation for underrepresented categories."""

import json
import os

from dotenv import load_dotenv

load_dotenv()

import cohere

client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))

# Categories that were underrepresented in earlier generation runs.
test_categories = ["company.tools_config", "company.knowledge_artifacts", "none"]


def _build_prompt(category: str) -> str:
    """Return the generation prompt for *category* ('none' asks for a no-memory chat)."""
    if category == "none":
        return """Generate a marketing conversation that has NO long-term memory value.

The conversation should be transactional, vague, or temporary.
Examples: checking status, scheduling, confirming receipt.

Generate 4 turns. Start mid-conversation (no greetings).

OUTPUT FORMAT (JSON only):
{
"scenario_id": "none_001",
"conversation": [
{"role": "user", "content": "..."},
{"role": "assistant", "content": "..."}
],
"labels": {
"categories": ["none"],
"rationale": "..."
}
}"""
    return f"""Generate a marketing conversation that clearly demonstrates: {category}

The conversation MUST contain clear signals for this category.
4-6 turns, start mid-conversation (no greetings).

CRITICAL: The categories array MUST include "{category}".

OUTPUT FORMAT (JSON only):
{{
"scenario_id": "{category.replace('.', '_')}_001",
"conversation": [
{{"role": "user", "content": "..."}},
{{"role": "assistant", "content": "..."}}
],
"labels": {{
"categories": ["{category}"],
"rationale": "..."
}}
}}"""


for category in test_categories:
    print("\n" + "=" * 60)
    print(f"Testing: {category}")
    print("=" * 60)

    try:
        reply = client.chat(
            messages=[{"role": "user", "content": _build_prompt(category)}],
            temperature=0.7,
            model="command-r-plus-08-2024",
            response_format={"type": "json_object"},
        )

        parsed = json.loads(reply.message.content[0].text)

        # Did the model actually label the conversation with the target category?
        got = parsed.get("labels", {}).get("categories", [])
        print(f"Target: {category}")
        print(f"Output: {got}")
        print(f"Match: {'YES' if category in got else 'NO'}")

        if parsed.get("conversation"):
            print(f"First turn: {parsed['conversation'][0]['content'][:80]}...")
    except Exception as e:
        print(f"Error: {e}")
81
+
synthetic_data/test_connection.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cohere
3
+ import time
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
def test_connection():
    """Smoke-test the Cohere API key with a single reasoning-model chat call."""
    key = os.getenv("COHERE_API_KEY")
    if not key:
        print("Error: COHERE_API_KEY not found.")
        return

    # Show only the edges of the key so logs never leak the secret.
    print(f"Testing connection with API Key: {key[:4]}...{key[-4:]}")
    co = cohere.ClientV2(api_key=key)

    print("Sending request to command-a-reasoning-08-2025...")
    started = time.time()
    try:
        resp = co.chat(
            messages=[{"role": "user", "content": "Say 'Hello, World!'"}],
            model="command-a-reasoning-08-2025",
            thinking={"type": "enabled"},
            temperature=0.7,
        )
        print(f"Response received in {time.time() - started:.2f}s")
        print("Response object:", resp)

        # Surface any plain-text blocks from the structured response payload.
        if hasattr(resp, 'message') and resp.message.content:
            for part in resp.message.content:
                if part.type == 'text':
                    print(f"Text content: {part.text}")
    except Exception as e:
        print(f"Error: {e}")
+ print(f"Error: {e}")
37
+
38
# Allow running the connectivity check directly as a script.
if __name__ == "__main__":
    test_connection()
40
+
synthetic_data/test_diverse.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Quick test of diverse generation with high temperature."""
import json
import random
import os

from dotenv import load_dotenv

load_dotenv()

import cohere

client = cohere.ClientV2(api_key=os.getenv("COHERE_API_KEY"))

INDUSTRIES = ["fintech startup", "healthcare SaaS", "e-commerce fashion"]
STARTERS = ["So about", "Following up on", "I've been thinking about"]

# (category, industry, role) triples exercised with HIGH temperature.
test_cases = [
    ("company.tools_config", "fintech startup", "growth hacker"),
    ("user.communication_style", "healthcare SaaS", "CMO"),
    ("none", "e-commerce fashion", "marketing manager"),
]

for category, industry, role in test_cases:
    # Draw the opener first, then the turn budget, so each case varies in shape.
    starter = random.choice(STARTERS)
    turns = random.randint(3, 6)

    if category == "none":
        prompt = f"""Create a UNMEMORABLE conversation between a {role} at a {industry} and AI.
Purely transactional - status check, scheduling, confirmation. NO specific details.
{turns} turns. Start with "{starter}..."
Return JSON: {{"conversation": [...], "labels": {{"categories": ["none"]}}}}"""
    else:
        prompt = f"""Create a marketing conversation for a {role} at a {industry}.
Must demonstrate: {category}
{turns} turns. Start with "{starter}..."
Be SPECIFIC with realistic details unique to {industry}.
Return JSON: {{"conversation": [...], "labels": {{"categories": ["{category}"]}}}}"""

    reply = client.chat(
        messages=[{"role": "user", "content": prompt}],
        temperature=0.95,
        model="command-r-plus-08-2024",
        response_format={"type": "json_object"},
    )

    parsed = json.loads(reply.message.content[0].text)

    print("\n" + "=" * 60)
    print(f"Category: {category} | Industry: {industry}")
    print(f"Output categories: {parsed.get('labels', {}).get('categories', [])}")
    turns_out = parsed.get("conversation", [])
    if turns_out:
        opener = turns_out[0]
        # The model sometimes returns plain strings instead of role/content dicts.
        if isinstance(opener, dict):
            print(f"First turn: {opener.get('content', '')[:120]}...")
        else:
            print(f"First turn: {str(opener)[:120]}...")
58
+
synthetic_data/test_pipeline.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import json
3
+ import os
4
+ from synthetic_data.pipeline import SyntheticDataPipeline
5
+ from unittest.mock import MagicMock, patch
6
+
7
+ class TestSyntheticDataPipeline(unittest.TestCase):
8
+
9
+ @patch('synthetic_data.pipeline.cohere.ClientV2')
10
+ def test_pipeline_structure(self, mock_client):
11
+ # Setup mock response
12
+ mock_instance = mock_client.return_value
13
+
14
+ # Mock scenario generation response
15
+ scenario_block = MagicMock()
16
+ scenario_block.type = "text"
17
+ scenario_block.text = json.dumps({
18
+ "scenario_description": "Test scenario",
19
+ "user_profile": "Test user",
20
+ "key_signals_to_include": ["signal1"],
21
+ "distractor_signals": [],
22
+ "suggested_turn_breakdown": "Test breakdown"
23
+ })
24
+ mock_scenario_response = MagicMock()
25
+ mock_scenario_response.message.content = [scenario_block]
26
+
27
+ # Mock conversation generation response
28
+ conv_block = MagicMock()
29
+ conv_block.type = "text"
30
+ conv_block.text = json.dumps({
31
+ "scenario_id": "test_id_001",
32
+ "conversation": [
33
+ {"role": "user", "content": "test"},
34
+ {"role": "assistant", "content": "response"}
35
+ ],
36
+ "labels": {
37
+ "categories": ["company.brand_core"],
38
+ "persistence_horizon": "long",
39
+ "memory_scope": "company",
40
+ "rationale": "test rationale"
41
+ },
42
+ "metadata": {
43
+ "scenario_type": "test",
44
+ "primary_category": "company.brand_core",
45
+ "distractor_present": False,
46
+ "turn_count": 2,
47
+ "signals_present": ["signal1"]
48
+ }
49
+ })
50
+ mock_conv_response = MagicMock()
51
+ mock_conv_response.message.content = [conv_block]
52
+
53
+ mock_instance.chat.side_effect = [mock_scenario_response, mock_conv_response]
54
+
55
+ # Run pipeline with dummy key
56
+ pipeline = SyntheticDataPipeline(api_key="dummy_key")
57
+ results = pipeline.run_batch(count=1)
58
+
59
+ self.assertEqual(len(results), 1)
60
+ self.assertIn('scenario_id', results[0])
61
+ self.assertIn('conversation', results[0])
62
+ self.assertIn('labels', results[0])
63
+ self.assertEqual(results[0]['labels']['categories'], ["company.brand_core"])
64
+
65
+ def test_conversation_structure(self):
66
+ # Validate structure of a sample output
67
+ sample_data = {
68
+ "scenario_id": "brand_core_test_001",
69
+ "conversation": [
70
+ {"role": "user", "content": "test"},
71
+ {"role": "assistant", "content": "response"}
72
+ ],
73
+ "labels": {
74
+ "categories": ["company.brand_core"],
75
+ "persistence_horizon": "long",
76
+ "memory_scope": "company",
77
+ "rationale": "test"
78
+ },
79
+ "metadata": {
80
+ "scenario_type": "test",
81
+ "primary_category": "company.brand_core",
82
+ "distractor_present": False,
83
+ "turn_count": 2,
84
+ "signals_present": []
85
+ }
86
+ }
87
+
88
+ # Check required fields
89
+ required_fields = ['scenario_id', 'conversation', 'labels', 'metadata']
90
+ for field in required_fields:
91
+ self.assertIn(field, sample_data)
92
+
93
+ # Check label structure
94
+ label_fields = ['categories', 'persistence_horizon', 'memory_scope', 'rationale']
95
+ for field in label_fields:
96
+ self.assertIn(field, sample_data['labels'])
97
+
98
+ if __name__ == '__main__':
99
+ unittest.main()
100
+
synthetic_data/training_dataset_1000.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
synthetic_data/validate.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from collections import Counter
4
+ from typing import Dict, List, Any
5
+
6
def validate_synthetic_data(filepath: str) -> Dict[str, Any]:
    """Validate synthetic data quality based on the PRD guidelines.

    Reads *filepath* as either a single JSON array or JSONL (one object per
    line) and computes distribution metrics plus quality warnings.

    Returns a dict with keys: total_examples, category_distribution,
    multi_label_frequency, avg_turns_per_conversation,
    persistence_distribution, scope_distribution, warnings. On failure a
    dict containing only an 'error' key is returned instead of raising.
    """
    try:
        # Explicit UTF-8: JSON is UTF-8 by spec, and the platform default
        # encoding (e.g. cp1252 on Windows) can corrupt or reject the read.
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        # Accept both a single JSON array and line-delimited JSONL.
        if content.startswith('[') and content.endswith(']'):
            data = json.loads(content)
        else:
            data = [json.loads(line) for line in content.split('\n') if line.strip()]
    except json.JSONDecodeError as e:
        return {'error': f"Invalid JSON format: {e}"}
    except Exception as e:
        return {'error': f"Error reading file: {e}"}

    if not data:
        return {'error': "Empty dataset"}

    # Category distribution across all (possibly multi-label) examples.
    all_categories = []
    for item in data:
        if 'labels' in item and 'categories' in item['labels']:
            all_categories.extend(item['labels']['categories'])
    category_dist = Counter(all_categories)

    # Fraction of examples that carry more than one category label.
    multi_label_count = sum(1 for item in data
                            if 'labels' in item and 'categories' in item['labels']
                            and len(item['labels']['categories']) > 1)
    multi_label_freq = multi_label_count / len(data) if len(data) > 0 else 0

    # Average conversation length; a missing turn_count counts as 0.
    turn_counts = [item['metadata'].get('turn_count', 0) for item in data if 'metadata' in item]
    avg_turns = sum(turn_counts) / len(turn_counts) if turn_counts else 0

    # Persistence-horizon and memory-scope distributions over labeled items.
    persistence_dist = Counter(item['labels'].get('persistence_horizon', 'unknown')
                               for item in data if 'labels' in item)
    scope_dist = Counter(item['labels'].get('memory_scope', 'unknown')
                         for item in data if 'labels' in item)

    return {
        'total_examples': len(data),
        'category_distribution': dict(category_dist),
        'multi_label_frequency': multi_label_freq,
        'avg_turns_per_conversation': avg_turns,
        'persistence_distribution': dict(persistence_dist),
        'scope_distribution': dict(scope_dist),
        'warnings': _generate_warnings(category_dist, multi_label_freq, avg_turns, len(data))
    }


def _generate_warnings(cat_dist, ml_freq, avg_turns, total_count):
    """Return human-readable quality warnings for the computed metrics.

    Thresholds come from the PRD targets: >=5% share per category,
    20-25% multi-label frequency, and 6.5±1.5 turns per conversation.
    """
    warnings = []

    # Flag underrepresented categories only on datasets large enough
    # for the share to be meaningful.
    if total_count > 20:
        total_cats = sum(cat_dist.values())
        for cat, count in cat_dist.items():
            if count / total_cats < 0.05:
                warnings.append(f"Category '{cat}' underrepresented: {count/total_cats:.1%}")

    # Multi-label frequency below the PRD target band.
    if ml_freq < 0.15:
        warnings.append(f"Low multi-label frequency: {ml_freq:.1%} (target: 20-25%)")

    # Conversation length outside the acceptable window.
    if avg_turns < 4 or avg_turns > 10:
        warnings.append(f"Average turns out of range: {avg_turns:.1f} (target: 6.5±1.5)")

    return warnings
77
+
78
# CLI entry point: validate a dataset file and dump the metrics as JSON.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Validate synthetic data quality")
    cli.add_argument("filepath", help="Path to JSON/JSONL file")
    opts = cli.parse_args()
    print(json.dumps(validate_synthetic_data(opts.filepath), indent=2))
85
+
synthetic_data/verify_key.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke test: verify the Cohere API key in the environment actually works."""

import os

import cohere
from dotenv import load_dotenv

load_dotenv()

# NOTE(review): the other scripts in this folder read COHERE_API_KEY;
# confirm that CO_API_KEY is the intended variable name here.
key = os.getenv("CO_API_KEY")

# Bug fix: the original sliced `key[:5]` unconditionally, which raises
# TypeError when the variable is unset (key is None) — the `if key` guard
# only protected the len() half of the message.
if key:
    print(f"Key: {key[:5]}... (len={len(key)})")
else:
    print("Key: None (len=0)")

try:
    client = cohere.ClientV2(api_key=key)
    resp = client.chat(model="command-r-plus", messages=[{"role": "user", "content": "hi"}])
    print("API Connection Success!")
except Exception as e:
    print(f"API Error: {e}")
15
+
training/benchmark.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark: Memory Routing Model Evaluation
3
+
4
+ This script evaluates our trained model against:
5
+ 1. Base model (untrained Llama-3.1-8B)
6
+ 2. Our SFT model
7
+ 3. Our RL model
8
+
9
+ We measure:
10
+ - Classification metrics (F1, precision, recall)
11
+ - Task-specific metrics (temporal alignment, scope parity)
12
+ - Efficiency (tokens generated, latency)
13
+ """
14
+
15
+ import asyncio
16
+ import json
17
+ import time
18
+ import os
19
+ import numpy as np
20
+ from typing import List, Dict, Any, Tuple
21
+ from collections import Counter
22
+ from dataclasses import dataclass
23
+
24
+
25
+ @dataclass
26
+ class BenchmarkConfig:
27
+ base_model: str = "meta-llama/Llama-3.1-8B"
28
+ renderer_name: str = "llama3"
29
+ test_data_path: str = "training/processed_data/test_data.json"
30
+ output_dir: str = "training/benchmarks"
31
+
32
+ # Model checkpoints to evaluate
33
+ sft_checkpoint: str = ""
34
+ rl_checkpoint: str = ""
35
+
36
+
37
+ VALID_CATEGORIES = {
38
+ "company.brand_core", "company.strategic_signatures", "company.knowledge_artifacts",
39
+ "company.business_priorities", "company.tools_config", "company.performance_context",
40
+ "user.communication_style", "user.strategic_approach", "user.role_context",
41
+ "user.workflow_patterns", "user.session_history", "user.interaction_preferences",
42
+ "none"
43
+ }
44
+
45
+ CATEGORY_PERSISTENCE = {
46
+ "company.brand_core": "long", "company.strategic_signatures": "long",
47
+ "company.knowledge_artifacts": "long", "company.business_priorities": "short",
48
+ "company.tools_config": "medium", "company.performance_context": "rolling",
49
+ "user.communication_style": "long", "user.strategic_approach": "long",
50
+ "user.role_context": "medium", "user.workflow_patterns": "medium",
51
+ "user.session_history": "short", "user.interaction_preferences": "evolving",
52
+ "none": "short"
53
+ }
54
+
55
+ SYSTEM_PROMPT = """You route marketing conversations into structured memory categories.
56
+
57
+ Available categories:
58
+ - company.brand_core: Voice, values, positioning
59
+ - company.strategic_signatures: Decision frameworks
60
+ - company.knowledge_artifacts: Docs, style guides
61
+ - company.business_priorities: Quarterly goals, campaigns
62
+ - company.tools_config: Integrations, settings
63
+ - company.performance_context: Campaign metrics
64
+ - user.communication_style: Tone, format expectations
65
+ - user.strategic_approach: Personal priorities
66
+ - user.role_context: Title, scope
67
+ - user.workflow_patterns: Review cadence
68
+ - user.session_history: Recent context
69
+ - user.interaction_preferences: Coaching style
70
+ - none: Irrelevant or transactional
71
+
72
+ Respond with comma-separated categories only. No explanations."""
73
+
74
+
75
+ def parse_prediction(text: str) -> set:
76
+ """Parse model output into category set."""
77
+ if not text:
78
+ return set()
79
+
80
+ categories = set()
81
+ for part in text.split(","):
82
+ cat = part.strip().lower()
83
+ if cat in VALID_CATEGORIES:
84
+ categories.add(cat)
85
+
86
+ # Remove "none" if mixed with others
87
+ if "none" in categories and len(categories) > 1:
88
+ categories.discard("none")
89
+
90
+ return categories
91
+
92
+
93
+ def compute_metrics(predicted: set, gold: set) -> Dict[str, float]:
94
+ """Compute all evaluation metrics for a single example."""
95
+ metrics = {}
96
+
97
+ # Basic classification
98
+ tp = len(predicted & gold)
99
+ metrics["precision"] = tp / len(predicted) if predicted else 0
100
+ metrics["recall"] = tp / len(gold) if gold else 0
101
+ metrics["f1"] = 2 * metrics["precision"] * metrics["recall"] / (metrics["precision"] + metrics["recall"]) if (metrics["precision"] + metrics["recall"]) > 0 else 0
102
+ metrics["exact_match"] = float(predicted == gold)
103
+ metrics["any_match"] = float(tp > 0)
104
+
105
+ # Temporal alignment
106
+ def majority_persistence(cats):
107
+ if not cats:
108
+ return "medium"
109
+ persis = [CATEGORY_PERSISTENCE.get(c, "medium") for c in cats]
110
+ return Counter(persis).most_common(1)[0][0]
111
+
112
+ pred_pers = majority_persistence(predicted)
113
+ gold_pers = majority_persistence(gold)
114
+ metrics["temporal_match"] = float(pred_pers == gold_pers)
115
+
116
+ # Scope parity
117
+ def get_scope(cats):
118
+ scopes = set()
119
+ for c in cats:
120
+ if c.startswith("company."):
121
+ scopes.add("company")
122
+ elif c.startswith("user."):
123
+ scopes.add("user")
124
+ if len(scopes) == 2:
125
+ return "mixed"
126
+ return scopes.pop() if scopes else "none"
127
+
128
+ metrics["scope_match"] = float(get_scope(predicted) == get_scope(gold))
129
+
130
+ # Efficiency
131
+ n = len(predicted)
132
+ metrics["n_categories"] = n
133
+ metrics["efficiency"] = 1.0 if n <= 3 else (0.7 if n == 4 else 0.4)
134
+
135
+ return metrics
136
+
137
+
138
+ async def evaluate_model(
139
+ service_client,
140
+ tokenizer,
141
+ renderer,
142
+ checkpoint: str,
143
+ test_data: List[Dict],
144
+ model_name: str
145
+ ) -> Tuple[Dict, List[Dict]]:
146
+ """Evaluate a single model checkpoint."""
147
+ from tinker import types
148
+
149
+ print(f"\nEvaluating: {model_name}")
150
+ print(f"Checkpoint: {checkpoint}")
151
+
152
+ sampling_client = service_client.create_sampling_client(model_path=checkpoint)
153
+ stop_sequences = renderer.get_stop_sequences()
154
+
155
+ results = []
156
+ latencies = []
157
+
158
+ for i, example in enumerate(test_data):
159
+ gold = set([c.lower() for c in example.get("categories", [])])
160
+ messages = example.get("messages", [])
161
+ prompt_messages = [m for m in messages if m.get("role") != "assistant"]
162
+
163
+ if not prompt_messages:
164
+ continue
165
+
166
+ prompt = renderer.build_generation_prompt(prompt_messages)
167
+ params = types.SamplingParams(max_tokens=50, temperature=0.1, stop=stop_sequences)
168
+
169
+ start_time = time.time()
170
+ result = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1).result()
171
+ latency = time.time() - start_time
172
+ latencies.append(latency)
173
+
174
+ response, success = renderer.parse_response(result.sequences[0].tokens)
175
+ predicted_text = response["content"] if success else ""
176
+ predicted = parse_prediction(predicted_text)
177
+
178
+ metrics = compute_metrics(predicted, gold)
179
+ metrics["gold"] = list(gold)
180
+ metrics["predicted"] = list(predicted)
181
+ metrics["predicted_text"] = predicted_text
182
+ metrics["latency"] = latency
183
+ metrics["format_valid"] = bool(predicted) or predicted_text.strip().lower() == "none"
184
+
185
+ results.append(metrics)
186
+
187
+ if (i + 1) % 50 == 0:
188
+ print(f" Progress: {i + 1}/{len(test_data)}")
189
+
190
+ # Aggregate
191
+ aggregate = {
192
+ "model_name": model_name,
193
+ "checkpoint": checkpoint,
194
+ "n_examples": len(results),
195
+ "f1": np.mean([r["f1"] for r in results]),
196
+ "precision": np.mean([r["precision"] for r in results]),
197
+ "recall": np.mean([r["recall"] for r in results]),
198
+ "exact_match": np.mean([r["exact_match"] for r in results]),
199
+ "any_match": np.mean([r["any_match"] for r in results]),
200
+ "temporal_match": np.mean([r["temporal_match"] for r in results]),
201
+ "scope_match": np.mean([r["scope_match"] for r in results]),
202
+ "efficiency": np.mean([r["efficiency"] for r in results]),
203
+ "format_valid": np.mean([r["format_valid"] for r in results]),
204
+ "mean_latency": np.mean(latencies),
205
+ "p95_latency": np.percentile(latencies, 95),
206
+ }
207
+
208
+ return aggregate, results
209
+
210
+
211
+ async def run_benchmark(config: BenchmarkConfig):
212
+ """Run full benchmark suite."""
213
+ import tinker
214
+ from tinker_cookbook import renderers
215
+ from tinker_cookbook.tokenizer_utils import get_tokenizer
216
+ from dotenv import load_dotenv
217
+ from datetime import datetime
218
+
219
+ load_dotenv()
220
+
221
+ print("=" * 70)
222
+ print("MEMORY ROUTING BENCHMARK")
223
+ print("=" * 70)
224
+
225
+ # Setup
226
+ os.makedirs(config.output_dir, exist_ok=True)
227
+ service_client = tinker.ServiceClient()
228
+ tokenizer = get_tokenizer(config.base_model)
229
+ renderer = renderers.get_renderer(name=config.renderer_name, tokenizer=tokenizer)
230
+
231
+ # Load test data
232
+ with open(config.test_data_path, "r") as f:
233
+ test_data = json.load(f)
234
+
235
+ print(f"Test examples: {len(test_data)}")
236
+
237
+ # Models to evaluate
238
+ models = []
239
+
240
+ if config.sft_checkpoint:
241
+ models.append(("SFT Model (Llama-3.1-8B + LoRA)", config.sft_checkpoint))
242
+
243
+ if config.rl_checkpoint:
244
+ models.append(("RL Model (Llama-3.1-8B + LoRA)", config.rl_checkpoint))
245
+
246
+ # Run evaluations
247
+ all_results = {}
248
+
249
+ for model_name, checkpoint in models:
250
+ aggregate, details = await evaluate_model(
251
+ service_client, tokenizer, renderer, checkpoint, test_data, model_name
252
+ )
253
+ all_results[model_name] = {
254
+ "aggregate": aggregate,
255
+ "details": details
256
+ }
257
+
258
+ # Print comparison table
259
+ print("\n" + "=" * 70)
260
+ print("BENCHMARK RESULTS")
261
+ print("=" * 70)
262
+
263
+ print(f"\n{'Metric':<20} ", end="")
264
+ for model_name in all_results:
265
+ short_name = model_name.split(" (")[0]
266
+ print(f"{short_name:<15} ", end="")
267
+ print()
268
+ print("-" * 70)
269
+
270
+ metrics_to_show = [
271
+ ("F1 Score", "f1"),
272
+ ("Precision", "precision"),
273
+ ("Recall", "recall"),
274
+ ("Exact Match", "exact_match"),
275
+ ("Any Match", "any_match"),
276
+ ("Temporal Match", "temporal_match"),
277
+ ("Scope Match", "scope_match"),
278
+ ("Format Valid", "format_valid"),
279
+ ("Mean Latency", "mean_latency"),
280
+ ]
281
+
282
+ for display_name, key in metrics_to_show:
283
+ print(f"{display_name:<20} ", end="")
284
+ for model_name in all_results:
285
+ value = all_results[model_name]["aggregate"][key]
286
+ if key == "mean_latency":
287
+ print(f"{value:.3f}s ", end="")
288
+ else:
289
+ print(f"{value:.1%} ", end="")
290
+ print()
291
+
292
+ # Save results
293
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
294
+ output_path = os.path.join(config.output_dir, f"benchmark_{timestamp}.json")
295
+
296
+ with open(output_path, "w") as f:
297
+ json.dump({
298
+ "config": {
299
+ "base_model": config.base_model,
300
+ "test_examples": len(test_data),
301
+ },
302
+ "results": {k: v["aggregate"] for k, v in all_results.items()},
303
+ "details": {k: v["details"] for k, v in all_results.items()}
304
+ }, f, indent=2, default=str)
305
+
306
+ print(f"\nResults saved to: {output_path}")
307
+
308
+ return all_results
309
+
310
+
311
+ async def main():
312
+ import sys
313
+
314
+ config = BenchmarkConfig()
315
+
316
+ # Parse command line args
317
+ for arg in sys.argv[1:]:
318
+ if "=" in arg:
319
+ key, value = arg.split("=", 1)
320
+ if hasattr(config, key):
321
+ setattr(config, key, value)
322
+
323
+ await run_benchmark(config)
324
+
325
+
326
+ if __name__ == "__main__":
327
+ asyncio.run(main())
328
+