File size: 6,066 Bytes
685d968 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 |
"""
Upload Memory Routing Agent to HuggingFace Hub
This script uploads:
1. Model card (README.md)
2. Training dataset
3. Benchmark dataset
4. Training configuration
"""
import os
import json
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo, upload_file, upload_folder, login
# Load variables from a local .env file (if any) before reading the token.
load_dotenv()

# Configuration
HF_TOKEN = os.getenv("HF_TOKEN")  # Hub access token; None when unset
REPO_ID = "muratcankoylan/memory-routing-agent"  # target model repo
DATASET_REPO_ID = "muratcankoylan/memory-routing-marketing"  # target dataset repo

# Authenticate up front so a missing token fails fast, before any upload starts.
if not HF_TOKEN:
    # `exit()` is a helper injected by the `site` module for interactive use;
    # raising SystemExit works everywhere. A string argument is written to
    # stderr and the process exits with status 1.
    raise SystemExit("ERROR: HF_TOKEN not found in .env file")

# Never echo any part of the token: even a short prefix of a secret ends up in
# terminal scrollback and CI logs.
print("Logging in to the Hugging Face Hub with the token from HF_TOKEN...")
login(token=HF_TOKEN)
def upload_model():
    """Upload the model card, benchmark file, and training config to the model repo.

    Ensures the ``REPO_ID`` model repository exists, then uploads, in order:

    * ``huggingface/README.md`` as ``README.md`` (the model card),
    * ``training/benchmarks/marketing_routing_benchmark.json`` as
      ``benchmark/marketing_routing_benchmark.json``,
    * the training-configuration dict below, which is first written to
      ``huggingface/config.json`` and then uploaded as ``config.json``.

    All local paths are relative to the current working directory. A failure
    while creating the repo is only reported (best effort); errors raised by
    the uploads or by writing the config file propagate to the caller.
    """
    # One authenticated client for every call, so the token is supplied once.
    api = HfApi(token=HF_TOKEN)

    # Create model repo (exist_ok=True makes this a no-op when it already exists).
    try:
        api.create_repo(repo_id=REPO_ID, exist_ok=True)
        print(f"Created/verified repo: {REPO_ID}")
    except Exception as e:
        # Deliberately non-fatal: report and let the uploads below decide
        # whether the repo is actually usable.
        print(f"Repo creation note: {e}")

    # Upload README (model card)
    api.upload_file(
        path_or_fileobj="huggingface/README.md",
        path_in_repo="README.md",
        repo_id=REPO_ID,
    )
    print("Uploaded model card")

    # Upload benchmark
    api.upload_file(
        path_or_fileobj="training/benchmarks/marketing_routing_benchmark.json",
        path_in_repo="benchmark/marketing_routing_benchmark.json",
        repo_id=REPO_ID,
    )
    print("Uploaded benchmark")

    # Upload training config
    config = {
        "base_model": "meta-llama/Llama-3.1-8B",
        "lora_rank": 32,
        "sft_steps": 100,
        "sft_batch_size": 128,
        "sft_learning_rate": 2.86e-4,
        "rl_iterations": 12,
        "rl_groups_per_batch": 64,
        "rl_group_size": 32,
        "rl_learning_rate": 2e-5,
        "tinker_checkpoint": "tinker://4f4bae1f-5a95-5f53-a55a-a14f2872825c:train:0/sampler_weights/rl_iter_012",
        "reward_weights": {
            "f1": 0.6,
            "temporal": 0.2,
            "parity": 0.1,
            "efficiency": 0.1,
        },
    }

    # Explicit encoding keeps the written file independent of the platform's
    # default locale encoding.
    with open("huggingface/config.json", "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    api.upload_file(
        path_or_fileobj="huggingface/config.json",
        path_in_repo="config.json",
        repo_id=REPO_ID,
    )
    print("Uploaded config")

    print(f"\nModel uploaded to: https://huggingface.co/{REPO_ID}")
def upload_dataset():
    """Upload the dataset card and training data to the dataset repo.

    Ensures the ``DATASET_REPO_ID`` dataset repository exists, then:

    * writes the dataset card below to ``huggingface/dataset_readme.md`` and
      uploads it as ``README.md``,
    * uploads ``synthetic_data/merged_training_dataset_2001.jsonl`` as
      ``data/train.jsonl``.

    All local paths are relative to the current working directory. A failure
    while creating the repo is only reported (best effort); errors raised by
    the uploads or by writing the card file propagate to the caller.
    """
    # One authenticated client for every call, so the token is supplied once.
    api = HfApi(token=HF_TOKEN)

    # Create dataset repo (exist_ok=True makes this a no-op when it already exists).
    try:
        api.create_repo(repo_id=DATASET_REPO_ID, repo_type="dataset", exist_ok=True)
        print(f"Created/verified dataset repo: {DATASET_REPO_ID}")
    except Exception as e:
        # Deliberately non-fatal: report and let the uploads below decide
        # whether the repo is actually usable.
        print(f"Dataset repo creation note: {e}")

    # Create dataset README. The leading '---' section is YAML front matter
    # (license, language, tags, size), so the string must start at column 0.
    dataset_readme = """---
license: apache-2.0
language:
- en
tags:
- memory-routing
- marketing
- classification
- synthetic
size_categories:
- 1K<n<10K
---
# Memory Routing Marketing Dataset
2,001 synthetic marketing conversations for training memory routing classifiers.
## Dataset Description
This dataset contains marketing conversations labeled with memory categories. Each conversation includes:
- Multi-turn dialogue between a user and AI assistant
- Category labels (13 possible categories)
- Persistence horizon (long/medium/short)
- Memory scope (company/user/none)
## Categories
### Company Categories
- `company.brand_core` - Voice, values, positioning
- `company.strategic_signatures` - Decision frameworks
- `company.knowledge_artifacts` - Docs, style guides
- `company.business_priorities` - Quarterly goals
- `company.tools_config` - Integrations, APIs
- `company.performance_context` - Campaign metrics
### User Categories
- `user.communication_style` - Tone, format preferences
- `user.strategic_approach` - Personal priorities
- `user.role_context` - Title, scope
- `user.workflow_patterns` - Review cadence
- `user.session_history` - Immediate context
- `user.interaction_preferences` - Coaching style
### Special
- `none` - Transactional or irrelevant content
## Generation
Generated using Cohere Command-R-Plus (104B) as teacher model with diverse prompts covering:
- Multiple industries (tech, retail, healthcare, finance, etc.)
- Various user roles (CMO, VP Marketing, Growth Lead, etc.)
- Different conversation styles and complexities
## Usage
```python
from datasets import load_dataset
dataset = load_dataset("muratcankoylan/memory-routing-marketing")
```
## Citation
```bibtex
@misc{memory-routing-agent-2024,
title={Memory Routing Agent: Prompt Distillation for Marketing AI},
author={Muratcan Koylan},
year={2024},
howpublished={\\url{https://github.com/muratcankoylan/memory-routing-agent}},
}
```
"""

    # Explicit encoding keeps the written file independent of the platform's
    # default locale encoding.
    with open("huggingface/dataset_readme.md", "w", encoding="utf-8") as f:
        f.write(dataset_readme)

    api.upload_file(
        path_or_fileobj="huggingface/dataset_readme.md",
        path_in_repo="README.md",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
    )
    print("Uploaded dataset README")

    # Upload training data
    api.upload_file(
        path_or_fileobj="synthetic_data/merged_training_dataset_2001.jsonl",
        path_in_repo="data/train.jsonl",
        repo_id=DATASET_REPO_ID,
        repo_type="dataset",
    )
    print("Uploaded training data")

    print(f"\nDataset uploaded to: https://huggingface.co/datasets/{DATASET_REPO_ID}")
if __name__ == "__main__":
    # Script entry point: publish the model repo first, then the dataset repo,
    # framing the run with a 60-character banner rule.
    rule = "=" * 60

    print(rule)
    print("Uploading Memory Routing Agent to HuggingFace")
    print(rule)

    # Each step is announced on stdout and then executed, in listed order.
    steps = (
        ("\n1. Uploading model...", upload_model),
        ("\n2. Uploading dataset...", upload_dataset),
    )
    for heading, run_step in steps:
        print(heading)
        run_step()

    # Closing summary with the public URLs of both repos.
    print("\n" + rule)
    print("UPLOAD COMPLETE")
    print(rule)
    print(f"Model: https://huggingface.co/{REPO_ID}")
    print(f"Dataset: https://huggingface.co/datasets/{DATASET_REPO_ID}")
|