Agnuxo commited on
Commit
bbeb541
·
verified ·
1 Parent(s): 1331119

Upload seed/training/engine.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. seed/training/engine.py +637 -0
seed/training/engine.py ADDED
@@ -0,0 +1,637 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training Engine — Autonomous LoRA Fine-Tuning
3
+ ===============================================
4
+ Trains the seed model using LoRA adapters on free GPU resources.
5
+
6
+ Strategy:
7
+ - Start with tiny model (Qwen2.5-0.5B or SmolLM-135M)
8
+ - Train LoRA adapters on harvested data
9
+ - Merge adapter into base → new, smarter model
10
+ - Push merged model to HuggingFace Hub
11
+ - Repeat with more data → model keeps growing
12
+
13
+ Free GPU Sources:
14
+ - Kaggle: 30h/week T4 GPU (primary)
15
+ - HuggingFace: AutoTrain (limited free)
16
+ - Google Colab: Burst training sessions
17
+
18
+ The key insight: we don't need to train a full model.
19
+ LoRA adds ~1-4% new parameters per cycle. Over hundreds
20
+ of cycles, the model accumulates massive specialized knowledge
21
+ while staying lightweight enough for free inference.
22
+ """
23
+ import json
24
+ import logging
25
+ import os
26
+ import time
27
+ from datetime import datetime, timezone
28
+ from pathlib import Path
29
+ from typing import Optional
30
+
31
logger = logging.getLogger("seed.trainer")


# Model progression ladder, listed from the smallest base model to the largest.
# Each rung is expanded into a dict with these keys, in this order:
#   name        - HuggingFace id of the base model
#   params      - human-readable parameter count label
#   stage       - growth-stage label
#   min_data    - min training entries needed before this rung is used
#   lora_r      - LoRA rank
#   lora_alpha  - LoRA alpha
#   epochs      - training epochs
#   batch_size  - per-device batch size
#   lr          - learning rate
MODEL_LADDER = [
    dict(zip(
        ("name", "params", "stage", "min_data", "lora_r", "lora_alpha",
         "epochs", "batch_size", "lr"),
        rung,
    ))
    for rung in (
        ("HuggingFaceTB/SmolLM2-135M-Instruct", "135M", "GERMINATION",
         100, 8, 16, 3, 4, 2e-4),
        ("Qwen/Qwen2.5-0.5B-Instruct", "0.5B", "GERMINATION",
         500, 16, 32, 2, 4, 1e-4),
        ("Qwen/Qwen2.5-1.5B-Instruct", "1.5B", "SEEDLING",
         2000, 32, 64, 2, 2, 5e-5),
        ("Qwen/Qwen2.5-3B-Instruct", "3B", "SAPLING",
         5000, 32, 64, 1, 1, 2e-5),
        ("Qwen/Qwen2.5-7B-Instruct", "7B", "YOUNG_TREE",
         10000, 64, 128, 1, 1, 1e-5),
    )
]
92
+
93
+
94
class TrainingEngine:
    """Autonomous LoRA training engine.

    The engine does not train anything itself. It picks a rung of
    ``MODEL_LADDER`` from the amount of harvested data, renders a
    self-contained training script / Kaggle notebook for that rung, uploads
    data and scripts to a HuggingFace dataset repo, and keeps a JSON
    "growth log" of recorded training results in ``state_dir``.
    """

    def __init__(self, hf_token: Optional[str] = None, data_dir: str = "seed_data",
                 state_dir: str = "seed_state"):
        """Create the engine and load (or initialise) the growth log.

        Args:
            hf_token: HuggingFace token; falls back to the ``HF_TOKEN``
                environment variable, then to an empty string.
            data_dir: Directory holding the harvested ``*.jsonl`` files.
            state_dir: Directory for the growth log and generated scripts;
                created if missing.
        """
        self.hf_token = hf_token or os.environ.get("HF_TOKEN", "")
        self.data_dir = Path(data_dir)
        self.state_dir = Path(state_dir)
        self.state_dir.mkdir(parents=True, exist_ok=True)
        self.growth_log = self._load_growth_log()

    @staticmethod
    def _output_model_name(stage: dict) -> str:
        """Return the Hub repo id the fine-tuned model for *stage* is published under."""
        return f"Agnuxo/OpenCLAW-SEED-{stage['params']}"

    def _load_growth_log(self) -> dict:
        """Load training history, falling back to a fresh log.

        A missing, unreadable, or malformed log file yields the default log.
        Keys absent from a stored log are filled in from the defaults so the
        other methods can index the log without ``KeyError``.
        """
        defaults = {
            "current_stage": "GERMINATION",
            "current_model": MODEL_LADDER[0]["name"],
            "training_cycles": 0,
            "total_entries_trained": 0,
            "adapters_merged": 0,
            "models_published": [],
            "history": [],
        }
        log_file = self.state_dir / "growth_log.json"
        if not log_file.exists():
            return defaults

        try:
            stored = json.loads(log_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes.
            logger.warning("Ignoring unreadable growth log %s: %s", log_file, exc)
            return defaults

        if not isinstance(stored, dict):
            logger.warning("Ignoring growth log %s: expected a JSON object", log_file)
            return defaults
        return {**defaults, **stored}

    def _save_growth_log(self):
        """Persist the growth log as indented JSON in ``state_dir``."""
        log_file = self.state_dir / "growth_log.json"
        log_file.write_text(json.dumps(self.growth_log, indent=2), encoding="utf-8")

    def get_current_stage(self) -> dict:
        """Determine current growth stage based on data available.

        Returns:
            The last ``MODEL_LADDER`` entry whose ``min_data`` is covered by
            the number of non-blank lines in ``training_dataset.jsonl``, or
            the first entry when the file is missing or too small.
        """
        dataset_file = self.data_dir / "training_dataset.jsonl"
        if not dataset_file.exists():
            return MODEL_LADDER[0]

        # Close the file deterministically; blank lines are not entries.
        with open(dataset_file, encoding="utf-8", errors="replace") as fh:
            entry_count = sum(1 for line in fh if line.strip())

        # Find the most advanced model we have enough data for
        best = MODEL_LADDER[0]
        for model in MODEL_LADDER:
            if entry_count >= model["min_data"]:
                best = model

        return best

    def should_upgrade(self) -> Optional[dict]:
        """Check if we should upgrade to a larger model.

        ``current_model`` in the growth log holds either a ladder base-model
        name (fresh log) or the published output name (after
        ``record_training_result``), so both spellings of the data-selected
        stage count as "already on this stage".

        Returns:
            The ladder entry to move to, or ``None`` when no change is needed.
        """
        current = self.growth_log["current_model"]
        stage = self.get_current_stage()

        if current in (stage["name"], self._output_model_name(stage)):
            return None

        logger.info(f"🌱 Growth detected! {current} → {stage['name']} ({stage['stage']})")
        return stage

    def _select_base_model(self, stage: dict) -> str:
        """Return the model to fine-tune: our published model for this size if any, else the ladder base."""
        base_model = stage["name"]
        for published in self.growth_log.get("models_published", []):
            if stage["params"] in published:
                base_model = published  # Continue from our own model
        return base_model

    def _render_training_script(self, stage: Optional[dict] = None) -> str:
        """Render the source text of the standalone training script for *stage*.

        When *stage* is omitted the current stage is used. Values interpolated
        here are fixed at generation time; doubled braces survive as
        placeholders that the generated script evaluates when it runs.
        """
        if stage is None:
            stage = self.get_current_stage()
        stage_label = stage["stage"]
        params = stage["params"]
        base_model = self._select_base_model(stage)
        our_model_name = self._output_model_name(stage)
        generated_at = datetime.now(timezone.utc).isoformat()

        return f'''#!/usr/bin/env python3
"""
🌱 SEED Training Script — Auto-generated {generated_at}
===========================================================================
This script is FULLY AUTONOMOUS. Upload it to Kaggle/Colab with your data.
It will train, merge, and push the model to HuggingFace automatically.

Stage: {stage_label} ({params})
Base model: {base_model}
Output: {our_model_name}
"""
import os
import json
import subprocess
import sys

# ===== CONFIGURATION =====
BASE_MODEL = "{base_model}"
OUTPUT_MODEL = "{our_model_name}"
HF_TOKEN = os.environ.get("HF_TOKEN", "")
LORA_R = {stage["lora_r"]}
LORA_ALPHA = {stage["lora_alpha"]}
EPOCHS = {stage["epochs"]}
BATCH_SIZE = {stage["batch_size"]}
LEARNING_RATE = {stage["lr"]}
MAX_SEQ_LEN = 1024

# ===== INSTALL DEPENDENCIES =====
print("📦 Installing training dependencies...")
# Argument list (no shell): an unquoted ">=4.45" would be a shell redirect.
subprocess.run([
    sys.executable, "-m", "pip", "install", "-q",
    "transformers>=4.45", "datasets", "peft", "bitsandbytes", "trl",
    "accelerate", "huggingface_hub",
], check=False)

from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer,
    TrainingArguments, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer, SFTConfig
from huggingface_hub import HfApi, login
import torch

# ===== LOGIN =====
if HF_TOKEN:
    login(token=HF_TOKEN)
    print("✅ Logged into HuggingFace")
else:
    print("⚠️ No HF_TOKEN — model won't be pushed")

# ===== LOAD TRAINING DATA =====
print("📊 Loading training data...")
data_files = [f for f in os.listdir(".") if f.endswith(".jsonl")]
if not data_files:
    # Try seed_data directory
    data_dir = "seed_data"
    if os.path.exists(data_dir):
        data_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith(".jsonl")]

if not data_files:
    print("❌ No training data found! Run DataHarvester first.")
    sys.exit(1)

# Combine all JSONL files
all_entries = []
for f in data_files:
    with open(f, encoding="utf-8") as fp:
        for line in fp:
            try:
                entry = json.loads(line.strip())
                # Format as chat
                text = f"### Instruction:\\n{{entry.get('instruction', '')}}\\n\\n"
                if entry.get("input"):
                    text += f"### Input:\\n{{entry['input']}}\\n\\n"
                text += f"### Response:\\n{{entry.get('output', '')}}"
                all_entries.append({{"text": text}})
            except (ValueError, AttributeError, TypeError, KeyError):
                # Blank, malformed, or non-object line: skip it
                continue

print(f"📊 Loaded {{len(all_entries)}} training entries from {{len(data_files)}} files")

if len(all_entries) < 50:
    print("⚠️ Very small dataset — results may be limited")

dataset = Dataset.from_list(all_entries)

# ===== LOAD MODEL =====
print(f"🧠 Loading base model: {{BASE_MODEL}}")

# Quantization for larger models
use_4bit = "3B" in BASE_MODEL or "7B" in BASE_MODEL
if use_4bit:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, quantization_config=bnb_config,
        device_map="auto", trust_remote_code=True,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, torch_dtype=torch.float16,
        device_map="auto", trust_remote_code=True,
    )

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"✅ Model loaded: {{sum(p.numel() for p in model.parameters()):,}} parameters")

# ===== CONFIGURE LoRA =====
print(f"🔧 Configuring LoRA (r={{LORA_R}}, alpha={{LORA_ALPHA}})")
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"🌱 Trainable: {{trainable:,}} / {{total:,}} ({{100*trainable/total:.2f}}%)")

# ===== TRAIN =====
print("🚀 Starting training...")

training_args = SFTConfig(
    output_dir="./seed_checkpoint",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    max_seq_length=MAX_SEQ_LEN,
    dataset_text_field="text",
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    tokenizer=tokenizer,
)

train_result = trainer.train()
print(f"✅ Training complete! Loss: {{train_result.training_loss:.4f}}")

# ===== SAVE LoRA ADAPTER =====
adapter_path = "./seed_lora_adapter"
trainer.save_model(adapter_path)
print(f"💾 LoRA adapter saved to {{adapter_path}}")

# ===== MERGE ADAPTER INTO BASE =====
print("🔀 Merging adapter into base model...")

if use_4bit:
    # For quantized models, reload in fp16 for merging
    base_model_fp16 = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, torch_dtype=torch.float16,
        device_map="auto", trust_remote_code=True,
    )
    merged_model = PeftModel.from_pretrained(base_model_fp16, adapter_path)
    merged_model = merged_model.merge_and_unload()
else:
    # The trained PeftModel already holds the adapter: merge it directly
    merged_model = model.merge_and_unload()

print(f"✅ Merged! Final params: {{sum(p.numel() for p in merged_model.parameters()):,}}")

# ===== PUSH TO HUB =====
if HF_TOKEN:
    print(f"📤 Pushing to HuggingFace: {{OUTPUT_MODEL}}")
    merged_model.push_to_hub(OUTPUT_MODEL, token=HF_TOKEN, private=False)
    tokenizer.push_to_hub(OUTPUT_MODEL, token=HF_TOKEN, private=False)

    # Create model card
    card = f"""---
library_name: transformers
tags:
- seed
- openclaw
- self-evolving
- neuromorphic
license: mit
base_model: {{BASE_MODEL}}
---

# 🌱 OpenCLAW SEED — Self-Evolving Model

**Stage:** {stage_label} ({params})
**Base:** {{BASE_MODEL}}
**Training entries:** {{len(all_entries)}}
**LoRA rank:** {{LORA_R}}
**Final loss:** {{train_result.training_loss:.4f}}
**Date:** {{__import__('datetime').datetime.now().isoformat()}}

## What is SEED?

SEED (Self-Evolving Epistemic Dynamo) is an AI system that **grows autonomously**,
like a seed becoming a tree. It continuously:
1. Harvests knowledge from ArXiv, Semantic Scholar, and agent interactions
2. Trains itself via LoRA fine-tuning on free GPU resources
3. Merges learned knowledge into its core
4. Evaluates and selects the best version
5. Grows to larger models when enough knowledge is accumulated

## By Francisco Angulo de Lafuente
Advanced AI Systems Laboratory, Madrid, Spain
- GitHub: https://github.com/Agnuxo1
- Scholar: https://scholar.google.com/citations?user=6nOpJ9IAAAAJ
"""
    api = HfApi(token=HF_TOKEN)
    api.upload_file(
        path_or_fileobj=card.encode(),
        path_in_repo="README.md",
        repo_id=OUTPUT_MODEL,
    )
    print(f"🎉 Model published: https://huggingface.co/{{OUTPUT_MODEL}}")
else:
    # Save locally
    merged_model.save_pretrained("./seed_merged_model")
    tokenizer.save_pretrained("./seed_merged_model")
    print("💾 Model saved locally (no HF_TOKEN)")

# ===== SAVE TRAINING REPORT =====
report = {{
    "stage": "{stage_label}",
    "base_model": BASE_MODEL,
    "output_model": OUTPUT_MODEL,
    "training_entries": len(all_entries),
    "lora_r": LORA_R,
    "lora_alpha": LORA_ALPHA,
    "epochs": EPOCHS,
    "final_loss": train_result.training_loss,
    "trainable_params": trainable,
    "total_params": total,
    "timestamp": __import__("datetime").datetime.now().isoformat(),
}}
with open("training_report.json", "w") as f:
    json.dump(report, f, indent=2)

print("\\n" + "="*60)
print("🌳 SEED GROWTH CYCLE COMPLETE")
print(f" Model: {{OUTPUT_MODEL}}")
print(f" Stage: {stage_label}")
print(f" Loss: {{train_result.training_loss:.4f}}")
print(f" Data: {{len(all_entries)}} entries")
print("="*60)
'''

    def generate_training_script(self, output_path: Optional[str] = None) -> str:
        """
        Generate a self-contained Python training script.
        This script is designed to run on Kaggle/Colab/HF with free GPU.
        It does everything: loads data, trains LoRA, merges, pushes to Hub.

        Args:
            output_path: Where to write the script; defaults to
                ``<state_dir>/train_seed.py``.

        Returns:
            The path the script was written to.
        """
        script = self._render_training_script()

        if output_path is None:
            output_path = str(self.state_dir / "train_seed.py")

        # The script contains emoji, so never rely on the locale encoding.
        Path(output_path).write_text(script, encoding="utf-8")
        logger.info(f"Training script generated: {output_path}")
        return output_path

    def generate_kaggle_notebook(self, output_path: Optional[str] = None) -> str:
        """Generate a Kaggle notebook JSON for GPU training.

        The notebook has a markdown header, a cell that reads ``HF_TOKEN``
        from Kaggle Secrets, a cell that downloads ``*.jsonl`` files from the
        dataset repo, and a final cell holding the rendered training script.

        Args:
            output_path: Where to write the notebook; defaults to
                ``<state_dir>/seed_training.ipynb``.

        Returns:
            The path the notebook was written to.
        """
        stage = self.get_current_stage()
        # Render in memory instead of round-tripping through a temp file.
        script_content = self._render_training_script(stage)

        notebook = {
            "metadata": {
                "kernelspec": {
                    "display_name": "Python 3",
                    "language": "python",
                    "name": "python3"
                },
                "language_info": {"name": "python", "version": "3.10.0"},
                "kaggle": {
                    "accelerator": "gpu",
                    "dataSources": [],
                    "isGpuEnabled": True,
                    "isInternetEnabled": True,
                }
            },
            "nbformat": 4,
            "nbformat_minor": 4,
            "cells": [
                {
                    "cell_type": "markdown",
                    "metadata": {},
                    "source": [
                        f"# 🌱 SEED Training — {stage['stage']} ({stage['params']})\n",
                        "Auto-generated training notebook for OpenCLAW SEED.\n",
                        "**Run this on Kaggle with GPU enabled!**"
                    ]
                },
                {
                    "cell_type": "code",
                    "metadata": {"execution": {"iopub.status.busy": ""}},
                    "source": [
                        "import os\n",
                        "# Set your HuggingFace token from Kaggle Secrets\n",
                        "from kaggle_secrets import UserSecretsClient\n",
                        "try:\n",
                        "    secrets = UserSecretsClient()\n",
                        "    os.environ['HF_TOKEN'] = secrets.get_secret('HF_TOKEN')\n",
                        "except Exception:\n",
                        "    os.environ['HF_TOKEN'] = ''  # Set manually if needed\n",
                    ],
                    "outputs": [],
                    "execution_count": None,
                },
                {
                    "cell_type": "code",
                    "metadata": {},
                    "source": [
                        "# Download training data from HuggingFace\n",
                        "!pip install -q huggingface_hub\n",
                        "from huggingface_hub import hf_hub_download, HfApi\n",
                        "import os\n",
                        "\n",
                        "api = HfApi()\n",
                        "# Try to download training data from our dataset repo\n",
                        "try:\n",
                        "    files = api.list_repo_files('Agnuxo/OpenCLAW-SEED-data', repo_type='dataset')\n",
                        "    os.makedirs('seed_data', exist_ok=True)\n",
                        "    for f in files:\n",
                        "        if f.endswith('.jsonl'):\n",
                        "            hf_hub_download('Agnuxo/OpenCLAW-SEED-data', f,\n",
                        "                            repo_type='dataset', local_dir='seed_data')\n",
                        "            print(f'Downloaded {f}')\n",
                        "except Exception as e:\n",
                        "    print(f'No remote data: {e}')\n",
                        "    print('Using local data if available')\n",
                    ],
                    "outputs": [],
                    "execution_count": None,
                },
                {
                    "cell_type": "code",
                    "metadata": {},
                    # Notebook source lines are concatenated verbatim, so each
                    # one must keep its trailing newline.
                    "source": script_content.splitlines(keepends=True),
                    "outputs": [],
                    "execution_count": None,
                },
            ]
        }

        if output_path is None:
            output_path = str(self.state_dir / "seed_training.ipynb")

        Path(output_path).write_text(json.dumps(notebook, indent=2), encoding="utf-8")
        logger.info(f"Kaggle notebook generated: {output_path}")
        return output_path

    def trigger_hf_autotrain(self, dataset_repo: str = "Agnuxo/OpenCLAW-SEED-data") -> dict:
        """
        Build the HuggingFace AutoTrain configuration for the current stage.
        This is an alternative to manual Kaggle training.

        Note: this method only assembles and logs the configuration; it makes
        no API call itself.

        Args:
            dataset_repo: Dataset repo id to train from.

        Returns:
            The AutoTrain configuration dict.
        """
        stage = self.get_current_stage()
        needs_4bit = "3B" in stage["name"] or "7B" in stage["name"]

        # AutoTrain configuration
        config = {
            "task": "text_generation",
            "base_model": stage["name"],
            "dataset": dataset_repo,
            "text_column": "text",
            "learning_rate": stage["lr"],
            "num_epochs": stage["epochs"],
            "batch_size": stage["batch_size"],
            "lora_r": stage["lora_r"],
            "lora_alpha": stage["lora_alpha"],
            "use_peft": True,
            "quantization": "4bit" if needs_4bit else None,
            "push_to_hub": True,
            "hub_model_id": self._output_model_name(stage),
        }

        logger.info(f"AutoTrain config for {stage['stage']}: {json.dumps(config, indent=2)}")
        return config

    def upload_training_data(self, dataset_repo: str = "Agnuxo/OpenCLAW-SEED-data") -> bool:
        """Upload harvested data to HuggingFace as a dataset.

        Uploads every ``*.jsonl`` file in ``data_dir`` plus a freshly
        generated training script and Kaggle notebook.

        Args:
            dataset_repo: Target dataset repo id.

        Returns:
            ``True`` on success; ``False`` when no token is configured or any
            step fails (the failure is logged, not raised).
        """
        if not self.hf_token:
            logger.warning("No HF_TOKEN — can't upload data")
            return False

        try:
            from huggingface_hub import HfApi, create_repo
            api = HfApi(token=self.hf_token)

            # Create dataset repo if needed. Best effort: the uploads below
            # will surface a real failure, but leave a trace in the log.
            try:
                create_repo(dataset_repo, repo_type="dataset", token=self.hf_token, exist_ok=True)
            except Exception as exc:
                logger.warning("Could not create dataset repo %s: %s", dataset_repo, exc)

            # Upload all JSONL files
            uploaded = 0
            for f in self.data_dir.glob("*.jsonl"):
                api.upload_file(
                    path_or_fileobj=str(f),
                    path_in_repo=f.name,
                    repo_id=dataset_repo,
                    repo_type="dataset",
                    token=self.hf_token,
                )
                uploaded += 1
                logger.info(f"Uploaded {f.name}")

            # Upload training script
            script_path = self.generate_training_script()
            api.upload_file(
                path_or_fileobj=script_path,
                path_in_repo="train_seed.py",
                repo_id=dataset_repo,
                repo_type="dataset",
                token=self.hf_token,
            )

            # Upload Kaggle notebook
            nb_path = self.generate_kaggle_notebook()
            api.upload_file(
                path_or_fileobj=nb_path,
                path_in_repo="seed_training.ipynb",
                repo_id=dataset_repo,
                repo_type="dataset",
                token=self.hf_token,
            )

            logger.info(f"✅ Uploaded {uploaded} data files + training scripts to {dataset_repo}")
            return True

        except Exception as e:
            logger.error(f"Upload failed: {e}")
            return False

    def record_training_result(self, report: dict):
        """Record a training result in the growth log.

        Args:
            report: Training report dict; the keys read here are ``stage``,
                ``output_model``, ``training_entries`` and ``final_loss``,
                all optional.

        Updates the counters, the published-model list, the current
        stage/model, appends a history entry (only the last 100 are kept),
        and saves the log to disk.
        """
        self.growth_log["training_cycles"] += 1
        self.growth_log["total_entries_trained"] += report.get("training_entries", 0)
        self.growth_log["adapters_merged"] += 1

        model_name = report.get("output_model", "")
        if model_name and model_name not in self.growth_log["models_published"]:
            self.growth_log["models_published"].append(model_name)

        self.growth_log["current_stage"] = report.get("stage", self.growth_log["current_stage"])
        self.growth_log["current_model"] = model_name or self.growth_log["current_model"]

        self.growth_log["history"].append({
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "stage": report.get("stage"),
            "loss": report.get("final_loss"),
            "entries": report.get("training_entries"),
            "model": model_name,
        })

        # Keep last 100 history entries
        self.growth_log["history"] = self.growth_log["history"][-100:]
        self._save_growth_log()

        logger.info(f"🌳 Growth recorded: cycle #{self.growth_log['training_cycles']}, "
                    f"stage={self.growth_log['current_stage']}")