Agnuxo commited on
Commit
673b33e
·
verified ·
1 Parent(s): e84be9f

Upload seed/growth_engine.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. seed/growth_engine.py +340 -0
seed/growth_engine.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Growth Engine — The Master Orchestrator
3
+ ==========================================
4
+ This is the BRAIN of the seed. It orchestrates the full growth cycle:
5
+
6
+ 🌱 Plant β†’ 🌿 Sprout β†’ 🌳 Grow β†’ 🍎 Fruit
7
+
8
+ Each cycle:
9
+ 1. Harvest data (ArXiv, interactions, web)
10
+ 2. Prepare training dataset
11
+ 3. Upload to HuggingFace dataset repo
12
+ 4. Generate training script/notebook
13
+ 5. Trigger training (Kaggle/HF AutoTrain)
14
+ 6. Evaluate results
15
+ 7. Select best model (evolution)
16
+ 8. Check if ready to grow to next stage
17
+ 9. Update all state and logs
18
+ 10. Sleep and repeat
19
+
20
+ The engine is designed to run FOREVER with zero human intervention.
21
+ Like a real seed — you plant it, water it once, and it grows by itself.
22
+ """
23
+ import json
24
+ import logging
25
+ import os
26
+ import time
27
+ from datetime import datetime, timezone
28
+ from pathlib import Path
29
+
30
+ logger = logging.getLogger("seed.growth")
31
+
32
+
33
class GrowthEngine:
    """Master orchestrator for autonomous model growth.

    The engine wires together three lazily constructed collaborators
    (``harvester``, ``trainer``, ``evolver``) and drives them through the
    harvest -> prepare -> upload -> train -> evaluate phases.  Progress is
    persisted in ``<state_dir>/cycle_log.json`` so a restarted process picks
    up where the previous one stopped.
    """

    def __init__(self, hf_token: str = None, state_dir: str = "seed_state",
                 data_dir: str = "seed_data"):
        """Create the state/data directories and load the persisted cycle log.

        Args:
            hf_token: HuggingFace token; falls back to the ``HF_TOKEN``
                environment variable, then to an empty string.
            state_dir: Directory holding ``cycle_log.json`` and the results of
                the last cycle.
            data_dir: Directory handed to the harvester and trainer.
        """
        self.hf_token = hf_token or os.environ.get("HF_TOKEN", "")
        self.state_dir = Path(state_dir)
        self.data_dir = Path(data_dir)
        self.state_dir.mkdir(parents=True, exist_ok=True)
        self.data_dir.mkdir(parents=True, exist_ok=True)

        # Sub-engines are built on first access (see the properties below) so
        # constructing a GrowthEngine never imports the heavier seed.* modules.
        self._harvester = None
        self._trainer = None
        self._evolver = None

        self.cycle_log = self._load_cycle_log()

    @property
    def harvester(self):
        """Return the ``DataHarvester``, importing and building it on first use."""
        if self._harvester is None:
            from seed.data.harvester import DataHarvester
            self._harvester = DataHarvester(str(self.data_dir))
        return self._harvester

    @property
    def trainer(self):
        """Return the ``TrainingEngine``, importing and building it on first use."""
        if self._trainer is None:
            from seed.training.engine import TrainingEngine
            self._trainer = TrainingEngine(self.hf_token, str(self.data_dir), str(self.state_dir))
        return self._trainer

    @property
    def evolver(self):
        """Return the ``EvolutionEngine``, importing and building it on first use."""
        if self._evolver is None:
            from seed.evolution.selector import EvolutionEngine
            self._evolver = EvolutionEngine(self.hf_token, str(self.state_dir))
        return self._evolver

    # ==========================================================================
    # STATE PERSISTENCE
    # ==========================================================================
    @staticmethod
    def _default_cycle_log() -> dict:
        """Return a fresh cycle log with every key the engine reads or writes."""
        return {
            "total_cycles": 0,
            "last_harvest": None,
            "last_training": None,
            "last_evaluation": None,
            "current_stage": "GERMINATION",
            "total_data_harvested": 0,
            "created_at": datetime.now(timezone.utc).isoformat(),
        }

    @staticmethod
    def _write_json_atomic(path: Path, payload, **dump_kwargs) -> None:
        """Serialize *payload* to *path* without ever leaving a half-written file.

        The JSON is written to a sibling ``<name>.tmp`` file and then moved
        over the destination with ``os.replace``, so a crash mid-write leaves
        the previous file intact.
        """
        tmp_path = path.with_name(path.name + ".tmp")
        tmp_path.write_text(json.dumps(payload, indent=2, **dump_kwargs),
                            encoding="utf-8")
        os.replace(tmp_path, path)

    def _load_cycle_log(self) -> dict:
        """Load ``cycle_log.json``, falling back to defaults when unusable.

        Keys missing from a stored log are filled from the defaults so later
        code can index the log directly.

        Returns:
            The stored log merged over the defaults, or the defaults alone
            when the file is absent, unreadable, not valid JSON, or not a
            JSON object.
        """
        defaults = self._default_cycle_log()
        log_file = self.state_dir / "cycle_log.json"

        try:
            stored = json.loads(log_file.read_text(encoding="utf-8"))
        except FileNotFoundError:
            return defaults
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning(f"Could not read {log_file} ({e}); starting a fresh cycle log")
            return defaults

        if not isinstance(stored, dict):
            logger.warning(f"{log_file} does not contain a JSON object; starting a fresh cycle log")
            return defaults

        return {**defaults, **stored}

    def _save_cycle_log(self):
        """Persist the in-memory cycle log to ``cycle_log.json`` atomically."""
        self._write_json_atomic(self.state_dir / "cycle_log.json", self.cycle_log)

    # ==========================================================================
    # PHASE 1: HARVEST
    # ==========================================================================
    def harvest(self) -> dict:
        """Collect training data from all sources.

        Returns:
            The stats dict produced by ``harvester.harvest_all()``.  Its
            ``"total"`` entry (0 when absent) is added to the running
            ``total_data_harvested`` counter.
        """
        logger.info("🌾 Phase 1: HARVESTING data...")

        stats = self.harvester.harvest_all()
        # Read "total" once so the counter and the log line cannot disagree
        # (and a stats dict without "total" cannot raise after state is saved).
        new_entries = stats.get("total", 0)

        self.cycle_log["last_harvest"] = datetime.now(timezone.utc).isoformat()
        self.cycle_log["total_data_harvested"] += new_entries
        self._save_cycle_log()

        logger.info(f"🌾 Harvested {new_entries} new entries "
                    f"(total: {self.cycle_log['total_data_harvested']})")
        return stats

    # ==========================================================================
    # PHASE 2: PREPARE
    # ==========================================================================
    def prepare(self) -> dict:
        """Prepare and export the training dataset.

        Returns:
            ``{"dataset_path": <export result>, "sizes": <dataset sizes>}``.
        """
        logger.info("📦 Phase 2: PREPARING training data...")

        # Export combined dataset
        output = self.harvester.export_for_training()
        sizes = self.harvester.get_dataset_size()

        logger.info(f"📦 Dataset ready: {sizes.get('total', 0)} entries → {output}")
        return {"dataset_path": output, "sizes": sizes}

    # ==========================================================================
    # PHASE 3: UPLOAD
    # ==========================================================================
    def upload(self) -> bool:
        """Upload training data via the trainer.

        Returns:
            Whatever ``trainer.upload_training_data()`` returns (truthy on
            success).
        """
        logger.info("☁️ Phase 3: UPLOADING to HuggingFace...")

        success = self.trainer.upload_training_data()

        if success:
            logger.info("☁️ Data uploaded to Agnuxo/OpenCLAW-SEED-data")
        else:
            logger.warning("☁️ Upload failed — training can still run locally")

        return success

    # ==========================================================================
    # PHASE 4: TRAIN
    # ==========================================================================
    def train(self) -> dict:
        """Generate training scripts and attempt to trigger training.

        Note: Actual GPU training happens externally (Kaggle/HF/Colab).
        This method prepares everything and triggers what it can.

        Returns:
            A dict with ``script_generated``, ``notebook_generated``,
            ``current_stage``, ``upgrade_available`` and ``ready_to_train``;
            ``autotrain_config`` is added when the dataset is large enough.
        """
        logger.info("🔥 Phase 4: TRAINING setup...")

        # Generate training script
        script_path = self.trainer.generate_training_script()
        nb_path = self.trainer.generate_kaggle_notebook()

        # Check for growth opportunity
        upgrade = self.trainer.should_upgrade()

        # Query the stage once and reuse it for both the result and the
        # data-threshold check.
        stage = self.trainer.get_current_stage()

        result = {
            "script_generated": script_path,
            "notebook_generated": nb_path,
            "current_stage": stage,
            "upgrade_available": upgrade is not None,
        }

        # If we have enough data, try HF AutoTrain config
        dataset_size = self.harvester.get_dataset_size().get("total", 0)
        min_data = stage.get("min_data", 100)

        if dataset_size >= min_data:
            result["autotrain_config"] = self.trainer.trigger_hf_autotrain()
            result["ready_to_train"] = True
            # A stage dict without "name" must not abort an otherwise ready run.
            stage_name = stage.get("name", stage.get("stage", "current stage"))
            logger.info(f"🔥 Ready to train! {dataset_size} entries for {stage_name}")
        else:
            result["ready_to_train"] = False
            needed = min_data - dataset_size
            logger.info(f"🔥 Need {needed} more entries before training")

        self.cycle_log["last_training"] = datetime.now(timezone.utc).isoformat()
        self._save_cycle_log()

        return result

    # ==========================================================================
    # PHASE 5: EVALUATE & EVOLVE
    # ==========================================================================
    def evaluate(self) -> dict:
        """Evaluate the most recently published models and apply evolution.

        Only the last five entries of ``trainer.growth_log["models_published"]``
        are scored; a model whose evaluation raises is logged and skipped.

        Returns:
            ``{"candidates_evaluated", "best", "growth_signal"}`` when at least
            one model was scored, otherwise
            ``{"candidates_evaluated": 0, "message": ...}``.
        """
        logger.info("🧪 Phase 5: EVALUATING...")

        # Get published models
        published = self.trainer.growth_log.get("models_published", [])

        candidates = []
        for model in published[-5:]:  # Last 5 models
            try:
                score = self.evolver.evaluate_model(model)
                candidates.append(score)
                logger.info(f" Evaluated {model}: {score.get('overall', 0):.3f}")
            except Exception as e:
                # Best effort: one broken model must not block the others.
                logger.warning(f" Failed to evaluate {model}: {e}")

        if not candidates:
            return {"candidates_evaluated": 0, "message": "No models to evaluate yet"}

        best = self.evolver.select_best(candidates)

        # Check growth signal
        growth_signal = self.evolver.should_grow()
        if growth_signal:
            logger.info(f"🌳 GROWTH SIGNAL: {growth_signal} — Time to upgrade!")

        self.cycle_log["last_evaluation"] = datetime.now(timezone.utc).isoformat()
        self._save_cycle_log()

        return {
            "candidates_evaluated": len(candidates),
            "best": best,
            "growth_signal": growth_signal,
        }

    # ==========================================================================
    # FULL CYCLE
    # ==========================================================================
    def run_cycle(self) -> dict:
        """Execute one complete growth cycle.

        This is the heartbeat of the seed.  Each phase runs in isolation: a
        failing phase is logged and recorded as ``{"error": "<message>"}`` and
        the remaining phases still run.

        Returns:
            ``{"cycle": <number>, "timestamp": <iso time>, "phases": {...}}``
            with one entry per phase.  The same dict is written to
            ``last_growth_cycle.json`` in the state directory.
        """
        self.cycle_log["total_cycles"] += 1
        cycle_num = self.cycle_log["total_cycles"]

        logger.info(f"{'='*60}")
        logger.info(f"🌱 SEED Growth Cycle #{cycle_num}")
        logger.info(f" Stage: {self.cycle_log['current_stage']}")
        logger.info(f" Time: {datetime.now(timezone.utc).isoformat()}")
        logger.info(f"{'='*60}")

        results = {
            "cycle": cycle_num,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "phases": {}
        }

        # (result key, label used in the failure message, phase callable)
        phases = (
            ("harvest", "Harvest", self.harvest),
            ("prepare", "Prepare", self.prepare),
            ("upload", "Upload", self.upload),
            ("train", "Train setup", self.train),
            ("evaluate", "Evaluate", self.evaluate),
        )
        for key, label, run_phase in phases:
            try:
                results["phases"][key] = run_phase()
            except Exception as e:
                # logger.exception keeps the traceback for later diagnosis.
                logger.exception(f"{label} failed: {e}")
                results["phases"][key] = {"error": str(e)}

        # Update stage.  Guarded so a trainer failure here cannot discard the
        # results already gathered for this cycle.
        try:
            stage = self.trainer.get_current_stage()
            self.cycle_log["current_stage"] = stage.get("stage", "GERMINATION")
        except Exception as e:
            logger.exception(f"Stage update failed: {e}")
        self._save_cycle_log()

        # Save cycle results (default=str stringifies anything JSON cannot encode)
        self._write_json_atomic(self.state_dir / "last_growth_cycle.json",
                                results, default=str)

        logger.info(f"{'='*60}")
        logger.info(f"🌱 Cycle #{cycle_num} complete!")
        logger.info(f" Data: {self.cycle_log['total_data_harvested']} total entries")
        logger.info(f" Stage: {self.cycle_log['current_stage']}")
        logger.info(f"{'='*60}")

        return results

    def get_status(self) -> dict:
        """Get full status of the seed.

        Dataset sizes and evolution status are best effort: if the matching
        sub-engine cannot be built or queried, the field is an empty dict and
        the failure is logged.
        """
        data_sizes = {}
        try:
            data_sizes = self.harvester.get_dataset_size()
        except Exception as e:
            logger.warning(f"Could not read dataset sizes: {e}")

        evolution_status = {}
        try:
            evolution_status = self.evolver.get_status()
        except Exception as e:
            logger.warning(f"Could not read evolution status: {e}")

        return {
            "seed_version": "1.0.0",
            "codename": "Apple Seed",
            "current_stage": self.cycle_log.get("current_stage", "GERMINATION"),
            "total_cycles": self.cycle_log.get("total_cycles", 0),
            "total_data": self.cycle_log.get("total_data_harvested", 0),
            "dataset_files": data_sizes,
            "evolution": evolution_status,
            "last_harvest": self.cycle_log.get("last_harvest"),
            "last_training": self.cycle_log.get("last_training"),
            "last_evaluation": self.cycle_log.get("last_evaluation"),
            "created": self.cycle_log.get("created_at"),
        }

    def run_forever(self, interval_hours: float = 6):
        """Run the growth cycle forever, sleeping between cycles.

        Any ``Exception`` raised by a cycle is logged and the loop continues;
        ``KeyboardInterrupt`` and other ``BaseException`` subclasses propagate
        and stop the loop.

        Args:
            interval_hours: Pause between cycles, in hours.

        Raises:
            ValueError: If ``interval_hours`` is negative.
        """
        # Fail before the first cycle instead of at the first time.sleep call.
        if interval_hours < 0:
            raise ValueError("interval_hours must be non-negative")
        sleep_seconds = interval_hours * 3600

        logger.info("🌱 SEED planted! Beginning autonomous growth...")
        logger.info(f" Growth cycle interval: {interval_hours}h")

        while True:
            try:
                self.run_cycle()
            except Exception as e:
                logger.exception(f"Cycle error (will retry): {e}")

            logger.info(f"💤 Sleeping {interval_hours}h until next growth cycle...")
            time.sleep(sleep_seconds)