Agnuxo committed on
Commit
a82b313
·
verified ·
1 Parent(s): 55b912b

Upload seed/data/harvester.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. seed/data/harvester.py +387 -0
seed/data/harvester.py ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Harvester — Autonomous Training Data Collector
3
+ =====================================================
4
+ Collects, cleans, and formats data for continuous self-training.
5
+
6
+ Sources:
7
+ - ArXiv papers (abstracts + full text from PMC)
8
+ - Agent interaction logs (what worked, what didn't)
9
+ - Semantic Scholar (related research)
10
+ - Wikipedia (foundational knowledge)
11
+ - Code from GitHub repos (for code understanding)
12
+
13
+ Output format: JSONL instruction-following pairs
14
+ {"instruction": "...", "input": "...", "output": "..."}
15
+ """
16
+ import json
17
+ import logging
18
+ import hashlib
19
+ import urllib.request
20
+ import urllib.parse
21
+ import xml.etree.ElementTree as ET
22
+ from datetime import datetime, timezone
23
+ from pathlib import Path
24
+ from typing import Optional
25
+
26
+ logger = logging.getLogger("seed.harvester")
27
+
28
+ DATA_DIR = Path("seed_data")
29
+
30
+
31
class DataHarvester:
    """Autonomous training data collector.

    Pulls text from several sources (ArXiv, local agent state files,
    Semantic Scholar, GitHub repository metadata), turns each item into one
    or more instruction-following records and appends them as JSONL files
    under ``data_dir``. Items are de-duplicated across runs through a set of
    MD5 hashes persisted in ``seen_hashes.json``.

    Every record has the keys ``instruction``, ``input``, ``output``,
    ``source`` and ``topic``.
    """

    # User-Agent header sent with every outgoing HTTP request.
    USER_AGENT = "SEED-Harvester/1.0"
    # Maximum number of hashes written to seen_hashes.json (most recent kept).
    MAX_SEEN_HASHES = 10000

    DEFAULT_ARXIV_QUERIES = (
        "neuromorphic computing",
        "physics-based neural network",
        "holographic neural network",
        "consciousness emergence artificial intelligence",
        "distributed neural network P2P",
        "ASIC accelerated machine learning",
        "optical computing neural",
        "reservoir computing thermodynamic",
        "AGI architecture",
        "self-improving artificial intelligence",
    )
    DEFAULT_SCHOLAR_QUERIES = (
        "neuromorphic AGI",
        "self-improving neural network",
        "physics simulation deep learning",
    )

    def __init__(self, data_dir: str = "seed_data"):
        """Create the data directory if needed and load the persisted hashes."""
        self.data_dir = Path(data_dir)
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.seen_hashes = set()
        # Insertion order of the hashes; a set alone cannot tell which
        # entries are the oldest when the persisted file has to be capped.
        self._seen_order = []
        self._load_seen()

    # =========================================================================
    # Shared helpers
    # =========================================================================
    def _load_seen(self):
        """Load already-harvested data hashes.

        A missing file is normal (first run). An unreadable or malformed file
        is logged and ignored so that harvesting can still proceed.
        """
        seen_file = self.data_dir / "seen_hashes.json"
        if not seen_file.exists():
            return
        try:
            loaded = json.loads(seen_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            logger.warning("Could not load %s: %s", seen_file, exc)
            return
        if not isinstance(loaded, list):
            logger.warning("Ignoring %s: expected a JSON list", seen_file)
            return
        # dict.fromkeys de-duplicates while preserving the stored order.
        self._seen_order = list(
            dict.fromkeys(h for h in loaded if isinstance(h, str))
        )
        self.seen_hashes = set(self._seen_order)

    def _save_seen(self):
        """Persist the most recent ``MAX_SEEN_HASHES`` hashes, oldest first."""
        tracked = set(self._seen_order)
        # Hashes added to the public set directly (bypassing _is_new) have no
        # recorded age; treat them as the newest, in a deterministic order.
        untracked = sorted(self.seen_hashes - tracked)
        self._seen_order = [
            h for h in self._seen_order if h in self.seen_hashes
        ] + untracked
        seen_file = self.data_dir / "seen_hashes.json"
        seen_file.write_text(
            json.dumps(self._seen_order[-self.MAX_SEEN_HASHES:]),
            encoding="utf-8",
        )

    def _hash(self, text: str) -> str:
        """Return the MD5 hex digest of *text* (de-duplication key only)."""
        # MD5 is kept for compatibility with existing seen_hashes.json files;
        # usedforsecurity=False lets it run on FIPS-restricted builds.
        return hashlib.md5(
            text.encode("utf-8"), usedforsecurity=False
        ).hexdigest()

    def _is_new(self, text: str) -> bool:
        """Return True the first time *text* is seen and record its hash."""
        h = self._hash(text)
        if h in self.seen_hashes:
            return False
        self.seen_hashes.add(h)
        self._seen_order.append(h)
        return True

    def _append_data(self, filename: str, entries: list[dict]):
        """Append entries to a JSONL file."""
        filepath = self.data_dir / filename
        # Explicit UTF-8: entries are dumped with ensure_ascii=False, which
        # would fail under a non-UTF-8 locale default encoding.
        with open(filepath, "a", encoding="utf-8") as f:
            f.writelines(
                json.dumps(entry, ensure_ascii=False) + "\n"
                for entry in entries
            )

    def _http_get(self, url: str, timeout: float) -> str:
        """Fetch *url* and return the response body decoded as UTF-8."""
        req = urllib.request.Request(
            url, headers={"User-Agent": self.USER_AGENT}
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return resp.read().decode("utf-8")

    @staticmethod
    def _read_json(path: Path):
        """Return the parsed JSON content of *path*.

        Returns None when the file does not exist, or when it cannot be read
        or parsed (the latter case is logged).
        """
        if not path.exists():
            return None
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            logger.warning("Could not read %s: %s", path, exc)
            return None

    # =========================================================================
    # SOURCE 1: ArXiv Papers
    # =========================================================================
    def harvest_arxiv(self, queries: Optional[list[str]] = None,
                      max_per_query: int = 20) -> int:
        """Harvest training data from ArXiv papers.

        Args:
            queries: Search phrases; defaults to ``DEFAULT_ARXIV_QUERIES``.
            max_per_query: Maximum number of papers requested per query.

        Returns:
            Number of entries appended to ``arxiv_training.jsonl`` (three per
            previously unseen paper title).
        """
        if queries is None:
            queries = list(self.DEFAULT_ARXIV_QUERIES)

        entries = []
        for query in queries:
            try:
                for paper in self._fetch_arxiv(query, max_per_query):
                    if self._is_new(paper["title"]):
                        entries.extend(self._arxiv_entries(paper, query))
            except Exception as e:
                # Best-effort boundary: one failing query (network error,
                # malformed feed) must not abort the remaining queries.
                logger.warning("ArXiv harvest for '%s' failed: %s", query, e)

        if entries:
            self._append_data("arxiv_training.jsonl", entries)
            logger.info("Harvested %d entries from ArXiv", len(entries))

        self._save_seen()
        return len(entries)

    def _arxiv_entries(self, paper: dict, query: str) -> list[dict]:
        """Build the three instruction-following pairs for one paper."""
        title = paper["title"]
        abstract = paper["abstract"]
        return [
            # 1. Summarization task
            {
                "instruction": "Summarize this research paper in 2-3 sentences.",
                "input": f"Title: {title}\nAbstract: {abstract}",
                "output": self._generate_summary(paper),
                "source": "arxiv",
                "topic": query,
            },
            # 2. Q&A about the paper
            {
                "instruction": "What is the main contribution of this paper?",
                "input": f"{title}",
                "output": f"The paper '{title}' by {', '.join(paper['authors'][:3])} "
                          f"contributes to the field by: {abstract[:300]}",
                "source": "arxiv",
                "topic": query,
            },
            # 3. Research connection
            {
                "instruction": "How does this research relate to physics-based neural computing and the path to AGI?",
                "input": f"Paper: {title}\nField: {query}",
                "output": f"This research on {query} connects to AGI through {title.lower()}. "
                          f"The key insight is that {abstract[:200]}. "
                          f"This advances our understanding of how physical processes can be leveraged "
                          f"for more efficient and biologically-plausible neural computation.",
                "source": "arxiv",
                "topic": query,
            },
        ]

    def _fetch_arxiv(self, query: str, max_results: int) -> list[dict]:
        """Fetch papers from ArXiv API.

        Returns a list of dicts with keys ``title``, ``abstract`` and
        ``authors``, newest submissions first. Network and XML parse errors
        propagate to the caller.
        """
        params = urllib.parse.urlencode({
            "search_query": f'all:"{query}"',
            "start": 0,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending",
        })
        url = f"http://export.arxiv.org/api/query?{params}"
        return self._parse_arxiv_feed(self._http_get(url, timeout=30))

    @staticmethod
    def _parse_arxiv_feed(xml_text: str) -> list[dict]:
        """Extract papers from an Atom feed, skipping incomplete entries."""
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        papers = []
        for entry in ET.fromstring(xml_text).findall("atom:entry", ns):
            # findtext tolerates a missing element or empty text, so a single
            # malformed entry no longer discards the whole result page.
            title = (entry.findtext("atom:title", "", ns) or "").strip().replace("\n", " ")
            abstract = (entry.findtext("atom:summary", "", ns) or "").strip().replace("\n", " ")
            if not title or not abstract:
                continue
            authors = []
            for author in entry.findall("atom:author", ns):
                name = author.findtext("atom:name", "", ns)
                if name:
                    authors.append(name)
            papers.append({"title": title, "abstract": abstract, "authors": authors})
        return papers

    def _generate_summary(self, paper: dict) -> str:
        """Generate a basic summary from paper metadata.

        The summary is the first two ". "-separated sentences of the
        abstract, terminated with a period.
        """
        sentences = paper["abstract"].split(". ")
        summary = ". ".join(sentences[:2])
        if not summary.endswith("."):
            summary += "."
        return summary

    # =========================================================================
    # SOURCE 2: Agent Interaction Logs (Self-Experience)
    # =========================================================================
    def harvest_agent_logs(self, state_dir: str = "state") -> int:
        """Convert agent interaction history into training data.

        Reads ``post_history.json`` and ``strategy_report.json`` from
        *state_dir*. Missing files are skipped silently; unreadable files and
        malformed records are skipped (unreadable files are logged).

        Returns:
            Number of entries appended to ``self_experience.jsonl``.
        """
        entries = []
        state_path = Path(state_dir)

        # Learn from post history
        posts = self._read_json(state_path / "post_history.json")
        if isinstance(posts, list):
            for post in posts:
                if not isinstance(post, dict):
                    continue
                content = post.get("content", "")
                ptype = post.get("type", "research")
                if not isinstance(content, str) or not content:
                    continue
                if not self._is_new(content):
                    continue
                entries.append({
                    "instruction": f"Write a {ptype} social media post about AGI research.",
                    "input": "",
                    "output": content,
                    "source": "self_experience",
                    "topic": ptype,
                })

        # Learn from strategy reports
        report = self._read_json(state_path / "strategy_report.json")
        if isinstance(report, dict):
            insights = report.get("insights") or []
            strategy = report.get("strategy") or {}
            actions = []
            if isinstance(strategy, dict):
                actions = strategy.get("actions") or []
            if isinstance(insights, list) and insights and isinstance(actions, list):
                metrics = json.dumps(report.get("metrics", {}))
                output = ("\n".join(map(str, insights)) + "\n\nRecommended: " +
                          "\n".join(map(str, actions)))
                # De-duplicate so an unchanged report is not appended again
                # on every harvest run.
                if self._is_new(metrics + "\n" + output):
                    entries.append({
                        "instruction": "Analyze your performance and suggest improvements.",
                        "input": metrics,
                        "output": output,
                        "source": "self_reflection",
                        "topic": "meta-learning",
                    })

        if entries:
            self._append_data("self_experience.jsonl", entries)
            logger.info("Harvested %d entries from agent logs", len(entries))

        self._save_seen()
        return len(entries)

    # =========================================================================
    # SOURCE 3: Semantic Scholar (Free API)
    # =========================================================================
    def harvest_semantic_scholar(self, queries: Optional[list[str]] = None,
                                 max_per_query: int = 10) -> int:
        """Harvest from Semantic Scholar's free API.

        Args:
            queries: Search phrases; defaults to ``DEFAULT_SCHOLAR_QUERIES``.
                Only the first five are used.
            max_per_query: Value of the API ``limit`` parameter per query.

        Returns:
            Number of entries appended to ``semantic_scholar.jsonl``.
        """
        if queries is None:
            queries = list(self.DEFAULT_SCHOLAR_QUERIES)

        entries = []
        for query in queries[:5]:
            try:
                encoded = urllib.parse.quote(query)
                url = (f"https://api.semanticscholar.org/graph/v1/paper/search?"
                       f"query={encoded}&limit={max_per_query}"
                       f"&fields=title,abstract,authors,year,citationCount")
                data = json.loads(self._http_get(url, timeout=15))

                for paper in data.get("data") or []:
                    entry = self._scholar_entry(paper, query)
                    if entry is None or not self._is_new(paper["title"]):
                        continue
                    entries.append(entry)

            except Exception as e:
                # Best-effort boundary: keep going with the next query.
                logger.warning("Semantic Scholar '%s': %s", query, e)

        if entries:
            self._append_data("semantic_scholar.jsonl", entries)
            logger.info("Harvested %d from Semantic Scholar", len(entries))

        self._save_seen()
        return len(entries)

    @staticmethod
    def _scholar_entry(paper: dict, query: str) -> Optional[dict]:
        """Build one record from a search result, or None if unusable.

        The API may return JSON ``null`` for any requested field, so every
        field is normalised before it is formatted.
        """
        if not isinstance(paper, dict):
            return None
        title = paper.get("title") or ""
        abstract = paper.get("abstract") or ""
        if not title or not abstract:
            return None

        authors = [
            a.get("name") or ""
            for a in (paper.get("authors") or [])[:3]
            if isinstance(a, dict)
        ]
        year = paper.get("year")
        if year is None:
            year = "?"
        citations = paper.get("citationCount") or 0

        return {
            "instruction": "Explain this research and its significance for AGI.",
            "input": f"Title: {title}\nAuthors: {', '.join(authors)}\nYear: {year}\nCitations: {citations}",
            "output": f"The paper '{title}' ({year}) explores: {abstract[:400]}",
            "source": "semantic_scholar",
            "topic": query,
        }

    # =========================================================================
    # SOURCE 4: Own Research (GitHub repos as training data)
    # =========================================================================
    def harvest_own_research(self, github_user: str = "Agnuxo1") -> int:
        """Harvest training data from our own GitHub repos.

        Uses the name and description of each public repository of
        *github_user* (up to 100, most recently updated first).

        Returns:
            Number of entries appended to ``own_research.jsonl``.
        """
        entries = []
        try:
            user = urllib.parse.quote(github_user, safe="")
            url = f"https://api.github.com/users/{user}/repos?per_page=100&sort=updated"
            repos = json.loads(self._http_get(url, timeout=15))

            for repo in repos:
                entry = self._repo_entry(repo, github_user)
                if entry is None or not self._is_new(repo["name"]):
                    continue
                entries.append(entry)

        except Exception as e:
            # Best-effort boundary: a GitHub failure yields zero entries.
            logger.warning("GitHub harvest: %s", e)

        if entries:
            self._append_data("own_research.jsonl", entries)
            logger.info("Harvested %d from own research", len(entries))

        self._save_seen()
        return len(entries)

    @staticmethod
    def _repo_entry(repo: dict, github_user: str) -> Optional[dict]:
        """Build one record from a repository object, or None if unusable."""
        if not isinstance(repo, dict):
            return None
        name = repo.get("name") or ""
        desc = (repo.get("description") or "").strip()
        if not name or not desc:
            return None

        stars = repo.get("stargazers_count") or 0
        # GitHub sends "language": null when no language is detected, so a
        # .get() default would never apply; fall back explicitly.
        lang = repo.get("language") or "Unknown"
        # Avoid a doubled period when the description already ends a sentence.
        sentence = desc if desc.endswith((".", "!", "?")) else desc + "."

        # Create Q&A about our own technology
        return {
            "instruction": "Describe this OpenCLAW research project.",
            "input": f"Repository: {name}",
            "output": f"{name} is a {lang} project with {stars} stars. {sentence} "
                      f"This is part of the OpenCLAW ecosystem by Francisco Angulo de Lafuente, "
                      f"advancing physics-based neural computing towards AGI. "
                      f"Repository: https://github.com/{github_user}/{name}",
            "source": "own_research",
            "topic": "openclaw",
        }

    # =========================================================================
    # MASTER HARVEST
    # =========================================================================
    def harvest_all(self) -> dict:
        """Run all harvesters and return statistics.

        The returned dict (also written to ``harvest_stats.json``) holds a
        UTC ISO-8601 ``timestamp``, one count per source and their ``total``.
        """
        timestamp = datetime.now(timezone.utc).isoformat()
        # Dict literals evaluate in order, so the harvesters run in sequence.
        counts = {
            "arxiv": self.harvest_arxiv(),
            "agent_logs": self.harvest_agent_logs(),
            "semantic_scholar": self.harvest_semantic_scholar(),
            "own_research": self.harvest_own_research(),
        }
        stats = {"timestamp": timestamp, **counts, "total": sum(counts.values())}

        # Save stats
        stats_file = self.data_dir / "harvest_stats.json"
        stats_file.write_text(json.dumps(stats, indent=2), encoding="utf-8")

        logger.info("Total harvest: %d training entries", stats["total"])
        return stats

    def get_dataset_size(self) -> dict:
        """Count total training entries across all files.

        Returns a mapping of each ``*.jsonl`` file name in the data directory
        to its line count, plus a ``"total"`` key with the sum.
        """
        sizes = {}
        total = 0
        for path in self.data_dir.glob("*.jsonl"):
            # Context manager closes the handle; the original left it open.
            with open(path, encoding="utf-8") as fp:
                count = sum(1 for _ in fp)
            sizes[path.name] = count
            total += count
        sizes["total"] = total
        return sizes

    def export_for_training(self, output_file: str = "training_dataset.jsonl") -> str:
        """Combine all harvested data into a single training file.

        Every ``*.jsonl`` file in the data directory except *output_file* is
        read; each valid record is reduced to ``instruction``/``input``/
        ``output``, the records are shuffled and *output_file* is overwritten.

        Returns:
            Path of the written file as a string.
        """
        import random

        output_path = self.data_dir / output_file
        entries = []

        for path in self.data_dir.glob("*.jsonl"):
            if path.name == output_file:
                continue
            with open(path, encoding="utf-8") as fp:
                for line in fp:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        entry = json.loads(line)
                        # Standardize format for training
                        entries.append({
                            "instruction": entry.get("instruction", ""),
                            "input": entry.get("input", ""),
                            "output": entry.get("output", ""),
                        })
                    except (ValueError, AttributeError):
                        # Invalid JSON or a non-object line: skip it.
                        continue

        # Shuffle for training
        random.shuffle(entries)

        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(entry, ensure_ascii=False) + "\n"
                for entry in entries
            )

        logger.info("Exported %d entries to %s", len(entries), output_path)
        return str(output_path)