mike1210 committed on
Commit
4971a7f
·
verified ·
1 Parent(s): e9a0cd5

Upload scripts/collect_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/collect_data.py +524 -0
scripts/collect_data.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Training Data Collection Pipeline for Crowe Logic Mini
4
+ Target: 1-2 billion tokens from scientific and domain-specific sources
5
+
6
+ Data sources:
7
+ 1. Public datasets (The Pile, RedPajama, arXiv, Wikipedia) - 1.5B tokens
8
+ 2. Domain-specific scraping (mycology, drug discovery) - 200M tokens
9
+ 3. Proprietary data (Southwest Mushrooms, CrowLogic) - 20M tokens
10
+ 4. Curated examples - 30M tokens
11
+ """
12
+
13
+ import os
14
+ import json
15
+ import requests
16
+ import subprocess
17
+ from pathlib import Path
18
+ from typing import List, Dict, Optional
19
+ from dataclasses import dataclass
20
+ from tqdm import tqdm
21
+ import hashlib
22
+
23
+
24
@dataclass
class DataSource:
    """Metadata record describing one training-data source.

    Instances are plain value objects; the pipeline only stores them in a
    list.  The "allowed values" listed below are conventions used by the
    entries in this file and are not enforced at runtime.
    """

    # Human-readable label for the source.
    name: str
    # Where the data lives, or None when there is no single location.
    url: Optional[str]
    # Rough number of tokens the source is expected to contribute.
    estimated_tokens: int
    # One of: "critical", "high", "medium", "low".
    priority: str
    # One of: "download", "api", "scrape", "manual".
    collection_method: str
    # Collection progress marker; every source starts out as "pending".
    status: str = "pending"
33
+
34
+
35
class DataCollectionPipeline:
    """Plan and track the collection of 1-2B tokens of training data.

    Most "download"/"create" methods do not fetch anything themselves: they
    create the destination directory, print the shell commands or sources a
    human should use, and return a small status dictionary.
    ``check_existing_data`` is the only method that inspects what is already
    on disk.
    """

    def __init__(self, output_dir: str = "./data/raw", target_tokens: int = 1_500_000_000):
        """Create the output directory and register the known data sources.

        Args:
            output_dir: Root directory under which each source gets a subfolder.
            target_tokens: Total number of tokens the collection aims for.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.target_tokens = target_tokens
        # Refreshed by check_existing_data() with the on-disk estimate.
        self.collected_tokens = 0

        self.sources = self._define_data_sources()

    @staticmethod
    def _print_banner(*titles: str) -> None:
        """Print a 70-column '=' rule, the given title lines, and a closing rule."""
        rule = "=" * 70
        print("\n" + rule)
        for title in titles:
            print(title)
        print(rule)

    def _define_data_sources(self) -> "List[DataSource]":
        """Return the static catalogue of data sources with their metadata."""

        return [
            # ===== PUBLIC DATASETS (Automated) =====
            DataSource(
                name="Wikipedia Science",
                url="https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
                estimated_tokens=500_000_000,
                priority="critical",
                collection_method="download",
            ),
            DataSource(
                name="arXiv Papers",
                url="s3://arxiv/",  # AWS S3 bucket
                estimated_tokens=300_000_000,
                priority="critical",
                collection_method="download",
            ),
            DataSource(
                name="The Pile - arXiv subset",
                url="https://the-eye.eu/public/AI/pile/train/",
                estimated_tokens=200_000_000,
                priority="high",
                collection_method="download",
            ),
            DataSource(
                name="The Pile - PubMed subset",
                url="https://the-eye.eu/public/AI/pile/train/",
                estimated_tokens=150_000_000,
                priority="high",
                collection_method="download",
            ),
            DataSource(
                name="PubMed Abstracts",
                url="https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/",
                estimated_tokens=200_000_000,
                priority="high",
                collection_method="download",
            ),
            DataSource(
                name="RedPajama - Wikipedia",
                url="https://data.together.xyz/redpajama-data-1T/v1.0.0/",
                estimated_tokens=100_000_000,
                priority="medium",
                collection_method="download",
            ),
            # ===== DOMAIN-SPECIFIC SOURCES =====
            DataSource(
                name="Mycology Literature",
                url=None,  # Multiple sources
                estimated_tokens=50_000_000,
                priority="critical",
                collection_method="scrape",
            ),
            DataSource(
                name="Drug Discovery Papers",
                url="https://www.ebi.ac.uk/chembl/",
                estimated_tokens=50_000_000,
                priority="critical",
                collection_method="api",
            ),
            DataSource(
                name="AI/ML Papers (arXiv cs.AI)",
                url="https://arxiv.org/list/cs.AI/recent",
                estimated_tokens=100_000_000,
                priority="high",
                collection_method="api",
            ),
            DataSource(
                name="GitHub AI Documentation",
                url="https://github.com/",
                estimated_tokens=50_000_000,
                priority="medium",
                collection_method="api",
            ),
            # ===== PROPRIETARY DATA =====
            DataSource(
                name="Southwest Mushrooms Data",
                url=None,
                estimated_tokens=10_000_000,
                priority="critical",
                collection_method="manual",
            ),
            DataSource(
                name="CrowLogic Documentation",
                url=None,
                estimated_tokens=5_000_000,
                priority="critical",
                collection_method="manual",
            ),
            DataSource(
                name="Prologic Methodology Examples",
                url=None,
                estimated_tokens=5_000_000,
                priority="critical",
                collection_method="manual",
            ),
            # ===== CURATED EXAMPLES =====
            DataSource(
                name="Chain-of-Thought Examples",
                url=None,
                estimated_tokens=10_000_000,
                priority="high",
                collection_method="manual",
            ),
            DataSource(
                name="Domain Q&A Pairs",
                url=None,
                estimated_tokens=20_000_000,
                priority="high",
                collection_method="manual",
            ),
        ]

    def download_wikipedia(self) -> Dict:
        """Print the commands for fetching and extracting the Wikipedia dump.

        Creates ``<output_dir>/wikipedia`` but downloads nothing itself.

        Returns:
            A status dict with ``status``, ``instructions`` and
            ``estimated_tokens`` keys.
        """
        self._print_banner("Downloading Wikipedia Science Articles")

        wiki_dir = self.output_dir / "wikipedia"
        wiki_dir.mkdir(parents=True, exist_ok=True)

        # Latest English Wikipedia articles dump.
        dump_url = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"
        dump_file = wiki_dir / "enwiki-latest.xml.bz2"

        print(f"\n📥 Downloading from: {dump_url}")
        print(f" Destination: {dump_file}")
        print(" Size: ~20GB (compressed), ~80GB (uncompressed)")
        print("\n⚠️  This will take 1-4 hours depending on connection speed")
        print("\nCommands to run:")
        print(f" wget {dump_url} -O {dump_file}")
        print(f" python -m wikiextractor.WikiExtractor {dump_file} -o {wiki_dir / 'extracted'} --json")

        return {
            "status": "manual_steps_needed",
            "instructions": "Run wget and WikiExtractor commands above",
            "estimated_tokens": 500_000_000,
        }

    def download_arxiv(self) -> Dict:
        """Print the available options for obtaining arXiv papers.

        Creates ``<output_dir>/arxiv`` but downloads nothing itself.

        Returns:
            A status dict with ``status``, ``instructions`` and
            ``estimated_tokens`` keys.
        """
        self._print_banner("Downloading arXiv Papers")

        arxiv_dir = self.output_dir / "arxiv"
        arxiv_dir.mkdir(parents=True, exist_ok=True)

        print("\nOptions for arXiv data:")
        print("\n1. Bulk download from S3 (recommended):")
        # The arXiv bucket is requester-pays, so anonymous (--no-sign-request)
        # access is rejected; point the command at the configured directory
        # rather than a hard-coded ./data/raw path.
        print(f" aws s3 sync s3://arxiv/src/ {arxiv_dir}/ --request-payer requester")
        print("\n2. Use arXiv API:")
        print(" pip install arxiv")
        print(" python scripts/download_arxiv_api.py")
        print("\n3. Use existing preprocessed datasets:")
        print(" - RedPajama arXiv subset")
        print(" - The Pile arXiv subset")

        return {
            "status": "manual_steps_needed",
            "instructions": "Choose one of the methods above",
            "estimated_tokens": 300_000_000,
        }

    def download_pubmed(self) -> Dict:
        """Print the command for mirroring the PubMed baseline files.

        Creates ``<output_dir>/pubmed`` but downloads nothing itself.

        Returns:
            A status dict with ``status``, ``instructions`` and
            ``estimated_tokens`` keys.
        """
        self._print_banner("Downloading PubMed Abstracts")

        pubmed_dir = self.output_dir / "pubmed"
        pubmed_dir.mkdir(parents=True, exist_ok=True)

        base_url = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/"

        print("\n📥 PubMed Baseline Files")
        print(f" URL: {base_url}")
        print(" Files: pubmed24n*.xml.gz (1000+ files)")
        print(" Total size: ~30GB")
        print("\nCommand to download:")
        print(f" wget -r -np -nd -A 'pubmed24n*.xml.gz' {base_url} -P {pubmed_dir}")

        return {
            "status": "manual_steps_needed",
            "instructions": "Run wget command above",
            "estimated_tokens": 200_000_000,
        }

    def download_the_pile_subset(self, subset: str = "arxiv") -> Dict:
        """Print where to obtain a subset of The Pile and where to save it.

        Creates ``<output_dir>/the_pile/<subset>`` but downloads nothing itself.

        Args:
            subset: Name of the subset; used for the directory name and in
                the printed/returned text only.

        Returns:
            A status dict with ``status``, ``instructions`` and
            ``estimated_tokens`` keys (the estimate is the same fixed value
            for every subset).
        """
        self._print_banner(f"Downloading The Pile - {subset} subset")

        pile_dir = self.output_dir / "the_pile" / subset
        pile_dir.mkdir(parents=True, exist_ok=True)

        print("\nThe Pile subsets available:")
        for label in (
            "ArXiv",
            "PubMed Abstracts",
            "PubMed Central",
            "FreeLaw",
            "USPTO Backgrounds",
            "Wikipedia (en)",
        ):
            print(f" - {label}")
        print("\nDownload from: https://the-eye.eu/public/AI/pile/train/")
        print(f"Save to: {pile_dir}")

        return {
            "status": "manual_steps_needed",
            "instructions": f"Download {subset} subset from The Pile",
            "estimated_tokens": 200_000_000,
        }

    def create_mycology_corpus(self) -> Dict:
        """Print the suggested mycology sources and on-disk layout.

        Creates ``<output_dir>/mycology`` but collects nothing itself.

        Returns:
            A status dict with ``status``, ``directory`` and
            ``estimated_tokens`` keys.
        """
        self._print_banner("Collecting Mycology Domain Data")

        myco_dir = self.output_dir / "mycology"
        myco_dir.mkdir(parents=True, exist_ok=True)

        sources = {
            "MushroomExpert.com": "http://www.mushroomexpert.com/",
            "Shroomery": "https://www.shroomery.org/",
            "MycoWorks Papers": "Research papers on fungal materials",
            "Cultivation Guides": "Paul Stamets, Tradd Cotter books",
            "Scientific Papers": "Search PubMed/arXiv for mycology",
            "Southwest Mushrooms Data": "Your proprietary cultivation data",
        }

        print("\nMycology data sources:")
        for name, desc in sources.items():
            print(f" ✓ {name}: {desc}")

        print(f"\nSave all mycology text to: {myco_dir}")
        print("\nRecommended structure:")
        print(" mycology/")
        print(" ├── cultivation_guides.txt")
        print(" ├── species_descriptions.txt")
        print(" ├── scientific_papers.txt")
        print(" ├── forum_discussions.txt")
        print(" └── southwest_mushrooms.txt")

        return {
            "status": "manual_collection_needed",
            "directory": str(myco_dir),
            "estimated_tokens": 50_000_000,
        }

    def create_drug_discovery_corpus(self) -> Dict:
        """Print the suggested drug-discovery sources.

        Creates ``<output_dir>/drug_discovery`` but collects nothing itself.

        Returns:
            A status dict with ``status``, ``directory`` and
            ``estimated_tokens`` keys.
        """
        self._print_banner("Collecting Drug Discovery Domain Data")

        drug_dir = self.output_dir / "drug_discovery"
        drug_dir.mkdir(parents=True, exist_ok=True)

        sources = {
            "ChEMBL": "https://www.ebi.ac.uk/chembl/ (API available)",
            "PubChem": "https://pubchem.ncbi.nlm.nih.gov/",
            "DrugBank": "https://www.drugbank.com/",
            "Clinical Trials": "https://clinicaltrials.gov/",
            "Patents": "USPTO chemical patents",
            "Papers": "PubMed chemistry/pharmacology papers",
        }

        print("\nDrug discovery data sources:")
        for name, url in sources.items():
            print(f" ✓ {name}: {url}")

        print(f"\nSave to: {drug_dir}")

        return {
            "status": "manual_collection_needed",
            "directory": str(drug_dir),
            "estimated_tokens": 50_000_000,
        }

    def estimate_tokens(self, text_file: Path) -> int:
        """Estimate the token count of a file from its size on disk.

        Uses the rough rule of thumb 1 token ≈ 4 characters, applied to the
        byte size, so multi-byte encoded text is over-estimated.

        Args:
            text_file: Path to the file to measure.

        Returns:
            ``size_in_bytes // 4``, or 0 when the path is missing, is not a
            regular file (e.g. a directory), or cannot be stat'ed.
        """
        try:
            # is_file() rather than exists(): a directory has a size too, but
            # it says nothing about how much text it holds.
            if not text_file.is_file():
                return 0
            return text_file.stat().st_size // 4
        except OSError:
            # The file vanished or became unreadable between the two calls.
            return 0

    def generate_collection_plan(self) -> Dict:
        """Build, print and return the four-phase data collection plan.

        Returns:
            A dict with ``target_tokens`` (this pipeline's target) and
            ``phases``, a list of phase dicts each carrying ``name``,
            ``timeline``, ``target_tokens`` and ``sources``.
        """
        self._print_banner(
            "CROWE LOGIC MINI - DATA COLLECTION PLAN",
            "Target: 1-2 Billion Tokens",
        )

        # Phase 1: Automated downloads (1 week)
        phase1 = {
            "name": "Phase 1: Public Datasets (Automated)",
            "timeline": "Week 1",
            "target_tokens": 1_200_000_000,
            "sources": [
                {"name": "Wikipedia", "tokens": 500_000_000, "time": "6-12 hours"},
                {"name": "arXiv", "tokens": 300_000_000, "time": "12-24 hours"},
                {"name": "PubMed", "tokens": 200_000_000, "time": "6-12 hours"},
                {"name": "The Pile subsets", "tokens": 200_000_000, "time": "6-12 hours"},
            ],
        }

        # Phase 2: Domain-specific collection (3-5 days)
        phase2 = {
            "name": "Phase 2: Domain-Specific Data",
            "timeline": "Week 2 (3-5 days)",
            "target_tokens": 200_000_000,
            "sources": [
                {"name": "Mycology", "tokens": 50_000_000, "method": "web scraping + papers"},
                {"name": "Drug Discovery", "tokens": 50_000_000, "method": "APIs + databases"},
                {"name": "AI/ML", "tokens": 100_000_000, "method": "arXiv subset + docs"},
            ],
        }

        # Phase 3: Proprietary data (1-2 days)
        phase3 = {
            "name": "Phase 3: Proprietary Data",
            "timeline": "Week 2 (1-2 days)",
            "target_tokens": 20_000_000,
            "sources": [
                {"name": "Southwest Mushrooms", "tokens": 10_000_000, "method": "extract from records"},
                {"name": "CrowLogic/CriOS", "tokens": 10_000_000, "method": "documentation"},
            ],
        }

        # Phase 4: Curated examples (2-3 days)
        phase4 = {
            "name": "Phase 4: Curated Examples",
            "timeline": "Week 2-3 (2-3 days)",
            "target_tokens": 30_000_000,
            "sources": [
                {"name": "Chain-of-thought", "tokens": 10_000_000, "method": "manual creation"},
                {"name": "Domain Q&A", "tokens": 20_000_000, "method": "curated + generated"},
            ],
        }

        plan = {
            "target_tokens": self.target_tokens,
            "phases": [phase1, phase2, phase3, phase4],
        }

        for phase in plan["phases"]:
            print(f"\n{phase['name']}")
            print(f"Timeline: {phase['timeline']}")
            print(f"Target: {phase['target_tokens']:,} tokens")
            print("\nSources:")
            for source in phase["sources"]:
                print(f" ✓ {source['name']}: {source['tokens']:,} tokens")

        total_tokens = sum(phase["target_tokens"] for phase in plan["phases"])
        rule = "=" * 70
        print(f"\n{rule}")
        print(f"TOTAL: {total_tokens:,} tokens ({total_tokens/1e9:.1f}B)")
        print("Timeline: 2-3 weeks")
        print(rule)

        return plan

    def check_existing_data(self) -> Dict:
        """Scan the output directory and report what has been collected.

        Each immediate subdirectory of ``output_dir`` is searched recursively
        for ``*.txt`` and ``*.json`` files, whose sizes are converted to token
        estimates with ``estimate_tokens``.  As a side effect,
        ``self.collected_tokens`` is set to the overall estimate.

        Returns:
            A dict mapping subdirectory name to ``{"files": int, "tokens": int}``
            for every subdirectory with a non-zero token estimate.
        """
        self._print_banner("Checking Existing Data")

        collected: Dict[str, Dict[str, int]] = {}
        total_tokens = 0

        if not self.output_dir.exists():
            print("\n⚠️  No data directory found. Starting from scratch.")
            self.collected_tokens = 0
            return collected

        # Sorted so the report order is stable across runs and filesystems.
        for subdir in sorted(self.output_dir.iterdir()):
            if not subdir.is_dir():
                continue

            files = []
            for pattern in ("**/*.txt", "**/*.json"):
                # Skip directories that merely happen to match the pattern.
                files.extend(path for path in subdir.glob(pattern) if path.is_file())

            tokens = sum(self.estimate_tokens(path) for path in files)
            if tokens > 0:
                collected[subdir.name] = {"files": len(files), "tokens": tokens}
                total_tokens += tokens

        # Keep the instance counter in sync with what is actually on disk.
        self.collected_tokens = total_tokens

        if collected:
            print("\n✓ Found existing data:")
            for name, info in collected.items():
                print(f" {name}: {info['files']} files, ~{info['tokens']:,} tokens")
            print(f"\nTotal collected: ~{total_tokens:,} tokens ({total_tokens/1e9:.2f}B)")
        else:
            print("\n⚠️ No existing data found.")

        remaining = max(0, self.target_tokens - total_tokens)
        print(f"Remaining to collect: ~{remaining:,} tokens ({remaining/1e9:.2f}B)")

        return collected
484
+
485
+
486
def main() -> None:
    """Report existing data, print the collection plan, and list next steps.

    Builds a pipeline rooted at ``./data/raw`` with a 1.5B-token target; all
    output goes to stdout.
    """
    print("\n🚀 Crowe Logic Mini - Training Data Collection Pipeline\n")

    pipeline = DataCollectionPipeline(
        output_dir="./data/raw",
        target_tokens=1_500_000_000,  # 1.5B tokens
    )

    # Check existing data
    pipeline.check_existing_data()

    # Generate collection plan (printed by the call; the returned dict is not
    # needed here).
    pipeline.generate_collection_plan()

    # Provide next steps
    rule = "=" * 70
    print("\n" + rule)
    print("📋 NEXT STEPS")
    print(rule)
    print("\n1. Review the collection plan above")
    print("2. Start with Phase 1 (automated downloads)")
    print("3. Run individual collection scripts:")
    print("\n python data_collection/download_wikipedia.py")
    print(" python data_collection/download_arxiv.py")
    print(" python data_collection/download_pubmed.py")
    print("\n4. For domain-specific data, see instructions above")
    print("5. Once data is collected, run preprocessing:")
    print("\n python data_collection/preprocess_training_data.py")
    print("\n6. Train tokenizer:")
    print("\n python tokenizer/build_scientific_tokenizer.py")

    print("\n" + rule)
    print("For detailed instructions, see: DATA_COLLECTION_GUIDE.md")
    print(rule)


if __name__ == "__main__":
    main()