#!/usr/bin/env python3
"""
Training Data Collection Pipeline for Crowe Logic Mini
Target: 1-2 billion tokens from scientific and domain-specific sources

Data sources:
1. Public datasets (The Pile, RedPajama, arXiv, Wikipedia, PubMed) - ~1.45B tokens
2. Domain-specific sources (mycology, drug discovery, AI/ML) - ~250M tokens
3. Proprietary data (Southwest Mushrooms, CrowLogic) - ~20M tokens
4. Curated examples - ~30M tokens
"""

import json
import requests
import subprocess
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass
from tqdm import tqdm
import hashlib


@dataclass
class DataSource:
    """Configuration for a data source"""
    name: str
    url: Optional[str]
    estimated_tokens: int
    priority: str  # "critical", "high", "medium", "low"
    collection_method: str  # "download", "api", "scrape", "manual"
    status: str = "pending"


class DataCollectionPipeline:
    """Automated pipeline to collect 1-2B tokens of training data"""

    def __init__(self, output_dir: str = "./data/raw", target_tokens: int = 1_500_000_000):
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.target_tokens = target_tokens
        self.collected_tokens = 0

        self.sources = self._define_data_sources()

    def _define_data_sources(self) -> List[DataSource]:
        """Define all data sources with metadata"""

        return [
            # ===== PUBLIC DATASETS (Automated) =====

            DataSource(
                name="Wikipedia Science",
                url="https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2",
                estimated_tokens=500_000_000,
                priority="critical",
                collection_method="download"
            ),

            DataSource(
                name="arXiv Papers",
                url="s3://arxiv/",  # AWS S3 bucket
                estimated_tokens=300_000_000,
                priority="critical",
                collection_method="download"
            ),

            DataSource(
                name="The Pile - arXiv subset",
                url="https://the-eye.eu/public/AI/pile/train/",
                estimated_tokens=200_000_000,
                priority="high",
                collection_method="download"
            ),

            DataSource(
                name="The Pile - PubMed subset",
                url="https://the-eye.eu/public/AI/pile/train/",
                estimated_tokens=150_000_000,
                priority="high",
                collection_method="download"
            ),

            DataSource(
                name="PubMed Abstracts",
                url="https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/",
                estimated_tokens=200_000_000,
                priority="high",
                collection_method="download"
            ),

            DataSource(
                name="RedPajama - Wikipedia",
                url="https://data.together.xyz/redpajama-data-1T/v1.0.0/",
                estimated_tokens=100_000_000,
                priority="medium",
                collection_method="download"
            ),

            # ===== DOMAIN-SPECIFIC SOURCES =====

            DataSource(
                name="Mycology Literature",
                url=None,  # Multiple sources
                estimated_tokens=50_000_000,
                priority="critical",
                collection_method="scrape"
            ),

            DataSource(
                name="Drug Discovery Papers",
                url="https://www.ebi.ac.uk/chembl/",
                estimated_tokens=50_000_000,
                priority="critical",
                collection_method="api"
            ),

            DataSource(
                name="AI/ML Papers (arXiv cs.AI)",
                url="https://arxiv.org/list/cs.AI/recent",
                estimated_tokens=100_000_000,
                priority="high",
                collection_method="api"
            ),

            DataSource(
                name="GitHub AI Documentation",
                url="https://github.com/",
                estimated_tokens=50_000_000,
                priority="medium",
                collection_method="api"
            ),

            # ===== PROPRIETARY DATA =====

            DataSource(
                name="Southwest Mushrooms Data",
                url=None,
                estimated_tokens=10_000_000,
                priority="critical",
                collection_method="manual"
            ),

            DataSource(
                name="CrowLogic Documentation",
                url=None,
                estimated_tokens=5_000_000,
                priority="critical",
                collection_method="manual"
            ),

            DataSource(
                name="Prologic Methodology Examples",
                url=None,
                estimated_tokens=5_000_000,
                priority="critical",
                collection_method="manual"
            ),

            # ===== CURATED EXAMPLES =====

            DataSource(
                name="Chain-of-Thought Examples",
                url=None,
                estimated_tokens=10_000_000,
                priority="high",
                collection_method="manual"
            ),

            DataSource(
                name="Domain Q&A Pairs",
                url=None,
                estimated_tokens=20_000_000,
                priority="high",
                collection_method="manual"
            ),
        ]

    def download_wikipedia(self) -> Dict:
        """Download and extract Wikipedia dump"""

        print("\n" + "="*70)
        print("Downloading Wikipedia Science Articles")
        print("="*70)

        wiki_dir = self.output_dir / "wikipedia"
        wiki_dir.mkdir(exist_ok=True)

        # Download latest dump
        dump_url = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"
        dump_file = wiki_dir / "enwiki-latest.xml.bz2"

        print(f"\nπŸ“₯ Downloading from: {dump_url}")
        print(f"   Destination: {dump_file}")
        print(f"   Size: ~20GB (compressed), ~80GB (uncompressed)")
        print("\n⚠️  This will take 1-4 hours depending on connection speed")
        print("\nCommands to run:")
        print(f"   wget {dump_url} -O {dump_file}")
        print(f"   python -m wikiextractor.WikiExtractor {dump_file} -o {wiki_dir / 'extracted'} --json")

        return {
            "status": "manual_steps_needed",
            "instructions": "Run wget and WikiExtractor commands above",
            "estimated_tokens": 500_000_000
        }
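
    # A minimal automation sketch for the manual steps printed above: it shells
    # out to `wget` and the `wikiextractor` module with subprocess, using the
    # exact commands download_wikipedia() prints. Assumes wget is on PATH and
    # `pip install wikiextractor` has been run.
    def run_wikipedia_download(self) -> None:
        """Run the Wikipedia download/extract steps non-interactively."""
        wiki_dir = self.output_dir / "wikipedia"
        wiki_dir.mkdir(exist_ok=True)
        dump_url = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"
        dump_file = wiki_dir / "enwiki-latest.xml.bz2"

        # -c resumes a partial download; check=True raises if either step fails
        subprocess.run(["wget", "-c", dump_url, "-O", str(dump_file)], check=True)
        subprocess.run(
            ["python", "-m", "wikiextractor.WikiExtractor", str(dump_file),
             "-o", str(wiki_dir / "extracted"), "--json"],
            check=True,
        )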

    def download_arxiv(self) -> Dict:
        """Download arXiv papers"""

        print("\n" + "="*70)
        print("Downloading arXiv Papers")
        print("="*70)

        arxiv_dir = self.output_dir / "arxiv"
        arxiv_dir.mkdir(exist_ok=True)

        print("\nOptions for arXiv data:")
        print("\n1. Bulk download from S3 (recommended):")
        print("   aws s3 sync s3://arxiv/src/ ./data/raw/arxiv/ --no-sign-request")
        print("\n2. Use arXiv API:")
        print("   pip install arxiv")
        print("   python scripts/download_arxiv_api.py")
        print("\n3. Use existing preprocessed datasets:")
        print("   - RedPajama arXiv subset")
        print("   - The Pile arXiv subset")

        return {
            "status": "manual_steps_needed",
            "instructions": "Choose one of the methods above",
            "estimated_tokens": 300_000_000
        }
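
    # A minimal sketch of option 2 above, using the `arxiv` package
    # (pip install arxiv): it pulls titles and abstracts for a category query
    # and writes them as plain text. The names used (Client, Search,
    # result.summary) follow the arxiv 2.x API; verify against the installed
    # version.
    def fetch_arxiv_abstracts(self, query: str = "cat:cs.AI", max_results: int = 1000) -> Path:
        import arxiv  # optional dependency

        arxiv_dir = self.output_dir / "arxiv"
        arxiv_dir.mkdir(exist_ok=True)
        out_file = arxiv_dir / "abstracts.txt"

        client = arxiv.Client(page_size=100, delay_seconds=3)  # throttled paging
        search = arxiv.Search(query=query, max_results=max_results)
        with open(out_file, "w", encoding="utf-8") as f:
            for result in client.results(search):
                f.write(result.title + "\n" + result.summary + "\n\n")
        return out_file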

    def download_pubmed(self) -> Dict:
        """Download PubMed abstracts"""

        print("\n" + "="*70)
        print("Downloading PubMed Abstracts")
        print("="*70)

        pubmed_dir = self.output_dir / "pubmed"
        pubmed_dir.mkdir(exist_ok=True)

        base_url = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/"

        print(f"\nπŸ“₯ PubMed Baseline Files")
        print(f"   URL: {base_url}")
        print(f"   Files: pubmed24n*.xml.gz (1000+ files)")
        print(f"   Total size: ~30GB")
        print("\nCommand to download:")
        print(f"   wget -r -np -nd -A 'pubmed24n*.xml.gz' {base_url} -P {pubmed_dir}")

        return {
            "status": "manual_steps_needed",
            "instructions": "Run wget command above",
            "estimated_tokens": 200_000_000
        }
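
    # A minimal parsing sketch for the baseline files fetched above: streams
    # each pubmed*.xml.gz with the standard library and pulls out AbstractText
    # elements. Assumes the usual PubMed XML schema; adjust the tag name if
    # the baseline format changes.
    def extract_pubmed_abstracts(self) -> Path:
        import gzip
        import xml.etree.ElementTree as ET

        pubmed_dir = self.output_dir / "pubmed"
        out_file = pubmed_dir / "abstracts.txt"
        with open(out_file, "w", encoding="utf-8") as out:
            for xml_gz in sorted(pubmed_dir.glob("*.xml.gz")):
                with gzip.open(xml_gz, "rb") as fh:
                    # iterparse keeps memory flat on multi-GB files
                    for _, elem in ET.iterparse(fh):
                        if elem.tag == "AbstractText" and elem.text:
                            out.write(elem.text.strip() + "\n")
                        elem.clear()
        return out_file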

    def download_the_pile_subset(self, subset: str = "arxiv") -> Dict:
        """Download specific subset from The Pile"""

        print(f"\n" + "="*70)
        print(f"Downloading The Pile - {subset} subset")
        print("="*70)

        pile_dir = self.output_dir / "the_pile" / subset
        pile_dir.mkdir(parents=True, exist_ok=True)

        print(f"\nThe Pile subsets available:")
        print("   - ArXiv")
        print("   - PubMed Abstracts")
        print("   - PubMed Central")
        print("   - FreeLaw")
        print("   - USPTO Backgrounds")
        print("   - Wikipedia (en)")
        print("\nDownload from: https://the-eye.eu/public/AI/pile/train/")
        print(f"Save to: {pile_dir}")

        return {
            "status": "manual_steps_needed",
            "instructions": f"Download {subset} subset from The Pile",
            "estimated_tokens": 200_000_000
        }
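
    # A minimal filtering sketch for downloaded Pile shards (*.jsonl.zst):
    # streams a shard with the `zstandard` package and keeps records whose
    # meta.pile_set_name matches the requested subset. Assumes the published
    # Pile record format: {"text": ..., "meta": {"pile_set_name": ...}}.
    def filter_pile_shard(self, shard: Path, set_name: str, out_file: Path) -> int:
        import io
        import zstandard as zstd  # optional dependency

        kept = 0
        with open(shard, "rb") as fh, open(out_file, "a", encoding="utf-8") as out:
            reader = io.TextIOWrapper(
                zstd.ZstdDecompressor().stream_reader(fh), encoding="utf-8"
            )
            for line in reader:
                record = json.loads(line)
                if record.get("meta", {}).get("pile_set_name") == set_name:
                    out.write(record["text"] + "\n\n")
                    kept += 1
        return kept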

    def create_mycology_corpus(self) -> Dict:
        """Instructions for collecting mycology data"""

        print("\n" + "="*70)
        print("Collecting Mycology Domain Data")
        print("="*70)

        myco_dir = self.output_dir / "mycology"
        myco_dir.mkdir(exist_ok=True)

        sources = {
            "MushroomExpert.com": "http://www.mushroomexpert.com/",
            "Shroomery": "https://www.shroomery.org/",
            "MycoWorks Papers": "Research papers on fungal materials",
            "Cultivation Guides": "Paul Stamets, Tradd Cotter books",
            "Scientific Papers": "Search PubMed/arXiv for mycology",
            "Southwest Mushrooms Data": "Your proprietary cultivation data",
        }

        print("\nMycology data sources:")
        for name, desc in sources.items():
            print(f"   βœ“ {name}: {desc}")

        print(f"\nSave all mycology text to: {myco_dir}")
        print("\nRecommended structure:")
        print("   mycology/")
        print("   β”œβ”€β”€ cultivation_guides.txt")
        print("   β”œβ”€β”€ species_descriptions.txt")
        print("   β”œβ”€β”€ scientific_papers.txt")
        print("   β”œβ”€β”€ forum_discussions.txt")
        print("   └── southwest_mushrooms.txt")

        return {
            "status": "manual_collection_needed",
            "directory": str(myco_dir),
            "estimated_tokens": 50_000_000
        }
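
    # A minimal, polite scraping sketch for the web sources listed above:
    # fetches a caller-supplied list of URLs with `requests`, strips tags
    # crudely, and appends the text to the mycology corpus. The URL list and
    # delay are placeholders; check each site's robots.txt and terms first.
    def scrape_mycology_pages(self, urls: List[str], delay_s: float = 2.0) -> Path:
        import re
        import time

        myco_dir = self.output_dir / "mycology"
        myco_dir.mkdir(exist_ok=True)
        out_file = myco_dir / "scraped_pages.txt"
        with open(out_file, "a", encoding="utf-8") as out:
            for url in urls:
                resp = requests.get(url, timeout=30)
                resp.raise_for_status()
                # crude tag strip; a real pipeline would use an HTML parser
                text = re.sub(r"<[^>]+>", " ", resp.text)
                out.write(re.sub(r"\s+", " ", text).strip() + "\n\n")
                time.sleep(delay_s)  # rate-limit between requests
        return out_file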

    def create_drug_discovery_corpus(self) -> Dict:
        """Instructions for collecting drug discovery data"""

        print("\n" + "="*70)
        print("Collecting Drug Discovery Domain Data")
        print("="*70)

        drug_dir = self.output_dir / "drug_discovery"
        drug_dir.mkdir(exist_ok=True)

        sources = {
            "ChEMBL": "https://www.ebi.ac.uk/chembl/ (API available)",
            "PubChem": "https://pubchem.ncbi.nlm.nih.gov/",
            "DrugBank": "https://www.drugbank.com/",
            "Clinical Trials": "https://clinicaltrials.gov/",
            "Patents": "USPTO chemical patents",
            "Papers": "PubMed chemistry/pharmacology papers",
        }

        print("\nDrug discovery data sources:")
        for name, url in sources.items():
            print(f"   βœ“ {name}: {url}")

        print(f"\nSave to: {drug_dir}")

        return {
            "status": "manual_collection_needed",
            "directory": str(drug_dir),
            "estimated_tokens": 50_000_000
        }
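
    # A minimal API sketch for the ChEMBL source above, using its public REST
    # interface. The endpoint path and response fields ("molecules",
    # "page_meta"."next") are assumptions based on the ChEMBL web-services
    # docs; verify against the live API before relying on them.
    def fetch_chembl_molecules(self, pages: int = 10) -> Path:
        drug_dir = self.output_dir / "drug_discovery"
        drug_dir.mkdir(exist_ok=True)
        out_file = drug_dir / "chembl_molecules.jsonl"

        url = "https://www.ebi.ac.uk/chembl/api/data/molecule.json?limit=100"
        with open(out_file, "w", encoding="utf-8") as out:
            for _ in range(pages):
                payload = requests.get(url, timeout=60).json()
                for mol in payload.get("molecules", []):
                    out.write(json.dumps(mol) + "\n")
                next_page = payload.get("page_meta", {}).get("next")
                if not next_page:
                    break
                url = "https://www.ebi.ac.uk" + next_page
        return out_file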

    def estimate_tokens(self, text_file: Path) -> int:
        """Estimate token count in a text file"""

        if not text_file.exists():
            return 0

        # Rough estimate: 1 token ≈ 4 characters (≈ 0.75 words)
        file_size = text_file.stat().st_size
        estimated_tokens = file_size // 4

        return estimated_tokens
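
    # A tokenizer-based alternative to the 4-characters-per-token heuristic
    # above: counts real tokens with a Hugging Face tokenizer. A sketch
    # assuming the `transformers` package; "gpt2" is a stand-in until the
    # project's own scientific tokenizer (built later in the pipeline) exists.
    def count_tokens_exact(self, text_file: Path, tokenizer_name: str = "gpt2") -> int:
        from transformers import AutoTokenizer  # optional dependency

        if not text_file.exists():
            return 0
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        total = 0
        with open(text_file, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                total += len(tokenizer.encode(line))
        return total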

    def generate_collection_plan(self) -> Dict:
        """Generate a detailed data collection plan"""

        print("\n" + "="*70)
        print("CROWE LOGIC MINI - DATA COLLECTION PLAN")
        print("Target: 1-2 Billion Tokens")
        print("="*70)

        plan = {
            "target_tokens": self.target_tokens,
            "phases": []
        }

        # Phase 1: Automated downloads (1 week)
        phase1 = {
            "name": "Phase 1: Public Datasets (Automated)",
            "timeline": "Week 1",
            "target_tokens": 1_200_000_000,
            "sources": [
                {"name": "Wikipedia", "tokens": 500_000_000, "time": "6-12 hours"},
                {"name": "arXiv", "tokens": 300_000_000, "time": "12-24 hours"},
                {"name": "PubMed", "tokens": 200_000_000, "time": "6-12 hours"},
                {"name": "The Pile subsets", "tokens": 200_000_000, "time": "6-12 hours"},
            ]
        }

        # Phase 2: Domain-specific collection (3-5 days)
        phase2 = {
            "name": "Phase 2: Domain-Specific Data",
            "timeline": "Week 2 (3-5 days)",
            "target_tokens": 200_000_000,
            "sources": [
                {"name": "Mycology", "tokens": 50_000_000, "method": "web scraping + papers"},
                {"name": "Drug Discovery", "tokens": 50_000_000, "method": "APIs + databases"},
                {"name": "AI/ML", "tokens": 100_000_000, "method": "arXiv subset + docs"},
            ]
        }

        # Phase 3: Proprietary data (1-2 days)
        phase3 = {
            "name": "Phase 3: Proprietary Data",
            "timeline": "Week 2 (1-2 days)",
            "target_tokens": 20_000_000,
            "sources": [
                {"name": "Southwest Mushrooms", "tokens": 10_000_000, "method": "extract from records"},
                {"name": "CrowLogic/CriOS", "tokens": 10_000_000, "method": "documentation"},
            ]
        }

        # Phase 4: Curated examples (2-3 days)
        phase4 = {
            "name": "Phase 4: Curated Examples",
            "timeline": "Week 2-3 (2-3 days)",
            "target_tokens": 30_000_000,
            "sources": [
                {"name": "Chain-of-thought", "tokens": 10_000_000, "method": "manual creation"},
                {"name": "Domain Q&A", "tokens": 20_000_000, "method": "curated + generated"},
            ]
        }

        plan["phases"] = [phase1, phase2, phase3, phase4]

        # Print plan
        for phase in plan["phases"]:
            print(f"\n{phase['name']}")
            print(f"Timeline: {phase['timeline']}")
            print(f"Target: {phase['target_tokens']:,} tokens")
            print(f"\nSources:")
            for source in phase["sources"]:
                print(f"  βœ“ {source['name']}: {source['tokens']:,} tokens")

        total_tokens = sum(p["target_tokens"] for p in plan["phases"])
        print(f"\n{'='*70}")
        print(f"TOTAL: {total_tokens:,} tokens ({total_tokens/1e9:.1f}B)")
        print(f"Timeline: 2-3 weeks")
        print(f"{'='*70}")

        return plan

    def check_existing_data(self) -> Dict:
        """Check what data has already been collected"""

        print("\n" + "="*70)
        print("Checking Existing Data")
        print("="*70)

        collected = {}
        total_tokens = 0

        if not self.output_dir.exists():
            print("\n⚠️  No data directory found. Starting from scratch.")
            return collected

        for subdir in self.output_dir.iterdir():
            if subdir.is_dir():
                tokens = 0
                files = list(subdir.glob("**/*.txt")) + list(subdir.glob("**/*.json"))
                for f in files:
                    tokens += self.estimate_tokens(f)

                if tokens > 0:
                    collected[subdir.name] = {
                        "files": len(files),
                        "tokens": tokens
                    }
                    total_tokens += tokens

        if collected:
            print(f"\nβœ“ Found existing data:")
            for name, info in collected.items():
                print(f"  {name}: {info['files']} files, ~{info['tokens']:,} tokens")
            print(f"\nTotal collected: ~{total_tokens:,} tokens ({total_tokens/1e9:.2f}B)")
        else:
            print("\n⚠️  No existing data found.")

        remaining = max(0, self.target_tokens - total_tokens)
        print(f"Remaining to collect: ~{remaining:,} tokens ({remaining/1e9:.2f}B)")

        return collected
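
    # A small exact-deduplication sketch of the kind the preprocessing step in
    # "Next steps" would run: hashes each paragraph with hashlib and drops
    # verbatim repeats. Reads the whole file into memory, so it suits the
    # smaller curated corpora; near-duplicate detection is out of scope here.
    def deduplicate_file(self, text_file: Path, out_file: Path) -> int:
        seen = set()
        kept = 0
        with open(text_file, "r", encoding="utf-8", errors="ignore") as f:
            paragraphs = f.read().split("\n\n")
        with open(out_file, "w", encoding="utf-8") as out:
            for para in tqdm(paragraphs, desc="dedup"):
                para = para.strip()
                digest = hashlib.sha256(para.encode("utf-8")).hexdigest()
                if para and digest not in seen:
                    seen.add(digest)
                    out.write(para + "\n\n")
                    kept += 1
        return kept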


def main():
    """Main execution"""

    print("\nπŸš€ Crowe Logic Mini - Training Data Collection Pipeline\n")

    pipeline = DataCollectionPipeline(
        output_dir="./data/raw",
        target_tokens=1_500_000_000  # 1.5B tokens
    )

    # Check existing data
    pipeline.check_existing_data()

    # Generate collection plan
    plan = pipeline.generate_collection_plan()

    # Provide next steps
    print("\n" + "="*70)
    print("πŸ“‹ NEXT STEPS")
    print("="*70)
    print("\n1. Review the collection plan above")
    print("2. Start with Phase 1 (automated downloads)")
    print("3. Run individual collection scripts:")
    print("\n   python data_collection/download_wikipedia.py")
    print("   python data_collection/download_arxiv.py")
    print("   python data_collection/download_pubmed.py")
    print("\n4. For domain-specific data, see instructions above")
    print("5. Once data is collected, run preprocessing:")
    print("\n   python data_collection/preprocess_training_data.py")
    print("\n6. Train tokenizer:")
    print("\n   python tokenizer/build_scientific_tokenizer.py")

    print("\n" + "="*70)
    print("For detailed instructions, see: DATA_COLLECTION_GUIDE.md")
    print("="*70)


if __name__ == "__main__":
    main()