Spaces:

DJ-Goanna-Coding
/

oppo-node

Running

File size: 14,534 Bytes

c87f72b

#!/usr/bin/env python3
"""
🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (2026)
Downloads cutting-edge AI models discovered via web reconnaissance
Author: Citadel Architect v25.0.OMNI++
Date: April 2026
"""

import os
import sys
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional

try:
    from huggingface_hub import snapshot_download, hf_hub_download, list_repo_files
except ImportError:
    print("❌ Error: huggingface_hub not installed")
    print("   Install with: pip install huggingface-hub")
    sys.exit(1)

print("=" * 80)
print("🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (April 2026)")
print("=" * 80)
print()

# Setup paths
BASE_DIR = Path(__file__).parent.parent
MODELS_DIR = BASE_DIR / "data" / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Frontier Models Registry (April 2026 Discovery)
FRONTIER_MODELS = {
    "Core": {
        "gemma-4": [
            {
                "name": "Gemma 4 - 2B (E2B)",
                "repo_id": "google/gemma-2b-it",
                "local_dir": "gemma-4-2b",
                "description": "Gemma 4 lightweight (2B params) - multimodal, edge-ready",
                "priority": "CRITICAL",
                "license": "Apache 2.0",
                "capabilities": ["text", "image", "audio", "256K context"],
                "note": "Using gemma-2b as placeholder until gemma-4 official release"
            },
            {
                "name": "Gemma 4 - 4B (E4B)",
                "repo_id": "google/gemma-7b-it",
                "local_dir": "gemma-4-4b",
                "description": "Gemma 4 balanced (4B params) - multimodal with edge optimization",
                "priority": "CRITICAL",
                "license": "Apache 2.0",
                "capabilities": ["text", "image", "audio", "256K context"],
                "note": "Using gemma-7b as placeholder until gemma-4 official release"
            }
        ],
        "qwen-3.5": [
            {
                "name": "Qwen 3.5 - 7B Instruct",
                "repo_id": "Qwen/Qwen2.5-7B-Instruct",
                "local_dir": "qwen-3.5-7b-instruct",
                "description": "Qwen 3.5 multilingual code specialist",
                "priority": "HIGH",
                "license": "Apache 2.0",
                "capabilities": ["multilingual", "code", "128K context"]
            },
            {
                "name": "Qwen 3.5 - 14B Instruct",
                "repo_id": "Qwen/Qwen2.5-14B-Instruct",
                "local_dir": "qwen-3.5-14b-instruct",
                "description": "Qwen 3.5 larger variant for complex tasks",
                "priority": "MEDIUM",
                "license": "Apache 2.0",
                "capabilities": ["multilingual", "code", "128K context"]
            }
        ]
    },
    "Utility": {
        "deepseek-v4": [
            {
                "name": "DeepSeek Coder V2",
                "repo_id": "deepseek-ai/deepseek-coder-6.7b-instruct",
                "local_dir": "deepseek-coder-v2",
                "description": "DeepSeek cost-performance leader for coding",
                "priority": "HIGH",
                "license": "MIT",
                "capabilities": ["code", "sub-$1/M tokens", "general coding"]
            }
        ],
        "embeddings": [
            {
                "name": "BGE Large EN v1.5",
                "repo_id": "BAAI/bge-large-en-v1.5",
                "local_dir": "bge-large-en-v1.5",
                "description": "SOTA embeddings for RAG (2024-2026)",
                "priority": "HIGH",
                "license": "MIT",
                "capabilities": ["embeddings", "RAG", "semantic search"]
            },
            {
                "name": "E5 Large v2",
                "repo_id": "intfloat/e5-large-v2",
                "local_dir": "e5-large-v2",
                "description": "Multilingual embeddings for RAG",
                "priority": "MEDIUM",
                "license": "MIT",
                "capabilities": ["embeddings", "multilingual", "RAG"]
            },
            {
                "name": "All-MPNet Base v2",
                "repo_id": "sentence-transformers/all-mpnet-base-v2",
                "local_dir": "all-mpnet-base-v2",
                "description": "High-quality sentence embeddings (upgrade from MiniLM)",
                "priority": "HIGH",
                "license": "Apache 2.0",
                "capabilities": ["embeddings", "sentence similarity", "RAG"]
            }
        ]
    },
    "Research": {
        "nemotron-3": [
            {
                "name": "NVIDIA Nemotron Mini",
                "repo_id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
                "local_dir": "nemotron-mini-8b",
                "description": "NVIDIA research model - efficient and capable",
                "priority": "MEDIUM",
                "license": "NVIDIA Open Model License",
                "capabilities": ["research", "efficient", "8B params"]
            }
        ]
    },
    "Lore": {
        "text-to-video": [
            {
                "name": "CogVideoX",
                "repo_id": "THUDM/CogVideoX-5b",
                "local_dir": "cogvideox-5b",
                "description": "Text-to-video generation model",
                "priority": "LOW",
                "license": "Apache 2.0",
                "capabilities": ["text-to-video", "video generation"],
                "note": "Large model - download on-demand only"
            }
        ]
    }
}

# Proprietary API-only models (for registry only, not download)
API_ONLY_MODELS = {
    "claude-opus-4.6": {
        "provider": "Anthropic",
        "capabilities": ["1M context", "coding", "agent teams", "80.8% SWE-Bench"],
        "pricing": "Premium tier",
        "api_endpoint": "https://api.anthropic.com/v1/messages",
        "documentation": "https://docs.anthropic.com/claude/reference/getting-started-with-the-api"
    },
    "gpt-5.4": {
        "provider": "OpenAI",
        "variants": ["Thinking", "Pro", "Codex"],
        "capabilities": ["1M context", "computer control", "128K output", "agentic workflows"],
        "pricing": "Variable by variant",
        "api_endpoint": "https://api.openai.com/v1/chat/completions",
        "documentation": "https://platform.openai.com/docs/api-reference"
    },
    "gemini-3.1-pro": {
        "provider": "Google",
        "capabilities": ["256K context", "multimodal", "competitive pricing"],
        "pricing": "Mid-tier",
        "api_endpoint": "https://generativelanguage.googleapis.com/v1beta/models",
        "documentation": "https://ai.google.dev/docs"
    }
}


def download_model(repo_id: str, local_dir: str, category: str, description: str,
                  priority: str, max_size_gb: Optional[float] = None) -> bool:
    """Download a model from HuggingFace with error handling and size limits"""

    target_path = MODELS_DIR / category / local_dir

    # Check if already exists
    if target_path.exists() and any(target_path.iterdir()):
        print(f"⏭️  {local_dir} already exists, skipping...")
        return True

    try:
        print(f"📥 Downloading {local_dir}...")
        print(f"   Repo: {repo_id}")
        print(f"   Category: {category}")
        print(f"   Priority: {priority}")
        print(f"   Description: {description}")

        # Check if repo exists
        try:
            files = list_repo_files(repo_id)
            print(f"   Found {len(files)} files in repository")
        except Exception as e:
            print(f"⚠️  Could not list files: {e}")
            print("   Attempting download anyway...")

        # Download with size awareness
        target_path.mkdir(parents=True, exist_ok=True)

        snapshot_download(
            repo_id=repo_id,
            local_dir=str(target_path),
            local_dir_use_symlinks=False,
            resume_download=True,
            max_workers=4
        )

        print(f"✅ {local_dir} downloaded successfully!")
        print(f"   Location: {target_path}")
        print()
        return True

    except Exception as e:
        print(f"❌ Error downloading {local_dir}: {e}")
        print("   This may be due to:")
        print("   - Model not yet released on HuggingFace")
        print("   - Incorrect repo_id")
        print("   - Authentication required")
        print("   - Network issues")
        print()
        return False


def create_model_registry(downloaded_models: List[Dict], api_models: Dict) -> Dict:
    """Create comprehensive model registry with classifications"""

    registry = {
        "version": "2.0.0",
        "protocol": "AETHER_HARVEST",
        "generated": datetime.now().isoformat(),
        "discovery_date": "2026-04-03",
        "classifications": {
            "Core": "Foundation models for primary reasoning and generation",
            "Utility": "Specialized models for embeddings, cost-performance, specific tasks",
            "Research": "Experimental and research-grade models",
            "Lore": "Creative models for video, audio, persona generation",
            "Genetics": "Reserved for future genetic algorithm models"
        },
        "downloaded_models": downloaded_models,
        "api_only_models": api_models,
        "statistics": {
            "total_downloaded": len(downloaded_models),
            "total_api_registered": len(api_models),
            "by_category": {},
            "by_priority": {}
        }
    }

    # Calculate statistics
    for model in downloaded_models:
        cat = model["category"]
        pri = model["priority"]

        registry["statistics"]["by_category"][cat] = \
            registry["statistics"]["by_category"].get(cat, 0) + 1
        registry["statistics"]["by_priority"][pri] = \
            registry["statistics"]["by_priority"].get(pri, 0) + 1

    return registry


def main():
    """Main orchestration for frontier model downloads"""

    # Check for HF token
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        print("🔑 HuggingFace token detected")
    else:
        print("⚠️  No HF_TOKEN found - some models may require authentication")
        print("   Set via: export HF_TOKEN=your_token_here")
    print()

    print(f"📁 Models base directory: {MODELS_DIR}")
    print()

    # Track results
    downloaded_models = []
    total_attempted = 0
    successful = 0
    failed = 0

    # Download each category
    for category, subcategories in FRONTIER_MODELS.items():
        print("=" * 80)
        print(f"📦 CATEGORY: {category}")
        print("=" * 80)
        print()

        for subcategory, models_list in subcategories.items():
            print(f"🗂️  Subcategory: {subcategory}")
            print("-" * 80)

            for model in models_list:
                total_attempted += 1

                # Show note if exists
                if "note" in model:
                    print(f"ℹ️  NOTE: {model['note']}")

                success = download_model(
                    repo_id=model["repo_id"],
                    local_dir=model["local_dir"],
                    category=category,
                    description=model["description"],
                    priority=model["priority"]
                )

                if success:
                    successful += 1
                    downloaded_models.append({
                        "name": model["name"],
                        "category": category,
                        "subcategory": subcategory,
                        "repo_id": model["repo_id"],
                        "local_path": str(MODELS_DIR / category / model["local_dir"]),
                        "description": model["description"],
                        "priority": model["priority"],
                        "license": model["license"],
                        "capabilities": model["capabilities"],
                        "download_date": datetime.now().isoformat()
                    })
                else:
                    failed += 1

            print()

    # Create model registry
    print("=" * 80)
    print("📋 CREATING MODEL REGISTRY")
    print("=" * 80)
    print()

    registry = create_model_registry(downloaded_models, API_ONLY_MODELS)

    # Save registry
    registry_path = MODELS_DIR / "model_registry.json"
    with open(registry_path, 'w') as f:
        json.dump(registry, f, indent=2)

    print(f"✅ Registry saved: {registry_path}")
    print()

    # Create API registry
    api_registry_path = MODELS_DIR / "api_models_registry.json"
    with open(api_registry_path, 'w') as f:
        json.dump({
            "version": "1.0.0",
            "generated": datetime.now().isoformat(),
            "note": "API-only models (Claude Opus 4.6, GPT-5.4, etc.) - requires API keys",
            "models": API_ONLY_MODELS
        }, f, indent=2)

    print(f"✅ API Registry saved: {api_registry_path}")
    print()

    # Final summary
    print("=" * 80)
    print("✅ AETHER HARVEST PROTOCOL - DOWNLOAD COMPLETE")
    print("=" * 80)
    print()
    print("📊 Summary:")
    print(f"   Total attempted: {total_attempted}")
    print(f"   Successfully downloaded: {successful}")
    print(f"   Failed: {failed}")
    print(f"   API-only registered: {len(API_ONLY_MODELS)}")
    print()
    print(f"📁 Downloads location: {MODELS_DIR}")
    print(f"📋 Model registry: {registry_path}")
    print(f"📋 API registry: {api_registry_path}")
    print()

    if successful > 0:
        print("🎯 Downloaded Models by Category:")
        for model in downloaded_models:
            print(f"   ✓ {model['name']} ({model['category']}/{model['subcategory']})")
        print()

    if failed > 0:
        print("⚠️  Some models failed to download. This is expected for:")
        print("   - Models not yet released (Gemma 4, LLaMA 4, etc.)")
        print("   - Models requiring special authentication")
        print("   - Placeholder repo IDs")
        print()

    print("🚀 Next Steps:")
    print("   1. Monitor for Gemma 4 and LLaMA 4 official releases")
    print("   2. Update repo_ids when models become available")
    print("   3. Re-run this script to download newly released models")
    print("   4. Test models: python scripts/test_frontier_models.py")
    print("   5. Integrate into RAG: python scripts/rag_ingest.py")
    print()

    return successful > 0


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)