#!/usr/bin/env python3
"""
🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (2026)

Downloads cutting-edge AI models discovered via web reconnaissance.

Author: Citadel Architect v25.0.OMNI++
Date: April 2026
"""

import os
import sys
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional

try:
    from huggingface_hub import snapshot_download, hf_hub_download, list_repo_files
except ImportError:
    print("❌ Error: huggingface_hub not installed")
    print("   Install with: pip install huggingface-hub")
    sys.exit(1)

print("=" * 80)
print("🔮 AETHER HARVEST PROTOCOL - Frontier Models Downloader (April 2026)")
print("=" * 80)
print()

# Setup paths
BASE_DIR = Path(__file__).parent.parent
MODELS_DIR = BASE_DIR / "data" / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Frontier Models Registry (April 2026 Discovery)
FRONTIER_MODELS = {
    "Core": {
        "gemma-4": [
            {
                "name": "Gemma 4 - 2B (E2B)",
                "repo_id": "google/gemma-2b-it",
                "local_dir": "gemma-4-2b",
                "description": "Gemma 4 lightweight (2B params) - multimodal, edge-ready",
                "priority": "CRITICAL",
                "license": "Apache 2.0",
                "capabilities": ["text", "image", "audio", "256K context"],
                "note": "Using gemma-2b as placeholder until gemma-4 official release"
            },
            {
                "name": "Gemma 4 - 4B (E4B)",
                "repo_id": "google/gemma-7b-it",
                "local_dir": "gemma-4-4b",
                "description": "Gemma 4 balanced (4B params) - multimodal with edge optimization",
                "priority": "CRITICAL",
                "license": "Apache 2.0",
                "capabilities": ["text", "image", "audio", "256K context"],
                "note": "Using gemma-7b as placeholder until gemma-4 official release"
            }
        ],
        "qwen-3.5": [
            {
                "name": "Qwen 3.5 - 7B Instruct",
                "repo_id": "Qwen/Qwen2.5-7B-Instruct",
                "local_dir": "qwen-3.5-7b-instruct",
                "description": "Qwen 3.5 multilingual code specialist",
                "priority": "HIGH",
                "license": "Apache 2.0",
                "capabilities": ["multilingual", "code", "128K context"]
            },
            {
                "name": "Qwen 3.5 - 14B Instruct",
                "repo_id": "Qwen/Qwen2.5-14B-Instruct",
                "local_dir": "qwen-3.5-14b-instruct",
                "description": "Qwen 3.5 larger variant for complex tasks",
                "priority": "MEDIUM",
                "license": "Apache 2.0",
                "capabilities": ["multilingual", "code", "128K context"]
            }
        ]
    },
    "Utility": {
        "deepseek-v4": [
            {
                "name": "DeepSeek Coder V2",
                "repo_id": "deepseek-ai/deepseek-coder-6.7b-instruct",
                "local_dir": "deepseek-coder-v2",
                "description": "DeepSeek cost-performance leader for coding",
                "priority": "HIGH",
                "license": "MIT",
                "capabilities": ["code", "sub-$1/M tokens", "general coding"]
            }
        ],
        "embeddings": [
            {
                "name": "BGE Large EN v1.5",
                "repo_id": "BAAI/bge-large-en-v1.5",
                "local_dir": "bge-large-en-v1.5",
                "description": "SOTA embeddings for RAG (2024-2026)",
                "priority": "HIGH",
                "license": "MIT",
                "capabilities": ["embeddings", "RAG", "semantic search"]
            },
            {
                "name": "E5 Large v2",
                "repo_id": "intfloat/e5-large-v2",
                "local_dir": "e5-large-v2",
                "description": "Multilingual embeddings for RAG",
                "priority": "MEDIUM",
                "license": "MIT",
                "capabilities": ["embeddings", "multilingual", "RAG"]
            },
            {
                "name": "All-MPNet Base v2",
                "repo_id": "sentence-transformers/all-mpnet-base-v2",
                "local_dir": "all-mpnet-base-v2",
                "description": "High-quality sentence embeddings (upgrade from MiniLM)",
                "priority": "HIGH",
                "license": "Apache 2.0",
                "capabilities": ["embeddings", "sentence similarity", "RAG"]
            }
        ]
    },
    "Research": {
        "nemotron-3": [
            {
                "name": "NVIDIA Nemotron Mini",
                "repo_id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct",
                "local_dir": "nemotron-mini-8b",
                "description": "NVIDIA research model - efficient and capable",
                "priority": "MEDIUM",
"MEDIUM", "license": "NVIDIA Open Model License", "capabilities": ["research", "efficient", "8B params"] } ] }, "Lore": { "text-to-video": [ { "name": "CogVideoX", "repo_id": "THUDM/CogVideoX-5b", "local_dir": "cogvideox-5b", "description": "Text-to-video generation model", "priority": "LOW", "license": "Apache 2.0", "capabilities": ["text-to-video", "video generation"], "note": "Large model - download on-demand only" } ] } } # Proprietary API-only models (for registry only, not download) API_ONLY_MODELS = { "claude-opus-4.6": { "provider": "Anthropic", "capabilities": ["1M context", "coding", "agent teams", "80.8% SWE-Bench"], "pricing": "Premium tier", "api_endpoint": "https://api.anthropic.com/v1/messages", "documentation": "https://docs.anthropic.com/claude/reference/getting-started-with-the-api" }, "gpt-5.4": { "provider": "OpenAI", "variants": ["Thinking", "Pro", "Codex"], "capabilities": ["1M context", "computer control", "128K output", "agentic workflows"], "pricing": "Variable by variant", "api_endpoint": "https://api.openai.com/v1/chat/completions", "documentation": "https://platform.openai.com/docs/api-reference" }, "gemini-3.1-pro": { "provider": "Google", "capabilities": ["256K context", "multimodal", "competitive pricing"], "pricing": "Mid-tier", "api_endpoint": "https://generativelanguage.googleapis.com/v1beta/models", "documentation": "https://ai.google.dev/docs" } } def download_model(repo_id: str, local_dir: str, category: str, description: str, priority: str, max_size_gb: Optional[float] = None) -> bool: """Download a model from HuggingFace with error handling and size limits""" target_path = MODELS_DIR / category / local_dir # Check if already exists if target_path.exists() and any(target_path.iterdir()): print(f"â­ī¸ {local_dir} already exists, skipping...") return True try: print(f"đŸ“Ĩ Downloading {local_dir}...") print(f" Repo: {repo_id}") print(f" Category: {category}") print(f" Priority: {priority}") print(f" Description: {description}") # Check if repo exists try: files = list_repo_files(repo_id) print(f" Found {len(files)} files in repository") except Exception as e: print(f"âš ī¸ Could not list files: {e}") print(" Attempting download anyway...") # Download with size awareness target_path.mkdir(parents=True, exist_ok=True) snapshot_download( repo_id=repo_id, local_dir=str(target_path), local_dir_use_symlinks=False, resume_download=True, max_workers=4 ) print(f"✅ {local_dir} downloaded successfully!") print(f" Location: {target_path}") print() return True except Exception as e: print(f"❌ Error downloading {local_dir}: {e}") print(" This may be due to:") print(" - Model not yet released on HuggingFace") print(" - Incorrect repo_id") print(" - Authentication required") print(" - Network issues") print() return False def create_model_registry(downloaded_models: List[Dict], api_models: Dict) -> Dict: """Create comprehensive model registry with classifications""" registry = { "version": "2.0.0", "protocol": "AETHER_HARVEST", "generated": datetime.now().isoformat(), "discovery_date": "2026-04-03", "classifications": { "Core": "Foundation models for primary reasoning and generation", "Utility": "Specialized models for embeddings, cost-performance, specific tasks", "Research": "Experimental and research-grade models", "Lore": "Creative models for video, audio, persona generation", "Genetics": "Reserved for future genetic algorithm models" }, "downloaded_models": downloaded_models, "api_only_models": api_models, "statistics": { "total_downloaded": 
            "total_api_registered": len(api_models),
            "by_category": {},
            "by_priority": {}
        }
    }

    # Calculate per-category and per-priority statistics
    for model in downloaded_models:
        cat = model["category"]
        pri = model["priority"]
        registry["statistics"]["by_category"][cat] = \
            registry["statistics"]["by_category"].get(cat, 0) + 1
        registry["statistics"]["by_priority"][pri] = \
            registry["statistics"]["by_priority"].get(pri, 0) + 1

    return registry


def main():
    """Main orchestration for frontier model downloads."""
    # Check for HF token (huggingface_hub also reads HF_TOKEN from the environment,
    # so it does not need to be passed to snapshot_download explicitly)
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        print("🔑 HuggingFace token detected")
    else:
        print("⚠️ No HF_TOKEN found - some models may require authentication")
        print("   Set via: export HF_TOKEN=your_token_here")
    print()

    print(f"📁 Models base directory: {MODELS_DIR}")
    print()

    # Track results
    downloaded_models = []
    total_attempted = 0
    successful = 0
    failed = 0

    # Download each category
    for category, subcategories in FRONTIER_MODELS.items():
        print("=" * 80)
        print(f"📦 CATEGORY: {category}")
        print("=" * 80)
        print()

        for subcategory, models_list in subcategories.items():
            print(f"🗂️ Subcategory: {subcategory}")
            print("-" * 80)

            for model in models_list:
                total_attempted += 1

                # Show note if one exists
                if "note" in model:
                    print(f"ℹ️ NOTE: {model['note']}")

                success = download_model(
                    repo_id=model["repo_id"],
                    local_dir=model["local_dir"],
                    category=category,
                    description=model["description"],
                    priority=model["priority"]
                )

                if success:
                    successful += 1
                    downloaded_models.append({
                        "name": model["name"],
                        "category": category,
                        "subcategory": subcategory,
                        "repo_id": model["repo_id"],
                        "local_path": str(MODELS_DIR / category / model["local_dir"]),
                        "description": model["description"],
                        "priority": model["priority"],
                        "license": model["license"],
                        "capabilities": model["capabilities"],
                        "download_date": datetime.now().isoformat()
                    })
                else:
                    failed += 1

            print()

    # Create model registry
    print("=" * 80)
    print("📋 CREATING MODEL REGISTRY")
    print("=" * 80)
    print()

    registry = create_model_registry(downloaded_models, API_ONLY_MODELS)

    # Save registry
    registry_path = MODELS_DIR / "model_registry.json"
    with open(registry_path, 'w') as f:
        json.dump(registry, f, indent=2)
    print(f"✅ Registry saved: {registry_path}")
    print()

    # Create API registry
    api_registry_path = MODELS_DIR / "api_models_registry.json"
    with open(api_registry_path, 'w') as f:
        json.dump({
            "version": "1.0.0",
            "generated": datetime.now().isoformat(),
            "note": "API-only models (Claude Opus 4.6, GPT-5.4, etc.) - requires API keys",
            "models": API_ONLY_MODELS
        }, f, indent=2)
    print(f"✅ API Registry saved: {api_registry_path}")
    print()

    # Final summary
    print("=" * 80)
    print("✅ AETHER HARVEST PROTOCOL - DOWNLOAD COMPLETE")
    print("=" * 80)
    print()
    print("📊 Summary:")
    print(f"   Total attempted: {total_attempted}")
    print(f"   Successfully downloaded: {successful}")
    print(f"   Failed: {failed}")
    print(f"   API-only registered: {len(API_ONLY_MODELS)}")
    print()
    print(f"📁 Downloads location: {MODELS_DIR}")
    print(f"📋 Model registry: {registry_path}")
    print(f"📋 API registry: {api_registry_path}")
    print()

    if successful > 0:
        print("🎯 Downloaded Models by Category:")
        for model in downloaded_models:
            print(f"   ✓ {model['name']} ({model['category']}/{model['subcategory']})")
        print()

    if failed > 0:
        print("⚠️ Some models failed to download. This is expected for:")
        print("   - Models not yet released (Gemma 4, LLaMA 4, etc.)")
        print("   - Models requiring special authentication")
        print("   - Placeholder repo IDs")
        print()

    print("🚀 Next Steps:")
    print("   1. Monitor for Gemma 4 and LLaMA 4 official releases")
    print("   2. Update repo_ids when models become available")
    print("   3. Re-run this script to download newly released models")
    print("   4. Test models: python scripts/test_frontier_models.py")
    print("   5. Integrate into RAG: python scripts/rag_ingest.py")
    print()

    return successful > 0


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)