#!/usr/bin/env python3 """ Config Splitter for Large Documentation Sites Splits large documentation configs into multiple smaller, focused skill configs. Supports multiple splitting strategies: category-based, size-based, and automatic. """ import json import sys import argparse from pathlib import Path from typing import Dict, List, Any, Tuple from collections import defaultdict class ConfigSplitter: """Splits large documentation configs into multiple focused configs""" def __init__(self, config_path: str, strategy: str = "auto", target_pages: int = 5000): self.config_path = Path(config_path) self.strategy = strategy self.target_pages = target_pages self.config = self.load_config() self.base_name = self.config['name'] def load_config(self) -> Dict[str, Any]: """Load configuration from file""" try: with open(self.config_path, 'r') as f: return json.load(f) except FileNotFoundError: print(f"❌ Error: Config file not found: {self.config_path}") sys.exit(1) except json.JSONDecodeError as e: print(f"❌ Error: Invalid JSON in config file: {e}") sys.exit(1) def get_split_strategy(self) -> str: """Determine split strategy""" # Check if strategy is defined in config if 'split_strategy' in self.config: config_strategy = self.config['split_strategy'] if config_strategy != "none": return config_strategy # Use provided strategy or auto-detect if self.strategy == "auto": max_pages = self.config.get('max_pages', 500) if max_pages < 5000: print(f"ℹ️ Small documentation ({max_pages} pages) - no splitting needed") return "none" elif max_pages < 10000 and 'categories' in self.config: print(f"ℹ️ Medium documentation ({max_pages} pages) - category split recommended") return "category" elif 'categories' in self.config and len(self.config['categories']) >= 3: print(f"ℹ️ Large documentation ({max_pages} pages) - router + categories recommended") return "router" else: print(f"ℹ️ Large documentation ({max_pages} pages) - size-based split") return "size" return self.strategy def split_by_category(self, create_router: bool = False) -> List[Dict[str, Any]]: """Split config by categories""" if 'categories' not in self.config: print("❌ Error: No categories defined in config") sys.exit(1) categories = self.config['categories'] split_categories = self.config.get('split_config', {}).get('split_by_categories') # If specific categories specified, use only those if split_categories: categories = {k: v for k, v in categories.items() if k in split_categories} configs = [] for category_name, keywords in categories.items(): # Create new config for this category new_config = self.config.copy() new_config['name'] = f"{self.base_name}-{category_name}" new_config['description'] = f"{self.base_name.capitalize()} - {category_name.replace('_', ' ').title()}. {self.config.get('description', '')}" # Update URL patterns to focus on this category url_patterns = new_config.get('url_patterns', {}) # Add category keywords to includes includes = url_patterns.get('include', []) for keyword in keywords: if keyword.startswith('/'): includes.append(keyword) if includes: url_patterns['include'] = list(set(includes)) new_config['url_patterns'] = url_patterns # Keep only this category new_config['categories'] = {category_name: keywords} # Remove split config from child if 'split_strategy' in new_config: del new_config['split_strategy'] if 'split_config' in new_config: del new_config['split_config'] # Adjust max_pages estimate if 'max_pages' in new_config: new_config['max_pages'] = self.target_pages configs.append(new_config) print(f"✅ Created {len(configs)} category-based configs") # Optionally create router config if create_router: router_config = self.create_router_config(configs) configs.insert(0, router_config) print(f"✅ Created router config: {router_config['name']}") return configs def split_by_size(self) -> List[Dict[str, Any]]: """Split config by size (page count)""" max_pages = self.config.get('max_pages', 500) num_splits = (max_pages + self.target_pages - 1) // self.target_pages configs = [] for i in range(num_splits): new_config = self.config.copy() part_num = i + 1 new_config['name'] = f"{self.base_name}-part{part_num}" new_config['description'] = f"{self.base_name.capitalize()} - Part {part_num}. {self.config.get('description', '')}" new_config['max_pages'] = self.target_pages # Remove split config from child if 'split_strategy' in new_config: del new_config['split_strategy'] if 'split_config' in new_config: del new_config['split_config'] configs.append(new_config) print(f"✅ Created {len(configs)} size-based configs ({self.target_pages} pages each)") return configs def create_router_config(self, sub_configs: List[Dict[str, Any]]) -> Dict[str, Any]: """Create a router config that references sub-skills""" router_name = self.config.get('split_config', {}).get('router_name', self.base_name) router_config = { "name": router_name, "description": self.config.get('description', ''), "base_url": self.config['base_url'], "selectors": self.config['selectors'], "url_patterns": self.config.get('url_patterns', {}), "rate_limit": self.config.get('rate_limit', 0.5), "max_pages": 500, # Router only needs overview pages "_router": True, "_sub_skills": [cfg['name'] for cfg in sub_configs], "_routing_keywords": { cfg['name']: list(cfg.get('categories', {}).keys()) for cfg in sub_configs } } return router_config def split(self) -> List[Dict[str, Any]]: """Execute split based on strategy""" strategy = self.get_split_strategy() print(f"\n{'='*60}") print(f"CONFIG SPLITTER: {self.base_name}") print(f"{'='*60}") print(f"Strategy: {strategy}") print(f"Target pages per skill: {self.target_pages}") print("") if strategy == "none": print("ℹ️ No splitting required") return [self.config] elif strategy == "category": return self.split_by_category(create_router=False) elif strategy == "router": create_router = self.config.get('split_config', {}).get('create_router', True) return self.split_by_category(create_router=create_router) elif strategy == "size": return self.split_by_size() else: print(f"❌ Error: Unknown strategy: {strategy}") sys.exit(1) def save_configs(self, configs: List[Dict[str, Any]], output_dir: Path = None) -> List[Path]: """Save configs to files""" if output_dir is None: output_dir = self.config_path.parent output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) saved_files = [] for config in configs: filename = f"{config['name']}.json" filepath = output_dir / filename with open(filepath, 'w') as f: json.dump(config, f, indent=2) saved_files.append(filepath) print(f" 💾 Saved: {filepath}") return saved_files def main(): parser = argparse.ArgumentParser( description="Split large documentation configs into multiple focused skills", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: # Auto-detect strategy python3 split_config.py configs/godot.json # Use category-based split python3 split_config.py configs/godot.json --strategy category # Use router + categories python3 split_config.py configs/godot.json --strategy router # Custom target size python3 split_config.py configs/godot.json --target-pages 3000 # Dry run (don't save files) python3 split_config.py configs/godot.json --dry-run Split Strategies: none - No splitting (single skill) auto - Automatically choose best strategy category - Split by categories defined in config router - Create router + category-based sub-skills size - Split by page count """ ) parser.add_argument( 'config', help='Path to config file (e.g., configs/godot.json)' ) parser.add_argument( '--strategy', choices=['auto', 'none', 'category', 'router', 'size'], default='auto', help='Splitting strategy (default: auto)' ) parser.add_argument( '--target-pages', type=int, default=5000, help='Target pages per skill (default: 5000)' ) parser.add_argument( '--output-dir', help='Output directory for configs (default: same as input)' ) parser.add_argument( '--dry-run', action='store_true', help='Show what would be created without saving files' ) args = parser.parse_args() # Create splitter splitter = ConfigSplitter(args.config, args.strategy, args.target_pages) # Split config configs = splitter.split() if args.dry_run: print(f"\n{'='*60}") print("DRY RUN - No files saved") print(f"{'='*60}") print(f"Would create {len(configs)} config files:") for cfg in configs: is_router = cfg.get('_router', False) router_marker = " (ROUTER)" if is_router else "" print(f" 📄 {cfg['name']}.json{router_marker}") else: print(f"\n{'='*60}") print("SAVING CONFIGS") print(f"{'='*60}") saved_files = splitter.save_configs(configs, args.output_dir) print(f"\n{'='*60}") print("NEXT STEPS") print(f"{'='*60}") print("1. Review generated configs") print("2. Scrape each config:") for filepath in saved_files: print(f" python3 cli/doc_scraper.py --config {filepath}") print("3. Package skills:") print(" python3 cli/package_multi.py configs/-*.json") print("") if __name__ == "__main__": main()