#!/usr/bin/env python3
"""
Documentation to Claude Skill Converter

Scrape a documentation site and turn it into a high-quality Claude skill.

Usage:
    python3 doc_scraper.py --interactive
    python3 doc_scraper.py --config configs/godot.json
    python3 doc_scraper.py --url https://react.dev/ --name react
"""

import os
import sys
import json
import time
import re
import argparse
import hashlib
import requests
from pathlib import Path
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from collections import deque, defaultdict
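
# Third-party dependencies: requests and beautifulsoup4
# (e.g. pip install requests beautifulsoup4).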


class DocToSkillConverter:
    def __init__(self, config, dry_run=False, resume=False):
        self.config = config
        self.name = config['name']
        self.base_url = config['base_url']
        self.dry_run = dry_run
        self.resume = resume

        # Paths
        self.data_dir = f"output/{self.name}_data"
        self.skill_dir = f"output/{self.name}"
        self.checkpoint_file = f"{self.data_dir}/checkpoint.json"

        # Checkpoint config
        checkpoint_config = config.get('checkpoint', {})
        self.checkpoint_enabled = checkpoint_config.get('enabled', False)
        self.checkpoint_interval = checkpoint_config.get('interval', 1000)

        # State
        self.visited_urls = set()
        # Support multiple starting URLs
        start_urls = config.get('start_urls', [self.base_url])
        self.pending_urls = deque(start_urls)
        self.pages = []
        self.pages_scraped = 0

        # Create directories (unless dry-run)
        if not dry_run:
            os.makedirs(f"{self.data_dir}/pages", exist_ok=True)
            os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
            os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
            os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)

        # Load checkpoint if resuming
        if resume and not dry_run:
            self.load_checkpoint()
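
    # Resulting on-disk layout (derived from the paths above):
    #   output/<name>_data/pages/*.json    scraped page data
    #   output/<name>/references/          generated reference .md files
    #   output/<name>/scripts/, assets/    created empty, for helpers/templates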

    def is_valid_url(self, url):
        """Check if URL should be scraped"""
        if not url.startswith(self.base_url):
            return False
        # Include patterns
        includes = self.config.get('url_patterns', {}).get('include', [])
        if includes and not any(pattern in url for pattern in includes):
            return False
        # Exclude patterns
        excludes = self.config.get('url_patterns', {}).get('exclude', [])
        if any(pattern in url for pattern in excludes):
            return False
        return True
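
    # Example (illustrative): with base_url "https://docs.godotengine.org/en/stable/"
    # and exclude ["/search/"], a URL like ".../en/stable/search/index.html" is
    # rejected even though it starts with base_url.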

    def save_checkpoint(self):
        """Save progress checkpoint"""
        if not self.checkpoint_enabled or self.dry_run:
            return
        checkpoint_data = {
            "config": self.config,
            "visited_urls": list(self.visited_urls),
            "pending_urls": list(self.pending_urls),
            "pages_scraped": self.pages_scraped,
            "last_updated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "checkpoint_interval": self.checkpoint_interval
        }
        try:
            with open(self.checkpoint_file, 'w') as f:
                json.dump(checkpoint_data, f, indent=2)
            print(f"  💾 Checkpoint saved ({self.pages_scraped} pages)")
        except Exception as e:
            print(f"  ⚠️ Failed to save checkpoint: {e}")

    def load_checkpoint(self):
        """Load progress from checkpoint"""
        if not os.path.exists(self.checkpoint_file):
            print("ℹ️ No checkpoint found, starting fresh")
            return
        try:
            with open(self.checkpoint_file, 'r') as f:
                checkpoint_data = json.load(f)
            self.visited_urls = set(checkpoint_data["visited_urls"])
            self.pending_urls = deque(checkpoint_data["pending_urls"])
            self.pages_scraped = checkpoint_data["pages_scraped"]
            print("✅ Resumed from checkpoint")
            print(f"   Pages already scraped: {self.pages_scraped}")
            print(f"   URLs visited: {len(self.visited_urls)}")
            print(f"   URLs pending: {len(self.pending_urls)}")
            print(f"   Last updated: {checkpoint_data['last_updated']}")
            print("")
        except Exception as e:
            print(f"⚠️ Failed to load checkpoint: {e}")
            print("   Starting fresh")

    def clear_checkpoint(self):
        """Remove checkpoint file"""
        if os.path.exists(self.checkpoint_file):
            try:
                os.remove(self.checkpoint_file)
                print("✅ Checkpoint cleared")
            except Exception as e:
                print(f"⚠️ Failed to clear checkpoint: {e}")

    def extract_content(self, soup, url):
        """Extract content with improved code and pattern detection"""
        page = {
            'url': url,
            'title': '',
            'content': '',
            'headings': [],
            'code_samples': [],
            'patterns': [],  # NEW: extracted common patterns
            'links': []
        }
        selectors = self.config.get('selectors', {})

        # Extract title
        title_elem = soup.select_one(selectors.get('title', 'title'))
        if title_elem:
            page['title'] = self.clean_text(title_elem.get_text())

        # Find main content
        main_selector = selectors.get('main_content', 'div[role="main"]')
        main = soup.select_one(main_selector)
        if not main:
            print(f"⚠ No content: {url}")
            return page

        # Extract headings with their level and anchor id
        for h in main.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = self.clean_text(h.get_text())
            if text:
                page['headings'].append({
                    'level': h.name,
                    'text': text,
                    'id': h.get('id', '')
                })

        # Extract code with language detection
        code_selector = selectors.get('code_blocks', 'pre code')
        for code_elem in main.select(code_selector):
            code = code_elem.get_text()
            if len(code.strip()) > 10:
                # Try to detect language
                lang = self.detect_language(code_elem, code)
                page['code_samples'].append({
                    'code': code.strip(),
                    'language': lang
                })

        # Extract common code patterns (NEW)
        page['patterns'] = self.extract_patterns(main, page['code_samples'])

        # Extract paragraphs
        paragraphs = []
        for p in main.find_all('p'):
            text = self.clean_text(p.get_text())
            if text and len(text) > 20:  # Skip very short paragraphs
                paragraphs.append(text)
        page['content'] = '\n\n'.join(paragraphs)

        # Extract links
        for link in main.find_all('a', href=True):
            href = urljoin(url, link['href'])
            # Strip anchor fragments to avoid treating #anchors as separate pages
            href = href.split('#')[0]
            if self.is_valid_url(href) and href not in page['links']:
                page['links'].append(href)
        return page

    def detect_language(self, elem, code):
        """Detect programming language from code block"""
        # Check class attribute
        classes = elem.get('class', [])
        for cls in classes:
            if 'language-' in cls:
                return cls.replace('language-', '')
            if 'lang-' in cls:
                return cls.replace('lang-', '')
        # Check parent <pre> element
        parent = elem.parent
        if parent and parent.name == 'pre':
            classes = parent.get('class', [])
            for cls in classes:
                if 'language-' in cls:
                    return cls.replace('language-', '')
        # Heuristic detection
        if 'import ' in code and 'from ' in code:
            return 'python'
        if 'const ' in code or 'let ' in code or '=>' in code:
            return 'javascript'
        if 'func ' in code and 'var ' in code:
            return 'gdscript'
        if 'def ' in code and ':' in code:
            return 'python'
        if '#include' in code or 'int main' in code:
            return 'cpp'
        return 'unknown'
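
    # For example (illustrative): <code class="language-gdscript"> resolves to
    # 'gdscript' via the class check, while an untagged block containing
    # "def foo():" falls through to the 'python' heuristic.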

    def extract_patterns(self, main, code_samples):
        """Extract common coding patterns (NEW FEATURE)"""
        patterns = []
        # Look for "Example:" or "Pattern:" sections
        for elem in main.find_all(['p', 'div']):
            text = elem.get_text().lower()
            if any(word in text for word in ['example:', 'pattern:', 'usage:', 'typical use']):
                # Grab the code block that follows
                next_code = elem.find_next(['pre', 'code'])
                if next_code:
                    patterns.append({
                        'description': self.clean_text(elem.get_text()),
                        'code': next_code.get_text().strip()
                    })
        return patterns[:5]  # Limit to 5 most relevant patterns

    def clean_text(self, text):
        """Clean text content"""
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def save_page(self, page):
        """Save page data"""
        url_hash = hashlib.md5(page['url'].encode()).hexdigest()[:10]
        safe_title = re.sub(r'[^\w\s-]', '', page['title'])[:50]
        safe_title = re.sub(r'[-\s]+', '_', safe_title)
        filename = f"{safe_title}_{url_hash}.json"
        filepath = os.path.join(self.data_dir, "pages", filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(page, f, indent=2, ensure_ascii=False)
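
    # Filenames pair a slugified title with a short URL hash, e.g. (hash made
    # up) "Getting_Started_3f2a9c1d04.json", so distinct URLs sharing a title
    # cannot collide.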

    def scrape_page(self, url):
        """Scrape a single page"""
        try:
            print(f"  {url}")
            headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper)'}
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            page = self.extract_content(soup, url)
            self.save_page(page)
            self.pages.append(page)
            # Queue newly discovered URLs
            for link in page['links']:
                if link not in self.visited_urls and link not in self.pending_urls:
                    self.pending_urls.append(link)
            # Rate limiting
            time.sleep(self.config.get('rate_limit', 0.5))
        except Exception as e:
            print(f"  ✗ Error: {e}")

    def scrape_all(self):
        """Scrape all pages"""
        print(f"\n{'='*60}")
        if self.dry_run:
            print(f"DRY RUN: {self.name}")
        else:
            print(f"SCRAPING: {self.name}")
        print(f"{'='*60}")
        print(f"Base URL: {self.base_url}")
        if self.dry_run:
            print("Mode: Preview only (no actual scraping)\n")
        else:
            print(f"Output: {self.data_dir}\n")

        max_pages = self.config.get('max_pages', 500)
        # Dry run: preview only the first 20 URLs
        preview_limit = 20 if self.dry_run else max_pages

        while self.pending_urls and len(self.visited_urls) < preview_limit:
            url = self.pending_urls.popleft()
            if url in self.visited_urls:
                continue
            self.visited_urls.add(url)

            if self.dry_run:
                # Just show what would be scraped
                print(f"  [Preview] {url}")
                # Follow links to estimate crawl size without saving anything
                try:
                    headers = {'User-Agent': 'Mozilla/5.0 (Documentation Scraper - Dry Run)'}
                    response = requests.get(url, headers=headers, timeout=10)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    main_selector = self.config.get('selectors', {}).get('main_content', 'div[role="main"]')
                    main = soup.select_one(main_selector)
                    if main:
                        for link in main.find_all('a', href=True):
                            href = urljoin(url, link['href'])
                            if self.is_valid_url(href) and href not in self.visited_urls:
                                self.pending_urls.append(href)
                except Exception:
                    pass  # Ignore errors in dry run
            else:
                self.scrape_page(url)
                self.pages_scraped += 1
                # Save checkpoint at the configured interval
                if self.checkpoint_enabled and self.pages_scraped % self.checkpoint_interval == 0:
                    self.save_checkpoint()

            if len(self.visited_urls) % 10 == 0:
                print(f"  [{len(self.visited_urls)} pages]")

        if self.dry_run:
            print(f"\n✅ Dry run complete: would scrape ~{len(self.visited_urls)} pages")
            if len(self.visited_urls) >= preview_limit:
                print(f"   (showing first {preview_limit}, actual scraping may find more)")
            print("\n💡 To actually scrape, run without --dry-run")
        else:
            print(f"\n✅ Scraped {len(self.visited_urls)} pages")
            self.save_summary()

    def save_summary(self):
        """Save scraping summary"""
        summary = {
            'name': self.name,
            'total_pages': len(self.pages),
            'base_url': self.base_url,
            'pages': [{'title': p['title'], 'url': p['url']} for p in self.pages]
        }
        with open(f"{self.data_dir}/summary.json", 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

    def load_scraped_data(self):
        """Load previously scraped data"""
        pages = []
        pages_dir = Path(self.data_dir) / "pages"
        if not pages_dir.exists():
            return []
        for json_file in pages_dir.glob("*.json"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    pages.append(json.load(f))
            except Exception as e:
                print(f"⚠ Error loading {json_file}: {e}")
        return pages

    def smart_categorize(self, pages):
        """Improved categorization with better pattern matching"""
        category_defs = self.config.get('categories', {})
        # Fall back to inferred categories if none are provided
        if not category_defs:
            category_defs = self.infer_categories(pages)

        categories = {cat: [] for cat in category_defs.keys()}
        categories['other'] = []

        for page in pages:
            url = page['url'].lower()
            title = page['title'].lower()
            content = page.get('content', '').lower()[:500]  # Check first 500 chars
            categorized = False
            # Score each category's keywords against URL, title, and content
            for cat, keywords in category_defs.items():
                score = 0
                for keyword in keywords:
                    keyword = keyword.lower()
                    if keyword in url:
                        score += 3
                    if keyword in title:
                        score += 2
                    if keyword in content:
                        score += 1
                if score >= 2:  # Threshold for categorization
                    categories[cat].append(page)
                    categorized = True
                    break
            if not categorized:
                categories['other'].append(page)

        # Remove empty categories
        categories = {k: v for k, v in categories.items() if v}
        return categories
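
    # Scoring example (illustrative): a keyword that matches the URL alone
    # scores 3, clearing the threshold of 2, so the page is categorized; a
    # keyword found only in the content body scores 1, which is not enough on
    # its own, and the page may fall through to 'other'.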

    def infer_categories(self, pages):
        """Infer categories from URL patterns (IMPROVED)"""
        url_segments = defaultdict(int)
        for page in pages:
            path = urlparse(page['url']).path
            segments = [s for s in path.split('/') if s and s not in ['en', 'stable', 'latest', 'docs']]
            for seg in segments:
                url_segments[seg] += 1

        # The most frequent segments become categories
        top_segments = sorted(url_segments.items(), key=lambda x: x[1], reverse=True)[:8]
        categories = {}
        for seg, count in top_segments:
            if count >= 3:  # At least 3 pages
                categories[seg] = [seg]

        # Add common defaults
        urls = [p['url'] for p in pages]
        if 'tutorial' not in categories and any('tutorial' in url for url in urls):
            categories['tutorials'] = ['tutorial', 'guide', 'getting-started']
        if 'api' not in categories and any('api' in url or 'reference' in url for url in urls):
            categories['api'] = ['api', 'reference', 'class']
        return categories

    def generate_quick_reference(self, pages):
        """Generate quick reference from common patterns (NEW FEATURE)"""
        quick_ref = []
        # Collect all patterns
        all_patterns = []
        for page in pages:
            all_patterns.extend(page.get('patterns', []))
        # Keep short, previously unseen patterns
        seen_codes = set()
        for pattern in all_patterns:
            code = pattern['code']
            if code not in seen_codes and len(code) < 300:
                quick_ref.append(pattern)
                seen_codes.add(code)
            if len(quick_ref) >= 15:
                break
        return quick_ref

    def create_reference_file(self, category, pages):
        """Create enhanced reference file"""
        if not pages:
            return
        lines = []
        lines.append(f"# {self.name.title()} - {category.replace('_', ' ').title()}\n")
        lines.append(f"**Pages:** {len(pages)}\n")
        lines.append("---\n")

        for page in pages:
            lines.append(f"## {page['title']}\n")
            lines.append(f"**URL:** {page['url']}\n")

            # Table of contents from headings
            if page.get('headings'):
                lines.append("**Contents:**")
                for h in page['headings'][:10]:
                    level = int(h['level'][1]) if len(h['level']) > 1 else 1
                    indent = "  " * max(0, level - 2)
                    lines.append(f"{indent}- {h['text']}")
                lines.append("")

            # Content
            if page.get('content'):
                content = page['content'][:2500]
                if len(page['content']) > 2500:
                    content += "\n\n*[Content truncated]*"
                lines.append(content)
                lines.append("")

            # Code examples with language
            if page.get('code_samples'):
                lines.append("**Examples:**\n")
                for i, sample in enumerate(page['code_samples'][:4], 1):
                    # Samples are dicts from extract_content, but tolerate plain strings
                    lang = sample.get('language', 'unknown') if isinstance(sample, dict) else 'unknown'
                    code = sample.get('code', '') if isinstance(sample, dict) else str(sample)
                    lines.append(f"Example {i} ({lang}):")
                    lines.append(f"```{lang}")
                    lines.append(code[:600])
                    if len(code) > 600:
                        lines.append("...")
                    lines.append("```\n")
            lines.append("---\n")

        filepath = os.path.join(self.skill_dir, "references", f"{category}.md")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
        print(f"  ✓ {category}.md ({len(pages)} pages)")

    def create_enhanced_skill_md(self, categories, quick_ref):
        """Create SKILL.md with actual examples (IMPROVED)"""
        description = self.config.get('description', f'Comprehensive assistance with {self.name}')

        # Pull short, language-tagged code examples from the docs
        example_codes = []
        for pages in categories.values():
            for page in pages[:3]:  # First 3 pages per category
                for sample in page.get('code_samples', [])[:2]:  # First 2 samples per page
                    # Samples are dicts from extract_content, but tolerate plain strings
                    code = sample.get('code', '') if isinstance(sample, dict) else str(sample)
                    lang = sample.get('language', 'unknown') if isinstance(sample, dict) else 'unknown'
                    if len(code) < 200 and lang != 'unknown':
                        example_codes.append((lang, code))
                    if len(example_codes) >= 10:
                        break
                if len(example_codes) >= 10:
                    break
            if len(example_codes) >= 10:
                break

        content = f"""---
name: {self.name}
description: {description}
---

# {self.name.title()} Skill

Comprehensive assistance with {self.name} development, generated from official documentation.

## When to Use This Skill

This skill should be triggered when:
- Working with {self.name}
- Asking about {self.name} features or APIs
- Implementing {self.name} solutions
- Debugging {self.name} code
- Learning {self.name} best practices

## Quick Reference

### Common Patterns

"""
        # Add actual quick reference patterns
        if quick_ref:
            for i, pattern in enumerate(quick_ref[:8], 1):
                content += f"**Pattern {i}:** {pattern.get('description', 'Example pattern')}\n\n"
                content += "```\n"
                content += pattern.get('code', '')[:300]
                content += "\n```\n\n"
        else:
            content += "*Quick reference patterns will be added as you use the skill.*\n\n"

        # Add example codes from the docs
        if example_codes:
            content += "### Example Code Patterns\n\n"
            for i, (lang, code) in enumerate(example_codes[:5], 1):
                content += f"**Example {i}** ({lang}):\n```{lang}\n{code}\n```\n\n"

        content += """## Reference Files

This skill includes comprehensive documentation in `references/`:

"""
        for cat in sorted(categories.keys()):
            content += f"- **{cat}.md** - {cat.replace('_', ' ').title()} documentation\n"

        content += """
Use `view` to read specific reference files when detailed information is needed.

## Working with This Skill

### For Beginners

Start with the getting_started or tutorials reference files for foundational concepts.

### For Specific Features

Use the appropriate category reference file (api, guides, etc.) for detailed information.

### For Code Examples

The quick reference section above contains common patterns extracted from the official docs.

## Resources

### references/

Organized documentation extracted from official sources. These files contain:
- Detailed explanations
- Code examples with language annotations
- Links to original documentation
- Table of contents for quick navigation

### scripts/

Add helper scripts here for common automation tasks.

### assets/

Add templates, boilerplate, or example projects here.

## Notes

- This skill was automatically generated from official documentation
- Reference files preserve the structure and examples from source docs
- Code examples include language detection for better syntax highlighting
- Quick reference patterns are extracted from common usage examples in the docs

## Updating

To refresh this skill with updated documentation:
1. Re-run the scraper with the same configuration
2. The skill will be rebuilt with the latest information
"""
        filepath = os.path.join(self.skill_dir, "SKILL.md")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"  ✓ SKILL.md (enhanced with {len(example_codes)} examples)")

    def create_index(self, categories):
        """Create navigation index"""
        lines = []
        lines.append(f"# {self.name.title()} Documentation Index\n")
        lines.append("## Categories\n")
        for cat, pages in sorted(categories.items()):
            lines.append(f"### {cat.replace('_', ' ').title()}")
            lines.append(f"**File:** `{cat}.md`")
            lines.append(f"**Pages:** {len(pages)}\n")
        filepath = os.path.join(self.skill_dir, "references", "index.md")
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write('\n'.join(lines))
        print("  ✓ index.md")

    def build_skill(self):
        """Build the skill from scraped data"""
        print(f"\n{'='*60}")
        print(f"BUILDING SKILL: {self.name}")
        print(f"{'='*60}\n")

        # Load data
        print("Loading scraped data...")
        pages = self.load_scraped_data()
        if not pages:
            print("✗ No scraped data found!")
            return False
        print(f"  ✓ Loaded {len(pages)} pages\n")

        # Categorize
        print("Categorizing pages...")
        categories = self.smart_categorize(pages)
        print(f"  ✓ Created {len(categories)} categories\n")

        # Generate quick reference
        print("Generating quick reference...")
        quick_ref = self.generate_quick_reference(pages)
        print(f"  ✓ Extracted {len(quick_ref)} patterns\n")

        # Create reference files
        print("Creating reference files...")
        for cat, cat_pages in categories.items():
            self.create_reference_file(cat, cat_pages)

        # Create index
        self.create_index(categories)
        print()

        # Create enhanced SKILL.md
        print("Creating SKILL.md...")
        self.create_enhanced_skill_md(categories, quick_ref)

        print(f"\n✅ Skill built: {self.skill_dir}/")
        return True


def validate_config(config):
    """Validate configuration structure"""
    errors = []
    warnings = []

    # Required fields
    required_fields = ['name', 'base_url']
    for field in required_fields:
        if field not in config:
            errors.append(f"Missing required field: '{field}'")

    # Validate name (alphanumeric, hyphens, underscores only)
    if 'name' in config:
        if not re.match(r'^[a-zA-Z0-9_-]+$', config['name']):
            errors.append(f"Invalid name: '{config['name']}' (use only letters, numbers, hyphens, underscores)")

    # Validate base_url
    if 'base_url' in config:
        if not config['base_url'].startswith(('http://', 'https://')):
            errors.append(f"Invalid base_url: '{config['base_url']}' (must start with http:// or https://)")

    # Validate selectors structure
    if 'selectors' in config:
        if not isinstance(config['selectors'], dict):
            errors.append("'selectors' must be a dictionary")
        else:
            recommended_selectors = ['main_content', 'title', 'code_blocks']
            for selector in recommended_selectors:
                if selector not in config['selectors']:
                    warnings.append(f"Missing recommended selector: '{selector}'")
    else:
        warnings.append("Missing 'selectors' section (recommended)")

    # Validate url_patterns
    if 'url_patterns' in config:
        if not isinstance(config['url_patterns'], dict):
            errors.append("'url_patterns' must be a dictionary")
        else:
            for key in ['include', 'exclude']:
                if key in config['url_patterns']:
                    if not isinstance(config['url_patterns'][key], list):
                        errors.append(f"'url_patterns.{key}' must be a list")

    # Validate categories
    if 'categories' in config:
        if not isinstance(config['categories'], dict):
            errors.append("'categories' must be a dictionary")
        else:
            for cat_name, keywords in config['categories'].items():
                if not isinstance(keywords, list):
                    errors.append(f"'categories.{cat_name}' must be a list of keywords")

    # Validate rate_limit
    if 'rate_limit' in config:
        try:
            rate = float(config['rate_limit'])
            if rate < 0:
                errors.append(f"'rate_limit' must be non-negative (got {rate})")
            elif rate > 10:
                warnings.append(f"'rate_limit' is very high ({rate}s) - this may slow down scraping significantly")
        except (ValueError, TypeError):
            errors.append(f"'rate_limit' must be a number (got {config['rate_limit']})")

    # Validate max_pages
    if 'max_pages' in config:
        try:
            max_p = int(config['max_pages'])
            if max_p < 1:
                errors.append(f"'max_pages' must be at least 1 (got {max_p})")
            elif max_p > 10000:
                warnings.append(f"'max_pages' is very high ({max_p}) - scraping may take a very long time")
        except (ValueError, TypeError):
            errors.append(f"'max_pages' must be an integer (got {config['max_pages']})")

    # Validate start_urls if present
    if 'start_urls' in config:
        if not isinstance(config['start_urls'], list):
            errors.append("'start_urls' must be a list")
        else:
            for url in config['start_urls']:
                if not url.startswith(('http://', 'https://')):
                    errors.append(f"Invalid start_url: '{url}' (must start with http:// or https://)")

    return errors, warnings
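
# For instance (illustrative), validate_config({'name': 'my skill'}) returns
# errors for the missing 'base_url' and the space in 'my skill', plus a
# warning about the missing 'selectors' section.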


def load_config(config_path):
    """Load and validate configuration from file"""
    try:
        with open(config_path, 'r') as f:
            config = json.load(f)
    except json.JSONDecodeError as e:
        print(f"❌ Error: Invalid JSON in config file: {e}")
        sys.exit(1)
    except FileNotFoundError:
        print(f"❌ Error: Config file not found: {config_path}")
        sys.exit(1)

    # Validate config
    errors, warnings = validate_config(config)

    # Show warnings (non-blocking)
    if warnings:
        print(f"⚠️ Configuration warnings in {config_path}:")
        for warning in warnings:
            print(f"   - {warning}")
        print()

    # Show errors (blocking)
    if errors:
        print(f"❌ Configuration validation errors in {config_path}:")
        for error in errors:
            print(f"   - {error}")
        sys.exit(1)

    return config


def interactive_config():
    """Interactive configuration"""
    print("\n" + "="*60)
    print("Documentation to Skill Converter")
    print("="*60 + "\n")
    config = {}

    # Basic info
    config['name'] = input("Skill name (e.g., 'react', 'godot'): ").strip()
    config['description'] = input("Skill description: ").strip()
    config['base_url'] = input("Base URL (e.g., https://docs.example.com/): ").strip()
    if not config['base_url'].endswith('/'):
        config['base_url'] += '/'

    # Selectors
    print("\nCSS Selectors (press Enter for defaults):")
    selectors = {}
    selectors['main_content'] = input("  Main content [div[role='main']]: ").strip() or "div[role='main']"
    selectors['title'] = input("  Title [title]: ").strip() or "title"
    selectors['code_blocks'] = input("  Code blocks [pre code]: ").strip() or "pre code"
    config['selectors'] = selectors

    # URL patterns
    print("\nURL Patterns (comma-separated, optional):")
    include = input("  Include: ").strip()
    exclude = input("  Exclude: ").strip()
    config['url_patterns'] = {
        'include': [p.strip() for p in include.split(',') if p.strip()],
        'exclude': [p.strip() for p in exclude.split(',') if p.strip()]
    }

    # Settings
    rate = input("\nRate limit (seconds) [0.5]: ").strip()
    config['rate_limit'] = float(rate) if rate else 0.5
    max_p = input("Max pages [500]: ").strip()
    config['max_pages'] = int(max_p) if max_p else 500
    return config


def check_existing_data(name):
    """Check if scraped data already exists"""
    data_dir = f"output/{name}_data"
    if os.path.exists(data_dir) and os.path.exists(f"{data_dir}/summary.json"):
        with open(f"{data_dir}/summary.json", 'r') as f:
            summary = json.load(f)
        return True, summary.get('total_pages', 0)
    return False, 0


def main():
    parser = argparse.ArgumentParser(
        description='Convert documentation websites to Claude skills',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('--interactive', '-i', action='store_true',
                        help='Interactive configuration mode')
    parser.add_argument('--config', '-c', type=str,
                        help='Load configuration from file (e.g., configs/godot.json)')
    parser.add_argument('--name', type=str,
                        help='Skill name')
    parser.add_argument('--url', type=str,
                        help='Base documentation URL')
    parser.add_argument('--description', '-d', type=str,
                        help='Skill description')
    parser.add_argument('--skip-scrape', action='store_true',
                        help='Skip scraping, use existing data')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview what will be scraped without actually scraping')
    parser.add_argument('--enhance', action='store_true',
                        help='Enhance SKILL.md using the Claude API after building (requires API key)')
    parser.add_argument('--enhance-local', action='store_true',
                        help='Enhance SKILL.md using Claude Code in a new terminal (no API key needed)')
    parser.add_argument('--api-key', type=str,
                        help='Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)')
    parser.add_argument('--resume', action='store_true',
                        help='Resume from last checkpoint (for interrupted scrapes)')
    parser.add_argument('--fresh', action='store_true',
                        help='Clear checkpoint and start fresh')
    args = parser.parse_args()

    # Get configuration
    if args.config:
        config = load_config(args.config)
    elif args.interactive or not (args.name and args.url):
        config = interactive_config()
    else:
        config = {
            'name': args.name,
            'description': args.description or f'Comprehensive assistance with {args.name}',
            'base_url': args.url,
            'selectors': {
                'main_content': "div[role='main']",
                'title': 'title',
                'code_blocks': 'pre code'
            },
            'url_patterns': {'include': [], 'exclude': []},
            'rate_limit': 0.5,
            'max_pages': 500
        }

    # Dry run mode - preview only
    if args.dry_run:
        print(f"\n{'='*60}")
        print("DRY RUN MODE")
        print(f"{'='*60}")
        print("This will show what would be scraped without saving anything.\n")
        converter = DocToSkillConverter(config, dry_run=True)
        converter.scrape_all()
        print("\n📋 Configuration Summary:")
        print(f"   Name: {config['name']}")
        print(f"   Base URL: {config['base_url']}")
        print(f"   Max pages: {config.get('max_pages', 500)}")
        print(f"   Rate limit: {config.get('rate_limit', 0.5)}s")
        print(f"   Categories: {len(config.get('categories', {}))}")
        return

    # Check for existing data
    exists, page_count = check_existing_data(config['name'])
    if exists and not args.skip_scrape:
        print(f"\n✓ Found existing data: {page_count} pages")
        response = input("Use existing data? (y/n): ").strip().lower()
        if response == 'y':
            args.skip_scrape = True

    # Create converter
    converter = DocToSkillConverter(config, resume=args.resume)

    # Handle fresh start (clear checkpoint)
    if args.fresh:
        converter.clear_checkpoint()

    # Scrape or skip
    if not args.skip_scrape:
        try:
            converter.scrape_all()
            # Save final checkpoint, then clear it after successful completion
            if converter.checkpoint_enabled:
                converter.save_checkpoint()
                print("\n💾 Final checkpoint saved")
                converter.clear_checkpoint()
                print("✅ Scraping complete - checkpoint cleared")
        except KeyboardInterrupt:
            print("\n\nScraping interrupted.")
            if converter.checkpoint_enabled:
                converter.save_checkpoint()
                print("💾 Progress saved to checkpoint")
                print(f"   Resume with: --config {args.config if args.config else 'config.json'} --resume")
            response = input("Continue with skill building? (y/n): ").strip().lower()
            if response != 'y':
                return
    else:
        print("\n⏭️ Skipping scrape, using existing data")

    # Build skill
    success = converter.build_skill()
    if not success:
        sys.exit(1)

    # Optional enhancement with the Claude API
    if args.enhance:
        print(f"\n{'='*60}")
        print("ENHANCING SKILL.MD WITH CLAUDE API")
        print(f"{'='*60}\n")
        try:
            import subprocess
            enhance_cmd = ['python3', 'enhance_skill.py', f'output/{config["name"]}/']
            if args.api_key:
                enhance_cmd.extend(['--api-key', args.api_key])
            # check=True raises CalledProcessError on any non-zero exit
            subprocess.run(enhance_cmd, check=True)
            print("\n✅ Enhancement complete!")
        except subprocess.CalledProcessError:
            print("\n⚠ Enhancement failed, but skill was still built")
        except FileNotFoundError:
            print("\n⚠ enhance_skill.py not found. Run manually:")
            print(f"   python3 enhance_skill.py output/{config['name']}/")

    # Optional enhancement with Claude Code (local, no API key)
    if args.enhance_local:
        print(f"\n{'='*60}")
        print("ENHANCING SKILL.MD WITH CLAUDE CODE (LOCAL)")
        print(f"{'='*60}\n")
        try:
            import subprocess
            enhance_cmd = ['python3', 'enhance_skill_local.py', f'output/{config["name"]}/']
            subprocess.run(enhance_cmd, check=True)
        except subprocess.CalledProcessError:
            print("\n⚠ Enhancement failed, but skill was still built")
        except FileNotFoundError:
            print("\n⚠ enhance_skill_local.py not found. Run manually:")
            print(f"   python3 enhance_skill_local.py output/{config['name']}/")

    print("\n📦 Package your skill:")
    print(f"   python3 package_skill.py output/{config['name']}/")

    if not args.enhance and not args.enhance_local:
        print("\n💡 Optional: Enhance SKILL.md with Claude:")
        print(f"   API-based: python3 enhance_skill.py output/{config['name']}/")
        print("              or re-run with: --enhance")
        print(f"   Local (no API key): python3 enhance_skill_local.py output/{config['name']}/")
        print("              or re-run with: --enhance-local")


if __name__ == "__main__":
    main()