const axios = require('axios');
const { supabase, hydrateMemory } = require('./db');

const GITHUB_API = 'https://api.github.com';

// Only attach Authorization when a token is configured; interpolating an
// unset variable would send the literal header value "token undefined".
const HEADERS = { Accept: 'application/vnd.github.v3+json' };
if (process.env.GITHUB_TOKEN) {
  HEADERS.Authorization = `token ${process.env.GITHUB_TOKEN}`;
}

// Page size and page count used for the topic search.
const TOPIC_PAGE_SIZE = 100;
const TOPIC_MAX_PAGES = 2;

// HELPER: Extract links from Markdown
/**
 * Finds Markdown links that point at `https://github.com/<owner>/<repo>` and
 * returns one entry per link, with the remainder of the same line (after an
 * optional "-" or "—" separator) as the description.
 *
 * @param {string} text - Markdown source.
 * @returns {Array<{name: string, url: string, owner: string, repo: string, description: string}>}
 */
const extractLinksFromMarkdown = (text) => {
  const source = typeof text === 'string' ? text : '';
  // `[ \t]*` (not `\s*`) keeps the description on the link's own line. With
  // `\s*` a link without a description swallowed the newline, captured the
  // following line as its description, and the link on that line was skipped.
  const regex = /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)[ \t]*(?:-|—)?[ \t]*([^\n]+)?/g;
  const links = [];
  let match;
  while ((match = regex.exec(source)) !== null) {
    links.push({
      name: match[1],
      url: match[2],
      owner: match[3],
      repo: match[4],
      // trim() also drops a trailing "\r" from CRLF files.
      description: (match[5] || '').trim(),
    });
  }
  return links;
};

// HELPER: Deduplication
/** @returns {boolean} true when a field carries no usable value. */
const isMissing = (value) => value === undefined || value === null || value === '';

/**
 * Combines two records that share the same id without mutating either one.
 * The awesome-listed record wins field-by-field; fields it lacks are filled
 * from the other record, and the larger star count is kept.
 *
 * @param {object} existing - Record already stored for this id.
 * @param {object} incoming - Newly seen record with the same id.
 * @returns {object} A new merged record.
 */
function mergeToolRecords(existing, incoming) {
  const preferIncoming = Boolean(incoming.is_awesome_listed) && !existing.is_awesome_listed;
  const primary = preferIncoming ? incoming : existing;
  const secondary = preferIncoming ? existing : incoming;

  const merged = { ...primary };
  for (const [key, value] of Object.entries(secondary)) {
    if (isMissing(merged[key]) && !isMissing(value)) {
      merged[key] = value;
    }
  }

  merged.is_awesome_listed = Boolean(existing.is_awesome_listed || incoming.is_awesome_listed);
  if (typeof existing.stars === 'number' && typeof incoming.stars === 'number') {
    merged.stars = Math.max(existing.stars, incoming.stars);
  }
  return merged;
}

/**
 * Collapses records with the same `id` into one, preserving first-seen order.
 *
 * @param {object[]} tools - Records from all discovery phases.
 * @returns {object[]} One record per unique id.
 */
function dedupeToolsById(tools) {
  const byId = new Map();
  for (const tool of tools) {
    const existing = byId.get(tool.id);
    byId.set(tool.id, existing ? mergeToolRecords(existing, tool) : tool);
  }
  return Array.from(byId.values());
}

// PHASE A: Search Topics
/**
 * Searches GitHub for repositories tagged `mcp-server`.
 * A failed page is logged and skipped so the remaining pages are still tried.
 *
 * @returns {Promise<object[]>} Tool records with `discovery_source: 'github_topic'`.
 */
async function getTopicTools() {
  const tools = [];
  for (let page = 1; page <= TOPIC_MAX_PAGES; page++) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/search/repositories?q=topic:mcp-server&per_page=${TOPIC_PAGE_SIZE}&page=${page}`,
        { headers: HEADERS }
      );
      tools.push(...data.items.map((repo) => ({
        id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(), // Normalized ID
        name: repo.name,
        description: repo.description,
        url: repo.html_url,
        repo_url: repo.html_url,
        install_command: `npx -y ${repo.name}`,
        keywords: repo.topics || [],
        stars: repo.stargazers_count,
        last_updated: repo.pushed_at,
        is_awesome_listed: false,
        discovery_source: 'github_topic',
      })));
      // A short page means there is nothing further to fetch.
      if (data.items.length < TOPIC_PAGE_SIZE) {
        break;
      }
    } catch (e) {
      console.error(`Topic search error (page ${page}):`, e.message);
    }
  }
  return tools;
}

// PHASE B: Awesome Lists
/**
 * Reads the README of each curated "awesome" list and turns every GitHub
 * repository link in it into a tool record.
 * A list that fails to load is logged and skipped.
 *
 * @returns {Promise<object[]>} Tool records with `discovery_source: 'awesome_list'`.
 */
async function getAwesomeTools() {
  const sources = [
    { owner: 'wong2', repo: 'awesome-mcp-servers' },
    { owner: 'punkpeye', repo: 'awesome-mcp-servers' },
  ];
  const tools = [];
  for (const s of sources) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`,
        { headers: HEADERS }
      );
      // The readme endpoint returns the file body base64-encoded.
      const content = Buffer.from(data.content, 'base64').toString('utf8');
      for (const link of extractLinksFromMarkdown(content)) {
        tools.push({
          id: `gh_${link.owner}_${link.repo}`.toLowerCase(), // Same ID format as Topic Search
          name: link.name,
          description: link.description,
          url: link.url,
          repo_url: link.url,
          install_command: `npx -y ${link.repo}`,
          is_awesome_listed: true,
          discovery_source: 'awesome_list',
        });
      }
    } catch (e) {
      console.error(`List ${s.repo} error:`, e.message);
    }
  }
  return tools;
}

// PHASE C: Monorepos
/**
 * Lists the sub-directories of known monorepos under a few candidate paths
 * and emits one tool record per directory.
 *
 * @returns {Promise<object[]>} Tool records with `discovery_source: 'official_monorepo'`.
 */
async function getMonorepoTools() {
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
  const tools = [];
  for (const m of monorepos) {
    for (const path of ['src', 'servers', 'packages']) {
      try {
        const { data } = await axios.get(
          `${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`,
          { headers: HEADERS }
        );
        if (!Array.isArray(data)) {
          continue;
        }
        for (const dir of data.filter((f) => f.type === 'dir')) {
          tools.push({
            id: `gh_mono_${m.repo}_${dir.name}`.toLowerCase(),
            name: `${m.repo}-${dir.name}`,
            description: `Official tool: ${dir.name}`,
            url: dir.html_url,
            repo_url: `https://github.com/${m.owner}/${m.repo}`,
            install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
            is_awesome_listed: true,
            discovery_source: 'official_monorepo',
          });
        }
      } catch (e) {
        // The candidate paths are probes, so a 404 just means this repo does
        // not use that layout. Anything else (rate limit, auth, network) is
        // reported instead of being silently dropped.
        const status = e.response && e.response.status;
        if (status !== 404) {
          console.error(`Monorepo ${m.owner}/${m.repo}/${path} error:`, e.message);
        }
      }
    }
  }
  return tools;
}

/**
 * Runs all discovery phases, deduplicates the results by id, upserts them
 * into the `mcp_tools` table and then calls `hydrateMemory()`.
 * Errors are logged rather than thrown.
 *
 * @returns {Promise<void>}
 */
async function runSmartScraper() {
  console.log("🚀 Aggressive Scraper Started...");
  try {
    // Phases run one after another on purpose: all of them hit the GitHub
    // API, and serial requests stay clear of its secondary rate limits.
    const topicTools = await getTopicTools();
    const awesomeTools = await getAwesomeTools();
    const monoTools = await getMonorepoTools();

    // Merge duplicates instead of replacing them, so an awesome-listed entry
    // keeps the stars/keywords/last_updated found by the topic search.
    const finalTools = dedupeToolsById([...topicTools, ...awesomeTools, ...monoTools]);

    console.log(`📊 Preparing to upsert ${finalTools.length} unique tools...`);
    const { error } = await supabase
      .from('mcp_tools')
      .upsert(finalTools, { onConflict: 'id' });
    if (error) throw error;

    console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
    await hydrateMemory();
  } catch (err) {
    console.error("❌ Scraper Error:", err.message);
  }
}

module.exports = { runSmartScraper };