// GitHub MCP tool scraper: discovers tools via topic search, curated
// "awesome" lists, and official monorepos, then upserts deduplicated
// records into Supabase.
const axios = require('axios');
const { supabase, hydrateMemory } = require('./db');

// Base URL for all GitHub REST API calls.
const GITHUB_API = 'https://api.github.com';
// Shared request headers; GITHUB_TOKEN must be present in the environment
// or requests fall back to unauthenticated (heavily rate-limited) access.
const HEADERS = {
  Authorization: `token ${process.env.GITHUB_TOKEN}`,
  Accept: 'application/vnd.github.v3+json'
};
// HELPER: Extract GitHub repository links from Markdown text.
// Matches `[name](https://github.com/owner/repo)` followed by an optional
// `-`/`—` separated description on the same line.
const extractLinksFromMarkdown = (text) => {
  const linkPattern = /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)\s*(?:-|—)?\s*([^\n]+)?/g;
  return [...text.matchAll(linkPattern)].map(
    ([, name, url, owner, repo, description]) => ({
      name,
      url,
      owner,
      repo,
      description: description || ""
    })
  );
};
// PHASE A: Search repositories tagged with the `mcp-server` topic
// (up to 2 pages x 100 results). Returns an array of normalized tool records.
async function getTopicTools() {
  const tools = [];
  for (let page = 1; page <= 2; page++) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/search/repositories?q=topic:mcp-server&per_page=100&page=${page}`,
        { headers: HEADERS }
      );
      tools.push(...data.items.map((repo) => ({
        // Normalized ID shared with the other discovery phases so dedup works.
        id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(),
        name: repo.name,
        description: repo.description,
        url: repo.html_url,
        repo_url: repo.html_url,
        install_command: `npx -y ${repo.name}`,
        keywords: repo.topics || [],
        stars: repo.stargazers_count,
        last_updated: repo.pushed_at,
        is_awesome_listed: false,
        discovery_source: 'github_topic'
      })));
      // A short page means there are no further results; skip the next request.
      if (data.items.length < 100) break;
    } catch (e) {
      // FIX: include page and the actual error instead of swallowing details.
      console.error(`Topic search error (page ${page}):`, e.message);
    }
  }
  return tools;
}
// PHASE B: Fetch curated "awesome" list READMEs and extract every GitHub
// repo link as a tool record (marked is_awesome_listed).
async function getAwesomeTools() {
  const sources = [
    { owner: 'wong2', repo: 'awesome-mcp-servers' },
    { owner: 'punkpeye', repo: 'awesome-mcp-servers' }
  ];
  const tools = [];
  for (const s of sources) {
    try {
      const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
      // README content arrives base64-encoded from the GitHub API.
      const content = Buffer.from(data.content, 'base64').toString();
      for (const link of extractLinksFromMarkdown(content)) {
        tools.push({
          // Same ID format as the topic search so duplicates collapse later.
          id: `gh_${link.owner}_${link.repo}`.toLowerCase(),
          name: link.name,
          description: link.description,
          url: link.url,
          repo_url: link.url,
          install_command: `npx -y ${link.repo}`,
          is_awesome_listed: true,
          discovery_source: 'awesome_list'
        });
      }
    } catch (e) {
      // FIX: surface the underlying error message for debugging.
      console.error(`List ${s.repo} error:`, e.message);
    }
  }
  return tools;
}
// PHASE C: Walk known official monorepos and index each server subdirectory
// as a tool record.
async function getMonorepoTools() {
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
  const tools = [];
  for (const m of monorepos) {
    // Monorepos keep servers under different conventional directories; probe each.
    for (const path of ['src', 'servers', 'packages']) {
      try {
        const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
        // A file path returns an object; only directory listings are arrays.
        if (!Array.isArray(data)) continue;
        for (const dir of data.filter((f) => f.type === 'dir')) {
          tools.push({
            id: `gh_mono_${m.repo}_${dir.name}`.toLowerCase(),
            name: `${m.repo}-${dir.name}`,
            description: `Official tool: ${dir.name}`,
            url: dir.html_url,
            repo_url: `https://github.com/${m.owner}/${m.repo}`,
            install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
            is_awesome_listed: true,
            discovery_source: 'official_monorepo'
          });
        }
      } catch (e) {
        // FIX: a missing directory (404) is expected; anything else (rate
        // limit, auth failure) was previously swallowed silently — log it.
        if (!e.response || e.response.status !== 404) {
          console.error(`Monorepo ${m.repo}/${path} error:`, e.message);
        }
      }
    }
  }
  return tools;
}
// Merge two tool records that share the same normalized ID.
// FIX: the original dedup loop could re-set the stale `existing` object after
// an awesome-listed replacement (undoing it), and dropped the topic-search
// record's stars/keywords/last_updated entirely. Here the curated record's
// fields win, fields only the topic record carries survive the merge, and the
// highest star count seen is always kept.
function mergeToolRecords(existing, incoming) {
  const merged = incoming.is_awesome_listed && !existing.is_awesome_listed
    ? { ...existing, ...incoming } // curated fields win; topic-only fields survive
    : { ...existing };
  if ((incoming.stars || 0) > (merged.stars || 0)) {
    merged.stars = incoming.stars;
  }
  return merged;
}

// Entry point: run all discovery phases, deduplicate by ID, and upsert the
// result into the `mcp_tools` table before refreshing the in-memory cache.
async function runSmartScraper() {
  console.log("🚀 Aggressive Scraper Started...");
  try {
    // FIX: the three discovery phases are independent — run them in parallel
    // instead of awaiting each sequentially.
    const [topicTools, awesomeTools, monoTools] = await Promise.all([
      getTopicTools(),
      getAwesomeTools(),
      getMonorepoTools()
    ]);
    const allResults = [...topicTools, ...awesomeTools, ...monoTools];

    // Deduplicate by normalized ID, merging metadata across sources.
    const finalMap = new Map();
    for (const item of allResults) {
      const existing = finalMap.get(item.id);
      finalMap.set(item.id, existing ? mergeToolRecords(existing, item) : item);
    }

    const finalTools = Array.from(finalMap.values());
    console.log(`📊 Preparing to upsert ${finalTools.length} unique tools...`);
    const { error } = await supabase
      .from('mcp_tools')
      .upsert(finalTools, { onConflict: 'id' });
    if (error) throw error;
    console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
    await hydrateMemory();
  } catch (err) {
    console.error("❌ Scraper Error:", err.message);
  }
}

module.exports = { runSmartScraper };