// MCP tool scraper: discovers MCP servers via GitHub topic search,
// curated awesome-lists, and the official monorepo, then upserts the
// normalized records into the Supabase `mcp_tools` table.
const axios = require('axios');
const { supabase, hydrateMemory } = require('./db');

// Base URL for all GitHub REST v3 calls below.
const GITHUB_API = 'https://api.github.com';

// Shared auth headers for every GitHub API request.
// Assumes GITHUB_TOKEN is set in the environment — TODO confirm at deploy time.
const HEADERS = {
  Authorization: `token ${process.env.GITHUB_TOKEN}`,
  Accept: 'application/vnd.github.v3+json'
};
// HELPER: Pull GitHub repo links (with optional trailing "- description")
// out of a Markdown document.
// Returns [{ name, url, owner, repo, description }] — description is "" when absent.
const extractLinksFromMarkdown = (text) => {
  const linkPattern =
    /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)\s*(?:-|—)?\s*([^\n]+)?/g;
  return [...text.matchAll(linkPattern)].map(
    ([, name, url, owner, repo, description]) => ({
      name,
      url,
      owner,
      repo,
      description: description || ""
    })
  );
};
// PHASE A: Search Topics
/**
 * Discover MCP servers via the GitHub repository search API
 * (q=topic:mcp-server), fetching up to 2 pages of 100 results each.
 * Failures on a page are logged and skipped so one bad page doesn't
 * abort the whole phase.
 * @returns {Promise<Array<Object>>} normalized tool records
 */
async function getTopicTools() {
  const tools = [];
  for (let page = 1; page <= 2; page++) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/search/repositories?q=topic:mcp-server&per_page=100&page=${page}`,
        { headers: HEADERS }
      );
      tools.push(...data.items.map(repo => ({
        id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(), // Normalized ID — same format as the other phases
        name: repo.name,
        description: repo.description,
        url: repo.html_url,
        repo_url: repo.html_url,
        install_command: `npx -y ${repo.name}`,
        keywords: repo.topics || [],
        stars: repo.stargazers_count,
        last_updated: repo.pushed_at,
        is_awesome_listed: false,
        discovery_source: 'github_topic'
      })));
    } catch (e) {
      // Include the page and error detail so rate-limit/auth failures are diagnosable
      // (the old bare "Topic search error" message discarded e.message).
      console.error(`Topic search error (page ${page}):`, e.message);
    }
  }
  return tools;
}
// PHASE B: Awesome Lists
/**
 * Discover MCP servers by parsing the READMEs of curated "awesome" lists.
 * README content arrives base64-encoded from the GitHub contents API and is
 * decoded before link extraction. Failures on one list are logged and skipped.
 * @returns {Promise<Array<Object>>} normalized tool records (is_awesome_listed: true)
 */
async function getAwesomeTools() {
  const sources = [
    { owner: 'wong2', repo: 'awesome-mcp-servers' },
    { owner: 'punkpeye', repo: 'awesome-mcp-servers' }
  ];
  const tools = [];
  for (const s of sources) {
    try {
      const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
      const content = Buffer.from(data.content, 'base64').toString();
      const links = extractLinksFromMarkdown(content);
      links.forEach(link => {
        tools.push({
          id: `gh_${link.owner}_${link.repo}`.toLowerCase(), // Same ID format as Topic Search, so duplicates collapse later
          name: link.name,
          description: link.description,
          url: link.url,
          repo_url: link.url,
          install_command: `npx -y ${link.repo}`,
          is_awesome_listed: true,
          discovery_source: 'awesome_list'
        });
      });
    } catch (e) {
      // Surface the error detail (rate limit, 404, auth) instead of a bare message.
      console.error(`List ${s.repo} error:`, e.message);
    }
  }
  return tools;
}
// PHASE C: Monorepos
/**
 * Discover official MCP servers shipped inside known monorepos by listing
 * the candidate directories 'src', 'servers', and 'packages' in each repo.
 * Each subdirectory becomes one tool record.
 * @returns {Promise<Array<Object>>} normalized tool records
 */
async function getMonorepoTools() {
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
  const tools = [];
  for (const m of monorepos) {
    for (const path of ['src', 'servers', 'packages']) {
      try {
        const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
        if (Array.isArray(data)) {
          data.filter(f => f.type === 'dir').forEach(dir => {
            tools.push({
              id: `gh_mono_${m.repo}_${dir.name}`.toLowerCase(),
              name: `${m.repo}-${dir.name}`,
              description: `Official tool: ${dir.name}`,
              url: dir.html_url,
              repo_url: `https://github.com/${m.owner}/${m.repo}`,
              install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
              is_awesome_listed: true,
              discovery_source: 'official_monorepo'
            });
          });
        }
      } catch (e) {
        // A 404 just means this candidate path doesn't exist in the repo — expected,
        // keep the original best-effort silence. Anything else (rate limit, auth,
        // network) was previously swallowed entirely; surface it.
        if (e.response?.status !== 404) {
          console.error(`Monorepo scan error (${m.repo}/${path}):`, e.message);
        }
      }
    }
  }
  return tools;
}
/**
 * Entry point: run all three discovery phases, deduplicate the results by
 * normalized id, upsert the unique tools into Supabase, then rehydrate the
 * in-memory cache. Errors are logged, never thrown to the caller.
 * @returns {Promise<void>}
 */
async function runSmartScraper() {
  console.log("🚀 Aggressive Scraper Started...");
  try {
    // The three discovery phases are independent — run them concurrently.
    const [topicTools, awesomeTools, monoTools] = await Promise.all([
      getTopicTools(),
      getAwesomeTools(),
      getMonorepoTools()
    ]);
    const allResults = [...topicTools, ...awesomeTools, ...monoTools];

    // --- DEDUPLICATE BY ID ---
    // Prefer the awesome-listed record, but keep the highest known star count
    // from either duplicate. (BUG FIX: previously an awesome-list duplicate
    // without `stars` replaced a topic-search record outright; the comparison
    // `undefined > stars` is false, so the stargazer count was silently lost.)
    const finalMap = new Map();
    for (const item of allResults) {
      const existing = finalMap.get(item.id);
      if (!existing) {
        finalMap.set(item.id, item);
        continue;
      }
      const winner = (item.is_awesome_listed && !existing.is_awesome_listed) ? item : existing;
      const loser = winner === item ? existing : item;
      const bestStars = Math.max(winner.stars ?? -1, loser.stars ?? -1);
      if (bestStars >= 0) winner.stars = bestStars; // only set when at least one record had stars
      finalMap.set(item.id, winner);
    }

    const finalTools = Array.from(finalMap.values());
    console.log(`📊 Preparing to upsert ${finalTools.length} unique tools...`);
    const { error } = await supabase
      .from('mcp_tools')
      .upsert(finalTools, { onConflict: 'id' });
    if (error) throw error;
    console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
    await hydrateMemory();
  } catch (err) {
    console.error("❌ Scraper Error:", err.message);
  }
}
module.exports = { runSmartScraper }; |