Spaces:
Running
Running
Delete scraper.js
Browse files- scraper.js +0 -161
scraper.js
DELETED
|
@@ -1,161 +0,0 @@
|
|
| 1 |
-
const axios = require('axios');
|
| 2 |
-
const { supabase, hydrateMemory } = require('./db');
|
| 3 |
-
|
| 4 |
-
// Base URL prepended to every GitHub REST API path requested by this module.
const GITHUB_API = 'https://api.github.com';

// Default headers sent with every GitHub request.
// The Authorization header is attached only when GITHUB_TOKEN is set:
// interpolating an unset variable would send the literal "token undefined",
// i.e. invalid credentials, instead of making an anonymous request.
const HEADERS = {
  Accept: 'application/vnd.github.v3+json'
};
if (process.env.GITHUB_TOKEN) {
  HEADERS.Authorization = `token ${process.env.GITHUB_TOKEN}`;
}
|
| 9 |
-
|
| 10 |
-
// HELPER: Extract links from Markdown
|
| 11 |
-
/**
 * Extract GitHub repository links of the form
 * `[name](https://github.com/<owner>/<repo>)` from Markdown text, together
 * with the optional description that follows the link on the same line
 * (after an optional "-", "–" or "—" separator).
 *
 * @param {string} text - Markdown source. null/undefined yield an empty list;
 *   any other value is coerced to a string.
 * @returns {Array<{name: string, url: string, owner: string, repo: string, description: string}>}
 *   One entry per matched link, in document order. `description` is "" when
 *   nothing follows the link on its line.
 */
const extractLinksFromMarkdown = (text) => {
  const source = text == null ? '' : String(text);

  // Only horizontal whitespace ([ \t]) is allowed between the link, the
  // separator and the description. Using \s here would cross the newline and
  // swallow the NEXT line as this link's description, which also hides any
  // link on that next line from the scan.
  const regex = /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)[ \t]*(?:-|–|—)?[ \t]*([^\n]*)/g;
  const links = [];
  let match;
  while ((match = regex.exec(source)) !== null) {
    links.push({
      name: match[1],
      url: match[2],
      owner: match[3],
      repo: match[4],
      // trim() drops trailing spaces and a stray "\r" from CRLF files.
      description: match[5].trim()
    });
  }
  return links;
};
|
| 26 |
-
|
| 27 |
-
// PHASE A: Search Topics
|
| 28 |
-
/**
 * PHASE A: query the GitHub repository search API for repositories carrying
 * the `mcp-server` topic and map each hit to a tool record.
 *
 * Fetches at most two pages of 100 results. A failing page is logged and
 * skipped (best effort); the function never rejects because of a request
 * error.
 *
 * @returns {Promise<Array<object>>} Tool records with
 *   `discovery_source: 'github_topic'` and `is_awesome_listed: false`.
 */
async function getTopicTools() {
  const PER_PAGE = 100;
  const MAX_PAGES = 2;
  const tools = [];

  for (let page = 1; page <= MAX_PAGES; page++) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/search/repositories?q=topic:mcp-server&per_page=${PER_PAGE}&page=${page}`,
        { headers: HEADERS }
      );
      // Guard against a response body without an items array.
      const items = data && Array.isArray(data.items) ? data.items : [];

      for (const repo of items) {
        tools.push({
          id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(), // Normalized ID
          name: repo.name,
          description: repo.description,
          url: repo.html_url,
          repo_url: repo.html_url,
          // NOTE(review): assumes the npm package name equals the repo name — unverified.
          install_command: `npx -y ${repo.name}`,
          keywords: repo.topics || [],
          stars: repo.stargazers_count,
          last_updated: repo.pushed_at,
          is_awesome_listed: false,
          discovery_source: 'github_topic'
        });
      }

      // A short page means there is nothing further to fetch.
      if (items.length < PER_PAGE) break;
    } catch (e) {
      // Keep going with the next page, but record which page failed and why.
      console.error(`Topic search error (page ${page}):`, e && e.message ? e.message : e);
    }
  }
  return tools;
}
|
| 53 |
-
|
| 54 |
-
// PHASE B: Awesome Lists
|
| 55 |
-
/**
 * PHASE B: download the README of each curated "awesome" list, extract the
 * GitHub repository links it contains and map each link to a tool record.
 *
 * A list that cannot be fetched or decoded is logged and skipped; the
 * remaining lists are still processed.
 *
 * @returns {Promise<Array<object>>} Tool records with
 *   `discovery_source: 'awesome_list'` and `is_awesome_listed: true`.
 */
async function getAwesomeTools() {
  const sources = [
    { owner: 'wong2', repo: 'awesome-mcp-servers' },
    { owner: 'punkpeye', repo: 'awesome-mcp-servers' }
  ];
  const tools = [];

  for (const s of sources) {
    try {
      const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
      // The readme endpoint response carries the file body base64-encoded in `content`.
      const content = Buffer.from(data.content, 'base64').toString('utf8');

      for (const link of extractLinksFromMarkdown(content)) {
        tools.push({
          id: `gh_${link.owner}_${link.repo}`.toLowerCase(), // Same ID format as Topic Search
          name: link.name,
          description: link.description,
          url: link.url,
          repo_url: link.url,
          // NOTE(review): assumes the npm package name equals the repo name — unverified.
          install_command: `npx -y ${link.repo}`,
          is_awesome_listed: true,
          discovery_source: 'awesome_list'
        });
      }
    } catch (e) {
      // Both sources share the same repo name, so the owner must be part of
      // the message to tell which list failed; include the cause as well.
      console.error(`List ${s.owner}/${s.repo} error:`, e && e.message ? e.message : e);
    }
  }
  return tools;
}
|
| 84 |
-
|
| 85 |
-
// PHASE C: Monorepos
|
| 86 |
-
/**
 * PHASE C: list the sub-directories of known monorepos and emit one tool
 * record per directory.
 *
 * For every monorepo the candidate folders `src`, `servers` and `packages`
 * are probed through the repository contents API. A folder that does not
 * exist (HTTP 404) is skipped silently; any other failure is logged and the
 * scan continues with the next folder.
 *
 * @returns {Promise<Array<object>>} Tool records with
 *   `discovery_source: 'official_monorepo'`.
 */
async function getMonorepoTools() {
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
  const candidatePaths = ['src', 'servers', 'packages'];
  const tools = [];

  for (const m of monorepos) {
    for (const path of candidatePaths) {
      try {
        const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
        // Only an array response is a directory listing; anything else is ignored.
        if (!Array.isArray(data)) continue;

        for (const dir of data) {
          if (dir.type !== 'dir') continue;
          tools.push({
            id: `gh_mono_${m.repo}_${dir.name}`.toLowerCase(),
            name: `${m.repo}-${dir.name}`,
            description: `Official tool: ${dir.name}`,
            url: dir.html_url,
            repo_url: `https://github.com/${m.owner}/${m.repo}`,
            // NOTE(review): assumes packages are published as @modelcontextprotocol/server-<dir> — unverified.
            install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
            is_awesome_listed: true,
            discovery_source: 'official_monorepo'
          });
        }
      } catch (e) {
        // The folder names are guesses, so a missing folder is expected and
        // not worth reporting. Everything else (auth, rate limit, network)
        // must not disappear silently.
        const status = e && e.response ? e.response.status : undefined;
        if (status === 404) continue;
        console.error(`Monorepo ${m.owner}/${m.repo}/${path} error:`, e && e.message ? e.message : e);
      }
    }
  }
  return tools;
}
|
| 112 |
-
|
| 113 |
-
// Combine two records that share an id: `preferred` supplies every field it
// actually has, `other` fills the ones that are missing (undefined, null or
// ""), and the higher star count of the two is kept.
function mergeToolRecords(preferred, other) {
  const merged = Object.assign({}, preferred);
  for (const key of Object.keys(other)) {
    const current = merged[key];
    if (current === undefined || current === null || current === '') {
      merged[key] = other[key];
    }
  }
  if (typeof preferred.stars === 'number' && typeof other.stars === 'number') {
    merged.stars = Math.max(preferred.stars, other.stars);
  }
  return merged;
}

/**
 * Run all three discovery phases, de-duplicate the results by `id`, upsert
 * them into the `mcp_tools` table and finally call `hydrateMemory()`.
 *
 * De-duplication: when the same id is found more than once, the
 * awesome-listed record is preferred over a non-listed one (otherwise the
 * first record seen is preferred), and fields the preferred record lacks —
 * such as stars, keywords or last_updated — are taken from the other record
 * instead of being dropped.
 *
 * Errors (including an upsert error) are caught and logged; the returned
 * promise always resolves.
 *
 * @returns {Promise<void>}
 */
async function runSmartScraper() {
  console.log("🚀 Aggressive Scraper Started...");
  try {
    // Phases run one after another, as in the original flow.
    const topicTools = await getTopicTools();
    const awesomeTools = await getAwesomeTools();
    const monoTools = await getMonorepoTools();

    const allResults = [...topicTools, ...awesomeTools, ...monoTools];

    // Deduplicate by id, merging duplicates instead of discarding one side.
    const finalMap = new Map();
    for (const item of allResults) {
      const existing = finalMap.get(item.id);
      if (!existing) {
        finalMap.set(item.id, item);
        continue;
      }
      const incomingPreferred = Boolean(item.is_awesome_listed) && !existing.is_awesome_listed;
      finalMap.set(
        item.id,
        incomingPreferred ? mergeToolRecords(item, existing) : mergeToolRecords(existing, item)
      );
    }

    const finalTools = Array.from(finalMap.values());

    console.log(`📊 Preparing to upsert ${finalTools.length} unique tools...`);

    const { error } = await supabase
      .from('mcp_tools')
      .upsert(finalTools, { onConflict: 'id' });

    if (error) throw error;

    console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
    await hydrateMemory();
  } catch (err) {
    console.error("❌ Scraper Error:", err.message);
  }
}
|
| 160 |
-
|
| 161 |
-
module.exports = { runSmartScraper };
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|