Spaces:
Sleeping
Sleeping
Update scraper.js
Browse files- scraper.js +46 -39
scraper.js
CHANGED
|
@@ -7,20 +7,25 @@ const HEADERS = {
|
|
| 7 |
Accept: 'application/vnd.github.v3+json'
|
| 8 |
};
|
| 9 |
|
| 10 |
-
// HELPER: Extract links from Markdown
|
| 11 |
const extractLinksFromMarkdown = (text) => {
|
| 12 |
-
const regex = /\[([^\]]+)\]\((https:\/\/github\.com\/[^)]+)\)\s*(?:-|β)?\s*([^\n]+)?/g;
|
| 13 |
const links = [];
|
| 14 |
let match;
|
| 15 |
while ((match = regex.exec(text)) !== null) {
|
| 16 |
-
links.push({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
}
|
| 18 |
return links;
|
| 19 |
};
|
| 20 |
|
| 21 |
-
// PHASE A:
|
| 22 |
async function getTopicTools() {
|
| 23 |
-
console.log("π Searching GitHub Topics...");
|
| 24 |
let tools = [];
|
| 25 |
for (let page = 1; page <= 2; page++) {
|
| 26 |
try {
|
|
@@ -29,7 +34,7 @@ async function getTopicTools() {
|
|
| 29 |
{ headers: HEADERS }
|
| 30 |
);
|
| 31 |
tools.push(...data.items.map(repo => ({
|
| 32 |
-
id: `gh_${repo.
|
| 33 |
name: repo.name,
|
| 34 |
description: repo.description,
|
| 35 |
url: repo.html_url,
|
|
@@ -38,15 +43,15 @@ async function getTopicTools() {
|
|
| 38 |
keywords: repo.topics || [],
|
| 39 |
stars: repo.stargazers_count,
|
| 40 |
last_updated: repo.pushed_at,
|
| 41 |
-
is_awesome_listed: false,
|
| 42 |
discovery_source: 'github_topic'
|
| 43 |
})));
|
| 44 |
-
} catch (e) { console.error(
|
| 45 |
}
|
| 46 |
return tools;
|
| 47 |
}
|
| 48 |
|
| 49 |
-
// PHASE B:
|
| 50 |
async function getAwesomeTools() {
|
| 51 |
const sources = [
|
| 52 |
{ owner: 'wong2', repo: 'awesome-mcp-servers' },
|
|
@@ -56,39 +61,33 @@ async function getAwesomeTools() {
|
|
| 56 |
|
| 57 |
for (const s of sources) {
|
| 58 |
try {
|
| 59 |
-
console.log(`π Scraping Awesome List: ${s.owner}/${s.repo}`);
|
| 60 |
const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
|
| 61 |
const content = Buffer.from(data.content, 'base64').toString();
|
| 62 |
const links = extractLinksFromMarkdown(content);
|
| 63 |
|
| 64 |
links.forEach(link => {
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
discovery_source: 'awesome_list'
|
| 76 |
-
});
|
| 77 |
-
}
|
| 78 |
});
|
| 79 |
-
} catch (e) { console.error(`
|
| 80 |
}
|
| 81 |
return tools;
|
| 82 |
}
|
| 83 |
|
| 84 |
-
// PHASE C:
|
| 85 |
async function getMonorepoTools() {
|
| 86 |
const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
|
| 87 |
let tools = [];
|
| 88 |
-
|
| 89 |
for (const m of monorepos) {
|
| 90 |
-
const
|
| 91 |
-
for (const path of paths) {
|
| 92 |
try {
|
| 93 |
const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
|
| 94 |
if (Array.isArray(data)) {
|
|
@@ -100,7 +99,7 @@ async function getMonorepoTools() {
|
|
| 100 |
url: dir.html_url,
|
| 101 |
repo_url: `https://github.com/${m.owner}/${m.repo}`,
|
| 102 |
install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
|
| 103 |
-
is_awesome_listed: true,
|
| 104 |
discovery_source: 'official_monorepo'
|
| 105 |
});
|
| 106 |
});
|
|
@@ -111,30 +110,40 @@ async function getMonorepoTools() {
|
|
| 111 |
return tools;
|
| 112 |
}
|
| 113 |
|
| 114 |
-
// MAIN EXECUTION
|
| 115 |
async function runSmartScraper() {
|
| 116 |
console.log("π Aggressive Scraper Started...");
|
| 117 |
-
|
| 118 |
try {
|
| 119 |
const topicTools = await getTopicTools();
|
| 120 |
const awesomeTools = await getAwesomeTools();
|
| 121 |
const monoTools = await getMonorepoTools();
|
| 122 |
|
| 123 |
-
// Combine all results
|
| 124 |
const allResults = [...topicTools, ...awesomeTools, ...monoTools];
|
| 125 |
|
| 126 |
-
//
|
| 127 |
const finalMap = new Map();
|
|
|
|
| 128 |
allResults.forEach(item => {
|
| 129 |
-
const existing = finalMap.get(item.
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
}
|
| 133 |
});
|
| 134 |
|
| 135 |
const finalTools = Array.from(finalMap.values());
|
| 136 |
|
| 137 |
-
|
|
|
|
| 138 |
const { error } = await supabase
|
| 139 |
.from('mcp_tools')
|
| 140 |
.upsert(finalTools, { onConflict: 'id' });
|
|
@@ -142,8 +151,6 @@ async function runSmartScraper() {
|
|
| 142 |
if (error) throw error;
|
| 143 |
|
| 144 |
console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
|
| 145 |
-
console.log(`✨ Awesome tools found: ${finalTools.filter(t => t.is_awesome_listed).length}`);
|
| 146 |
-
|
| 147 |
await hydrateMemory();
|
| 148 |
|
| 149 |
} catch (err) {
|
|
|
|
| 7 |
Accept: 'application/vnd.github.v3+json'
|
| 8 |
};
|
| 9 |
|
| 10 |
+
// HELPER: Extract links from Markdown
|
| 11 |
const extractLinksFromMarkdown = (text) => {
|
| 12 |
+
const regex = /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)\s*(?:-|β)?\s*([^\n]+)?/g;
|
| 13 |
const links = [];
|
| 14 |
let match;
|
| 15 |
while ((match = regex.exec(text)) !== null) {
|
| 16 |
+
links.push({
|
| 17 |
+
name: match[1],
|
| 18 |
+
url: match[2],
|
| 19 |
+
owner: match[3],
|
| 20 |
+
repo: match[4],
|
| 21 |
+
description: match[5] || ""
|
| 22 |
+
});
|
| 23 |
}
|
| 24 |
return links;
|
| 25 |
};
|
| 26 |
|
| 27 |
+
// PHASE A: Search Topics
|
| 28 |
async function getTopicTools() {
|
|
|
|
| 29 |
let tools = [];
|
| 30 |
for (let page = 1; page <= 2; page++) {
|
| 31 |
try {
|
|
|
|
| 34 |
{ headers: HEADERS }
|
| 35 |
);
|
| 36 |
tools.push(...data.items.map(repo => ({
|
| 37 |
+
id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(), // Normalized ID
|
| 38 |
name: repo.name,
|
| 39 |
description: repo.description,
|
| 40 |
url: repo.html_url,
|
|
|
|
| 43 |
keywords: repo.topics || [],
|
| 44 |
stars: repo.stargazers_count,
|
| 45 |
last_updated: repo.pushed_at,
|
| 46 |
+
is_awesome_listed: false,
|
| 47 |
discovery_source: 'github_topic'
|
| 48 |
})));
|
| 49 |
+
} catch (e) { console.error("Topic search error"); }
|
| 50 |
}
|
| 51 |
return tools;
|
| 52 |
}
|
| 53 |
|
| 54 |
+
// PHASE B: Awesome Lists
|
| 55 |
async function getAwesomeTools() {
|
| 56 |
const sources = [
|
| 57 |
{ owner: 'wong2', repo: 'awesome-mcp-servers' },
|
|
|
|
| 61 |
|
| 62 |
for (const s of sources) {
|
| 63 |
try {
|
|
|
|
| 64 |
const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
|
| 65 |
const content = Buffer.from(data.content, 'base64').toString();
|
| 66 |
const links = extractLinksFromMarkdown(content);
|
| 67 |
|
| 68 |
links.forEach(link => {
|
| 69 |
+
tools.push({
|
| 70 |
+
id: `gh_${link.owner}_${link.repo}`.toLowerCase(), // Same ID format as Topic Search
|
| 71 |
+
name: link.name,
|
| 72 |
+
description: link.description,
|
| 73 |
+
url: link.url,
|
| 74 |
+
repo_url: link.url,
|
| 75 |
+
install_command: `npx -y ${link.repo}`,
|
| 76 |
+
is_awesome_listed: true,
|
| 77 |
+
discovery_source: 'awesome_list'
|
| 78 |
+
});
|
|
|
|
|
|
|
|
|
|
| 79 |
});
|
| 80 |
+
} catch (e) { console.error(`List ${s.repo} error`); }
|
| 81 |
}
|
| 82 |
return tools;
|
| 83 |
}
|
| 84 |
|
| 85 |
+
// PHASE C: Monorepos
|
| 86 |
async function getMonorepoTools() {
|
| 87 |
const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
|
| 88 |
let tools = [];
|
|
|
|
| 89 |
for (const m of monorepos) {
|
| 90 |
+
for (const path of ['src', 'servers', 'packages']) {
|
|
|
|
| 91 |
try {
|
| 92 |
const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
|
| 93 |
if (Array.isArray(data)) {
|
|
|
|
| 99 |
url: dir.html_url,
|
| 100 |
repo_url: `https://github.com/${m.owner}/${m.repo}`,
|
| 101 |
install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
|
| 102 |
+
is_awesome_listed: true,
|
| 103 |
discovery_source: 'official_monorepo'
|
| 104 |
});
|
| 105 |
});
|
|
|
|
| 110 |
return tools;
|
| 111 |
}
|
| 112 |
|
|
|
|
| 113 |
async function runSmartScraper() {
|
| 114 |
console.log("π Aggressive Scraper Started...");
|
|
|
|
| 115 |
try {
|
| 116 |
const topicTools = await getTopicTools();
|
| 117 |
const awesomeTools = await getAwesomeTools();
|
| 118 |
const monoTools = await getMonorepoTools();
|
| 119 |
|
|
|
|
| 120 |
const allResults = [...topicTools, ...awesomeTools, ...monoTools];
|
| 121 |
|
| 122 |
+
// --- THE FIX: DEDUPLICATE BY ID ---
|
| 123 |
const finalMap = new Map();
|
| 124 |
+
|
| 125 |
allResults.forEach(item => {
|
| 126 |
+
const existing = finalMap.get(item.id);
|
| 127 |
+
|
| 128 |
+
if (!existing) {
|
| 129 |
+
finalMap.set(item.id, item);
|
| 130 |
+
} else {
|
| 131 |
+
// If we find the same ID again, prefer the one that is Awesome Listed
|
| 132 |
+
if (item.is_awesome_listed && !existing.is_awesome_listed) {
|
| 133 |
+
finalMap.set(item.id, item);
|
| 134 |
+
}
|
| 135 |
+
// Also update stars if the new object has them
|
| 136 |
+
if (item.stars > (existing.stars || 0)) {
|
| 137 |
+
existing.stars = item.stars;
|
| 138 |
+
finalMap.set(item.id, existing);
|
| 139 |
+
}
|
| 140 |
}
|
| 141 |
});
|
| 142 |
|
| 143 |
const finalTools = Array.from(finalMap.values());
|
| 144 |
|
| 145 |
+
console.log(`π Preparing to upsert ${finalTools.length} unique tools...`);
|
| 146 |
+
|
| 147 |
const { error } = await supabase
|
| 148 |
.from('mcp_tools')
|
| 149 |
.upsert(finalTools, { onConflict: 'id' });
|
|
|
|
| 151 |
if (error) throw error;
|
| 152 |
|
| 153 |
console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
|
|
|
|
|
|
|
| 154 |
await hydrateMemory();
|
| 155 |
|
| 156 |
} catch (err) {
|