// agentq_demo / scraper.js
// Pepguy's picture
// Update scraper.js
// d1fe88e verified
const axios = require('axios');
const { supabase, hydrateMemory } = require('./db');
// Base URL for every GitHub REST call made by this module.
const GITHUB_API = 'https://api.github.com';

// Default request headers shared by all GitHub calls.
// Authorization is only attached when GITHUB_TOKEN is set; otherwise the
// header would be sent as the literal string "token undefined".
const HEADERS = {
  Accept: 'application/vnd.github.v3+json'
};
if (process.env.GITHUB_TOKEN) {
  HEADERS.Authorization = `token ${process.env.GITHUB_TOKEN}`;
}
// HELPER: Extract links from Markdown
/**
 * Finds Markdown links of the form `[name](https://github.com/owner/repo)`
 * and returns one record per link. Links that point deeper than
 * `owner/repo` (extra path segments) do not match and are skipped.
 *
 * An optional description is taken from the remainder of the SAME line,
 * after an optional `-`, `—` or `–` separator.
 *
 * @param {string} text - Markdown source to scan.
 * @returns {Array<{name: string, url: string, owner: string, repo: string, description: string}>}
 *   Links in document order; `description` is "" when the line has none.
 */
const extractLinksFromMarkdown = (text) => {
  // Only horizontal whitespace ([ \t]) is allowed between the link and its
  // description. Using \s here would consume the newline, so a link with no
  // description would capture the following line as its description and
  // the link on that following line would never be matched.
  const regex = /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)[ \t]*[-—–]?[ \t]*([^\r\n]+)?/g;
  const links = [];
  let match;
  while ((match = regex.exec(text)) !== null) {
    links.push({
      name: match[1],
      url: match[2],
      owner: match[3],
      repo: match[4],
      // Trim so trailing spaces never leak into the stored description.
      description: (match[5] || "").trim()
    });
  }
  return links;
};
// PHASE A: Search Topics
/**
 * Queries the GitHub repository search for `topic:mcp-server` and maps each
 * hit to a tool record. Reads at most two pages of 100 results.
 *
 * A failed page is logged and skipped (best effort); the function never
 * throws for request errors.
 *
 * @returns {Promise<Array<object>>} Tool records with
 *   `discovery_source: 'github_topic'` and `is_awesome_listed: false`.
 */
async function getTopicTools() {
  const PER_PAGE = 100;
  const MAX_PAGES = 2;
  const tools = [];
  for (let page = 1; page <= MAX_PAGES; page++) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/search/repositories?q=topic:mcp-server&per_page=${PER_PAGE}&page=${page}`,
        { headers: HEADERS }
      );
      const items = Array.isArray(data.items) ? data.items : [];
      tools.push(...items.map(repo => ({
        id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(), // Normalized ID
        name: repo.name,
        description: repo.description,
        url: repo.html_url,
        repo_url: repo.html_url,
        install_command: `npx -y ${repo.name}`,
        keywords: repo.topics || [],
        stars: repo.stargazers_count,
        last_updated: repo.pushed_at,
        is_awesome_listed: false,
        discovery_source: 'github_topic'
      })));
      // A short page means there is nothing left to fetch; skip the
      // remaining request instead of spending rate limit on an empty page.
      if (items.length < PER_PAGE) break;
    } catch (e) {
      // Keep going with the next page, but say which page failed and why.
      console.error(`Topic search error (page ${page}):`, e.message);
    }
  }
  return tools;
}
// PHASE B: Awesome Lists
/**
 * Downloads the README of each configured "awesome" list, extracts the
 * GitHub repository links from it and maps each link to a tool record.
 *
 * A list that cannot be fetched or decoded is logged and skipped; the
 * remaining lists are still processed.
 *
 * @returns {Promise<Array<object>>} Tool records with
 *   `discovery_source: 'awesome_list'` and `is_awesome_listed: true`.
 */
async function getAwesomeTools() {
  const sources = [
    { owner: 'wong2', repo: 'awesome-mcp-servers' },
    { owner: 'punkpeye', repo: 'awesome-mcp-servers' }
  ];
  const tools = [];
  for (const s of sources) {
    try {
      const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
      // The readme endpoint response carries the file body base64-encoded.
      const content = Buffer.from(data.content, 'base64').toString();
      for (const link of extractLinksFromMarkdown(content)) {
        tools.push({
          id: `gh_${link.owner}_${link.repo}`.toLowerCase(), // Same ID format as Topic Search
          name: link.name,
          description: link.description,
          url: link.url,
          repo_url: link.url,
          install_command: `npx -y ${link.repo}`,
          is_awesome_listed: true,
          discovery_source: 'awesome_list'
        });
      }
    } catch (e) {
      // Both sources share the same repo name, so the owner is needed to
      // tell which list actually failed.
      console.error(`List ${s.owner}/${s.repo} error:`, e.message);
    }
  }
  return tools;
}
// PHASE C: Monorepos
/**
 * Lists the sub-directories of a few candidate folders (`src`, `servers`,
 * `packages`) in each configured monorepo and emits one tool record per
 * directory found.
 *
 * @returns {Promise<Array<object>>} Tool records with
 *   `discovery_source: 'official_monorepo'` and `is_awesome_listed: true`.
 */
async function getMonorepoTools() {
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
  const candidatePaths = ['src', 'servers', 'packages'];
  const tools = [];
  for (const m of monorepos) {
    for (const path of candidatePaths) {
      try {
        const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
        // The contents endpoint returns an array only when `path` is a
        // directory; anything else is ignored.
        if (!Array.isArray(data)) continue;
        for (const dir of data) {
          if (dir.type !== 'dir') continue;
          tools.push({
            id: `gh_mono_${m.repo}_${dir.name}`.toLowerCase(),
            name: `${m.repo}-${dir.name}`,
            description: `Official tool: ${dir.name}`,
            url: dir.html_url,
            repo_url: `https://github.com/${m.owner}/${m.repo}`,
            install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
            is_awesome_listed: true,
            discovery_source: 'official_monorepo'
          });
        }
      } catch (e) {
        // The candidate paths are probes, so a 404 presumably just means the
        // folder does not exist in this repo and stays silent. Any other
        // failure (auth, rate limit, network) is reported instead of hidden.
        const status = e && e.response ? e.response.status : undefined;
        if (status !== 404) {
          console.error(`Monorepo ${m.owner}/${m.repo} path "${path}" error:`, e && e.message);
        }
      }
    }
  }
  return tools;
}
/**
 * Merges two records that share the same id into a new object.
 * The awesome-listed record wins for conflicting fields; fields it lacks
 * (undefined, null or "") are filled from the other record, and `stars`
 * becomes the larger of the two counts. Neither input is mutated.
 *
 * @param {object} existing - Record already stored for this id.
 * @param {object} incoming - Newly seen record with the same id.
 * @returns {object} The merged record.
 */
function mergeToolRecords(existing, incoming) {
  const preferIncoming = Boolean(incoming.is_awesome_listed) && !existing.is_awesome_listed;
  const primary = preferIncoming ? incoming : existing;
  const secondary = preferIncoming ? existing : incoming;
  const merged = Object.assign({}, primary);
  Object.keys(secondary).forEach((key) => {
    const current = merged[key];
    if (current === undefined || current === null || current === '') {
      merged[key] = secondary[key];
    }
  });
  if (typeof existing.stars === 'number' || typeof incoming.stars === 'number') {
    merged.stars = Math.max(existing.stars || 0, incoming.stars || 0);
  }
  return merged;
}

/**
 * Collapses a list of tool records to one record per `id`, in first-seen order.
 *
 * @param {Array<object>} records - Tool records, possibly with repeated ids.
 * @returns {Array<object>} One merged record per distinct id.
 */
function dedupeToolsById(records) {
  const byId = new Map();
  for (const record of records) {
    const existing = byId.get(record.id);
    byId.set(record.id, existing ? mergeToolRecords(existing, record) : record);
  }
  return Array.from(byId.values());
}

/**
 * Runs the three discovery phases in order (topic search, awesome lists,
 * monorepos), de-duplicates the combined results by id, upserts them into
 * the `mcp_tools` table keyed on `id`, then awaits `hydrateMemory()`.
 *
 * Any error, including one returned by the upsert, is caught and logged;
 * the returned promise does not reject.
 *
 * @returns {Promise<void>}
 */
async function runSmartScraper() {
  console.log("🚀 Aggressive Scraper Started...");
  try {
    const topicTools = await getTopicTools();
    const awesomeTools = await getAwesomeTools();
    const monoTools = await getMonorepoTools();
    const allResults = [...topicTools, ...awesomeTools, ...monoTools];
    // Merge rather than replace: an awesome-list record has no stars,
    // keywords or last_updated, so replacing the topic-search record with
    // it would drop that metadata from the row being upserted.
    const finalTools = dedupeToolsById(allResults);
    console.log(`📊 Preparing to upsert ${finalTools.length} unique tools...`);
    const { error } = await supabase
      .from('mcp_tools')
      .upsert(finalTools, { onConflict: 'id' });
    if (error) throw error;
    console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
    await hydrateMemory();
  } catch (err) {
    console.error("❌ Scraper Error:", err.message);
  }
}
module.exports = { runSmartScraper };