Spaces:
Running
Running
Create src/scraper.js
Browse files- src/scraper.js +107 -0
src/scraper.js
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
const axios = require('axios');
|
| 2 |
+
const { db, hydrateMemory, TOOLS_COLLECTION } = require('./db');
|
| 3 |
+
|
| 4 |
+
const GITHUB_API = 'https://api.github.com';

// Authenticated requests get 5,000 req/h vs 60 unauthenticated. Only attach the
// Authorization header when a token is actually configured — otherwise the old
// template literal sent the literal string "token undefined", which makes
// GitHub reject the request with 401 instead of serving it unauthenticated.
const HEADERS = process.env.GITHUB_TOKEN
  ? { Authorization: `token ${process.env.GITHUB_TOKEN}` }
  : {};
|
| 6 |
+
|
| 7 |
+
// Helper: Split array into chunks (Firestore limits batches to 500)
// chunkArray([1,2,3,4,5], 2) -> [[1,2],[3,4],[5]]; an empty array yields [].
const chunkArray = (arr, size) => {
  const chunks = [];
  for (let start = 0; start < arr.length; start += size) {
    chunks.push(arr.slice(start, start + size));
  }
  return chunks;
};
|
| 12 |
+
|
| 13 |
+
// 1. The Deep Link Scanner (Monorepo aware)
//
// Heuristic MVP scanner: treats the repo root as a single tool entry.
// Always returns (a promise of) an array so callers can transparently handle
// multi-tool monorepos once subfolder scanning is implemented.
async function scanRepoContents(repo) {
  // Root-level tool record derived from the raw GitHub search payload.
  const rootTool = {
    id: `gh_${repo.id}`,
    name: repo.name,
    description: repo.description || "No description provided.",
    url: repo.html_url,
    repo_url: repo.html_url,
    install_command: `npx -y ${repo.name}`, // Placeholder prediction
    keywords: repo.topics || [],
    stats: {
      stars: repo.stargazers_count,
      forks: repo.forks_count,
      last_updated: repo.pushed_at
    },
    // Fresh entries start with neutral reputation/analytics counters.
    reputation: { score: 0, verified: false, is_malicious: false },
    analytics: { views: 0, installs: 0, success_rate: 0 }
  };

  return [rootTool];
}
|
| 42 |
+
|
| 43 |
+
// 2. The Main Smart Scraper
//
// One scrape cycle: search GitHub for repos tagged `mcp-server`, skip repos
// whose `pushed_at` matches the cached timestamp (incremental update), extract
// tool records from the rest, batch-write them to Firestore (chunked, since
// Firestore caps a batch at 500 ops), then re-hydrate the in-memory cache.
//
// Errors are logged and swallowed on purpose: this is a best-effort background
// cycle and the scheduler is expected to retry on the next tick.
async function runSmartScraper() {
  console.log("🕷️ Starting Smart Scrape Cycle...");

  try {
    // A. Fetch candidates from GitHub (standard topic tag).
    // Built with URL/searchParams instead of string concatenation so the query
    // value is properly encoded.
    // NOTE(review): only the first page (100 results) is fetched — no pagination.
    const searchUrl = new URL(`${GITHUB_API}/search/repositories`);
    searchUrl.searchParams.set('q', 'topic:mcp-server');
    searchUrl.searchParams.set('sort', 'updated');
    searchUrl.searchParams.set('per_page', '100');
    const { data } = await axios.get(searchUrl.toString(), { headers: HEADERS });

    const batchOps = [];
    const knownIds = new Set(); // To prevent duplicate writes within one cycle

    // Get current DB state to check timestamps (Optimization: avoids rewriting
    // repos that have not been pushed to since the last cycle).
    const existingDocs = await db.collection(TOOLS_COLLECTION).select('stats.last_updated').get();
    const existingMap = new Map();
    // Optional chaining guards against docs that lack a `stats` object (writes
    // use { merge: true }, so partially-shaped docs can exist) — the previous
    // unguarded access crashed the whole cycle on one malformed doc.
    existingDocs.forEach(doc => existingMap.set(doc.id, doc.data()?.stats?.last_updated));

    // B. Process Repos
    for (const repo of data.items) {
      // 1. Incremental update: skip when the remote push timestamp is unchanged.
      const cachedTime = existingMap.get(`gh_${repo.id}`);
      const remoteTime = repo.pushed_at;

      if (cachedTime && cachedTime === remoteTime) {
        continue; // No changes since last scrape
      }

      // 2. Extract Tools (Deep Linking Logic)
      const extractedTools = await scanRepoContents(repo);

      extractedTools.forEach(tool => {
        if (!knownIds.has(tool.id)) {
          // Prepare Firestore Write
          const docRef = db.collection(TOOLS_COLLECTION).doc(tool.id);
          batchOps.push({ ref: docRef, data: tool });
          knownIds.add(tool.id);
        }
      });
    }

    // C. Batch Write to Disk (400 per batch leaves headroom under the 500 cap)
    if (batchOps.length > 0) {
      console.log(`💾 Writing ${batchOps.length} updates to Firestore...`);
      const chunks = chunkArray(batchOps, 400); // Safety limit

      for (const chunk of chunks) {
        const batch = db.batch();
        chunk.forEach(op => batch.set(op.ref, op.data, { merge: true }));
        await batch.commit();
      }

      // D. Update RAM so reads immediately see the new data
      await hydrateMemory();
      console.log("✅ Scrape & Hydrate Complete.");
    } else {
      console.log("💤 No updates needed.");
    }

  } catch (err) {
    // Deliberate log-and-swallow: a failed cycle must not crash the process.
    console.error("❌ Scraper Failed:", err.message);
  }
}

module.exports = { runSmartScraper };
|