Pepguy committed on
Commit
32304db
·
verified ·
1 Parent(s): a133830

Create src/scraper.js

Browse files
Files changed (1) hide show
  1. src/scraper.js +107 -0
src/scraper.js ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const axios = require('axios');
2
+ const { db, hydrateMemory, TOOLS_COLLECTION } = require('./db');
3
+
4
// Base URL for all GitHub REST API requests made by this module.
const GITHUB_API = 'https://api.github.com';

// Request headers sent with every GitHub call.
// The Authorization header is only included when GITHUB_TOKEN is configured:
// interpolating a missing token would send the literal string "token undefined",
// which is an invalid credential rather than an anonymous request.
const HEADERS = process.env.GITHUB_TOKEN
  ? { Authorization: `token ${process.env.GITHUB_TOKEN}` }
  : {};
6
+
7
+ // Helper: Split array into chunks (Firestore limits batches to 500)
8
/**
 * Splits an array into consecutive chunks of at most `size` elements.
 * The input array is not modified; the last chunk may be shorter.
 *
 * @param {Array} arr - Items to split.
 * @param {number} size - Maximum chunk length; must be a positive integer.
 * @returns {Array<Array>} The chunks in original order (empty when `arr` is empty).
 * @throws {RangeError} If `size` is not a positive integer. Without this guard a
 *   negative size would silently yield no chunks at all (dropping every item)
 *   and a fractional size would produce mis-sliced chunks.
 */
const chunkArray = (arr, size) => {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunkArray: size must be a positive integer, got ${String(size)}`);
  }

  return Array.from({ length: Math.ceil(arr.length / size) }, (_, i) =>
    arr.slice(i * size, (i + 1) * size)
  );
};
12
+
13
+ // 1. The Deep Link Scanner (Monorepo aware)
14
/**
 * Converts a GitHub repository object into the list of tool records to index.
 *
 * Currently a simplified heuristic: the repository root is always treated as a
 * single tool, so the returned array has exactly one entry. No subfolder
 * (monorepo) inspection is performed yet.
 *
 * @param {object} repo - Repository object; the fields read are `id`, `name`,
 *   `description`, `html_url`, `topics`, `stargazers_count`, `forks_count`
 *   and `pushed_at`.
 * @returns {Promise<object[]>} One tool record keyed as `gh_<repo.id>`.
 */
async function scanRepoContents(repo) {
  const {
    id: repoId,
    name: repoName,
    description: summary,
    html_url: pageUrl,
    topics: tags,
    stargazers_count: starCount,
    forks_count: forkCount,
    pushed_at: lastPush,
  } = repo;

  // The root repository itself is the (only) tool for now.
  const rootTool = {
    id: `gh_${repoId}`,
    name: repoName,
    description: summary || "No description provided.",
    url: pageUrl,
    repo_url: pageUrl,
    // Placeholder guess: assumes the package name equals the repo name.
    install_command: `npx -y ${repoName}`,
    keywords: tags || [],
    stats: {
      stars: starCount,
      forks: forkCount,
      last_updated: lastPush,
    },
    // Zeroed starting values for reputation and analytics.
    reputation: { score: 0, verified: false, is_malicious: false },
    analytics: { views: 0, installs: 0, success_rate: 0 },
  };

  return [rootTool];
}
42
+
43
+ // 2. The Main Smart Scraper
44
/**
 * Runs one scrape cycle:
 *   A. queries the GitHub search API for repositories tagged `mcp-server`
 *      (first page only, up to 100 results, most recently updated first);
 *   B. skips repositories whose `pushed_at` matches the stored
 *      `stats.last_updated`, and turns the rest into tool records via
 *      `scanRepoContents`;
 *   C. writes the records to Firestore in merge-mode batches;
 *   D. calls `hydrateMemory()` after a successful write.
 *
 * Best-effort: any failure is logged and swallowed, so the returned promise
 * never rejects.
 *
 * @returns {Promise<void>}
 */
async function runSmartScraper() {
  console.log("🕷️ Starting Smart Scrape Cycle...");

  try {
    // A. Fetch candidates from GitHub (searching for the standard topic tag).
    const { data } = await axios.get(
      `${GITHUB_API}/search/repositories?q=topic:mcp-server&sort=updated&per_page=100`,
      { headers: HEADERS }
    );
    // Treat a response without an `items` array as "no candidates".
    const repos = data && Array.isArray(data.items) ? data.items : [];

    // Snapshot of stored docs: id -> last known push time. Presence in this
    // map also tells us whether a tool already exists in the collection.
    const existingDocs = await db.collection(TOOLS_COLLECTION).select('stats.last_updated').get();
    const existingMap = new Map();
    existingDocs.forEach((doc) => {
      // A stored doc may lack `stats`; record it as existing with no timestamp
      // instead of throwing and aborting the whole cycle.
      const stats = doc.data().stats;
      existingMap.set(doc.id, stats ? stats.last_updated : undefined);
    });

    const batchOps = [];
    const knownIds = new Set(); // Prevents queueing the same tool twice.

    // B. Process repos
    for (const repo of repos) {
      // 1. Incremental update: skip repos that have not been pushed since the last scrape.
      const cachedTime = existingMap.get(`gh_${repo.id}`);
      if (cachedTime && cachedTime === repo.pushed_at) {
        continue;
      }

      // 2. Extract tools (deep linking logic)
      const extractedTools = await scanRepoContents(repo);

      for (const tool of extractedTools) {
        if (knownIds.has(tool.id)) {
          continue;
        }
        knownIds.add(tool.id);

        // `reputation` and `analytics` are zeroed seed values. Merge-writing them
        // onto an existing doc would reset accumulated counters and flags such as
        // `verified` / `is_malicious`, so they are only sent for brand-new docs.
        const { reputation, analytics, ...refreshable } = tool;
        const payload = existingMap.has(tool.id) ? refreshable : tool;

        batchOps.push({
          ref: db.collection(TOOLS_COLLECTION).doc(tool.id),
          data: payload,
        });
      }
    }

    // C. Batch write
    if (batchOps.length === 0) {
      console.log("💤 No updates needed.");
      return;
    }

    console.log(`💾 Writing ${batchOps.length} updates to Firestore...`);
    // 400 per batch keeps a safety margin under the 500-operation batch limit.
    for (const chunk of chunkArray(batchOps, 400)) {
      const batch = db.batch();
      chunk.forEach((op) => batch.set(op.ref, op.data, { merge: true }));
      await batch.commit();
    }

    // D. Update RAM
    await hydrateMemory();
    console.log("✅ Scrape & Hydrate Complete.");
  } catch (err) {
    // Deliberate best-effort: report the failure and let the caller carry on.
    console.error("❌ Scraper Failed:", err.message);
  }
}
106
+
107
+ module.exports = { runSmartScraper };