Pepguy committed on
Commit
ba92074
·
verified ·
1 Parent(s): f2ccb20

Delete scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +0 -161
scraper.js DELETED
@@ -1,161 +0,0 @@
1
- const axios = require('axios');
2
- const { supabase, hydrateMemory } = require('./db');
3
-
4
// Base URL of the GitHub REST API; every request in this module is built on it.
const GITHUB_API = 'https://api.github.com';

// Default headers sent with every GitHub request.
// The Authorization header is only added when GITHUB_TOKEN is set: interpolating
// an unset variable would otherwise send the literal string "token undefined".
const HEADERS = {
  Accept: 'application/vnd.github.v3+json'
};
if (process.env.GITHUB_TOKEN) {
  HEADERS.Authorization = `token ${process.env.GITHUB_TOKEN}`;
}
9
-
10
/**
 * HELPER: Extract links from Markdown.
 *
 * Finds Markdown links of the form `[name](https://github.com/<owner>/<repo>)`
 * and returns one entry per link. Text following the link on the SAME line
 * (after an optional `-`, `–` or `—` separator) is used as the description.
 *
 * @param {string} text - Markdown source to scan.
 * @returns {Array<{name: string, url: string, owner: string, repo: string, description: string}>}
 *   The links in document order; an empty array when nothing matches or when
 *   `text` is not a non-empty string.
 */
const extractLinksFromMarkdown = (text) => {
  if (typeof text !== 'string' || text.length === 0) {
    return [];
  }

  // Only spaces/tabs are allowed around the separator. Using `\s` here would let
  // the pattern run past the end of the line, so a link without a description
  // would swallow the next line (and hide any link on it).
  const linkPattern =
    /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)[ \t]*(?:-|–|—)?[ \t]*([^\r\n]*)/g;

  const links = [];
  for (let found = linkPattern.exec(text); found !== null; found = linkPattern.exec(text)) {
    links.push({
      name: found[1],
      url: found[2],
      owner: found[3],
      repo: found[4],
      description: (found[5] || "").trim()
    });
  }
  return links;
};
26
-
27
/**
 * PHASE A: Search Topics.
 *
 * Queries the GitHub repository search endpoint for `topic:mcp-server`
 * (at most two pages of 100 results) and maps every hit to a tool record.
 *
 * A page that fails is logged with its error message and skipped; the function
 * itself never rejects because of a request failure.
 *
 * @returns {Promise<Array<Object>>} Tool records with `discovery_source`
 *   set to `'github_topic'` and `is_awesome_listed` set to `false`.
 */
async function getTopicTools() {
  const PER_PAGE = 100;
  const MAX_PAGES = 2;
  const tools = [];

  for (let page = 1; page <= MAX_PAGES; page++) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/search/repositories?q=topic:mcp-server&per_page=${PER_PAGE}&page=${page}`,
        { headers: HEADERS }
      );
      // Guard against a response body without an `items` array.
      const items = data && Array.isArray(data.items) ? data.items : [];

      for (const repo of items) {
        tools.push({
          id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(), // Normalized ID
          name: repo.name,
          description: repo.description,
          url: repo.html_url,
          repo_url: repo.html_url,
          install_command: `npx -y ${repo.name}`,
          keywords: repo.topics || [],
          stars: repo.stargazers_count,
          last_updated: repo.pushed_at,
          is_awesome_listed: false,
          discovery_source: 'github_topic'
        });
      }

      // A short page means there is nothing further to fetch; skip the extra request.
      if (items.length < PER_PAGE) {
        break;
      }
    } catch (e) {
      console.error("Topic search error:", e && e.message);
    }
  }
  return tools;
}
53
-
54
/**
 * PHASE B: Awesome Lists.
 *
 * Downloads the README of each configured "awesome" repository through the
 * GitHub API, decodes its base64 content and turns every GitHub link found by
 * `extractLinksFromMarkdown` into a tool record.
 *
 * A list that cannot be fetched or parsed is logged (with owner, repo and the
 * error message) and skipped; the remaining lists are still processed.
 *
 * @returns {Promise<Array<Object>>} Tool records with `is_awesome_listed`
 *   set to `true` and `discovery_source` set to `'awesome_list'`.
 */
async function getAwesomeTools() {
  const sources = [
    { owner: 'wong2', repo: 'awesome-mcp-servers' },
    { owner: 'punkpeye', repo: 'awesome-mcp-servers' }
  ];
  const tools = [];

  for (const source of sources) {
    try {
      const { data } = await axios.get(
        `${GITHUB_API}/repos/${source.owner}/${source.repo}/readme`,
        { headers: HEADERS }
      );
      const markdown = Buffer.from(data.content, 'base64').toString();

      for (const link of extractLinksFromMarkdown(markdown)) {
        tools.push({
          id: `gh_${link.owner}_${link.repo}`.toLowerCase(), // Same ID format as Topic Search
          name: link.name,
          description: link.description,
          url: link.url,
          repo_url: link.url,
          install_command: `npx -y ${link.repo}`,
          is_awesome_listed: true,
          discovery_source: 'awesome_list'
        });
      }
    } catch (e) {
      // Both sources share the same repo name, so the owner is needed to tell
      // which list failed.
      console.error(`List ${source.owner}/${source.repo} error:`, e && e.message);
    }
  }
  return tools;
}
84
-
85
/**
 * PHASE C: Monorepos.
 *
 * For each configured monorepo, lists the candidate directories `src`,
 * `servers` and `packages` through the GitHub contents API and emits one tool
 * record per sub-directory found.
 *
 * The candidate paths are probes, so a 404 response is treated as "directory
 * not present" and ignored. Any other failure is logged and that path is
 * skipped; the function itself never rejects because of a request failure.
 *
 * @returns {Promise<Array<Object>>} Tool records with `discovery_source`
 *   set to `'official_monorepo'`.
 */
async function getMonorepoTools() {
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
  const candidatePaths = ['src', 'servers', 'packages'];
  const tools = [];

  for (const mono of monorepos) {
    for (const path of candidatePaths) {
      try {
        const { data } = await axios.get(
          `${GITHUB_API}/repos/${mono.owner}/${mono.repo}/contents/${path}`,
          { headers: HEADERS }
        );
        // The response is only iterated when it is an array (a directory
        // listing); any other shape is ignored.
        if (!Array.isArray(data)) {
          continue;
        }

        for (const entry of data) {
          if (entry.type !== 'dir') {
            continue;
          }
          tools.push({
            id: `gh_mono_${mono.repo}_${entry.name}`.toLowerCase(),
            name: `${mono.repo}-${entry.name}`,
            description: `Official tool: ${entry.name}`,
            url: entry.html_url,
            repo_url: `https://github.com/${mono.owner}/${mono.repo}`,
            install_command: `npx -y @modelcontextprotocol/server-${entry.name}`,
            is_awesome_listed: true,
            discovery_source: 'official_monorepo'
          });
        }
      } catch (e) {
        const status = e && e.response ? e.response.status : undefined;
        if (status !== 404) {
          console.error(
            `Monorepo ${mono.owner}/${mono.repo}/${path} error:`,
            e && e.message
          );
        }
      }
    }
  }
  return tools;
}
112
-
113
/**
 * Combine two records that describe the same tool id into a single record.
 *
 * The awesome-listed record is preferred as the base; any field it lacks
 * (undefined, null or empty string) is filled in from the other record, and
 * when both carry a numeric `stars` value the larger one is kept.
 * Neither input object is mutated.
 *
 * @param {Object} existing - Record already collected for this id.
 * @param {Object} incoming - Newly encountered record with the same id.
 * @returns {Object} The merged record.
 */
function mergeToolRecords(existing, incoming) {
  const incomingWins = Boolean(incoming.is_awesome_listed) && !existing.is_awesome_listed;
  const primary = incomingWins ? incoming : existing;
  const secondary = incomingWins ? existing : incoming;

  const merged = Object.assign({}, primary);
  for (const key of Object.keys(secondary)) {
    const current = merged[key];
    if (current === undefined || current === null || current === '') {
      merged[key] = secondary[key];
    }
  }

  if (typeof existing.stars === 'number' && typeof incoming.stars === 'number') {
    merged.stars = Math.max(existing.stars, incoming.stars);
  }
  return merged;
}

/**
 * Collapse a list of tool records to one record per `id`, merging duplicates.
 *
 * @param {Array<Object>} records - Tool records, possibly with repeated ids.
 * @returns {Array<Object>} Unique records in first-seen order.
 */
function dedupeToolsById(records) {
  const byId = new Map();
  for (const record of records) {
    const seen = byId.get(record.id);
    byId.set(record.id, seen ? mergeToolRecords(seen, record) : record);
  }
  return Array.from(byId.values());
}

/**
 * Run all three discovery phases, de-duplicate the results by id, upsert them
 * into the `mcp_tools` table and then call `hydrateMemory()`.
 *
 * Any error (including an error object returned by the upsert) is caught and
 * logged; the returned promise does not reject.
 *
 * @returns {Promise<void>}
 */
async function runSmartScraper() {
  console.log("🚀 Aggressive Scraper Started...");
  try {
    const topicTools = await getTopicTools();
    const awesomeTools = await getAwesomeTools();
    const monoTools = await getMonorepoTools();

    // The same repository can be found by several phases. Records are merged
    // field by field so that an awesome-listed entry replacing a topic-search
    // entry does not lose its stars, keywords or last_updated values.
    const finalTools = dedupeToolsById([...topicTools, ...awesomeTools, ...monoTools]);

    console.log(`📊 Preparing to upsert ${finalTools.length} unique tools...`);

    const { error } = await supabase
      .from('mcp_tools')
      .upsert(finalTools, { onConflict: 'id' });

    if (error) throw error;

    console.log(`✅ Success! Indexed ${finalTools.length} tools.`);
    await hydrateMemory();
  } catch (err) {
    console.error("❌ Scraper Error:", err.message);
  }
}
160
-
161
- module.exports = { runSmartScraper };