Pepguy committed on
Commit
d1fe88e
·
verified Β·
1 Parent(s): 481018d

Update scraper.js

Browse files
Files changed (1) hide show
  1. scraper.js +46 -39
scraper.js CHANGED
@@ -7,20 +7,25 @@ const HEADERS = {
7
  Accept: 'application/vnd.github.v3+json'
8
  };
9
 
10
- // HELPER: Extract links from Markdown READMEs
11
  const extractLinksFromMarkdown = (text) => {
12
- const regex = /\[([^\]]+)\]\((https:\/\/github\.com\/[^)]+)\)\s*(?:-|β€”)?\s*([^\n]+)?/g;
13
  const links = [];
14
  let match;
15
  while ((match = regex.exec(text)) !== null) {
16
- links.push({ name: match[1], url: match[2], description: match[3] || "" });
 
 
 
 
 
 
17
  }
18
  return links;
19
  };
20
 
21
- // PHASE A: Scrape GitHub Topic (Broad Search)
22
  async function getTopicTools() {
23
- console.log("πŸ” Searching GitHub Topics...");
24
  let tools = [];
25
  for (let page = 1; page <= 2; page++) {
26
  try {
@@ -29,7 +34,7 @@ async function getTopicTools() {
29
  { headers: HEADERS }
30
  );
31
  tools.push(...data.items.map(repo => ({
32
- id: `gh_${repo.id}`,
33
  name: repo.name,
34
  description: repo.description,
35
  url: repo.html_url,
@@ -38,15 +43,15 @@ async function getTopicTools() {
38
  keywords: repo.topics || [],
39
  stars: repo.stargazers_count,
40
  last_updated: repo.pushed_at,
41
- is_awesome_listed: false, // Default for search
42
  discovery_source: 'github_topic'
43
  })));
44
- } catch (e) { console.error(`Topic Page ${page} failed`); }
45
  }
46
  return tools;
47
  }
48
 
49
- // PHASE B: Scrape Awesome Lists (High Quality)
50
  async function getAwesomeTools() {
51
  const sources = [
52
  { owner: 'wong2', repo: 'awesome-mcp-servers' },
@@ -56,39 +61,33 @@ async function getAwesomeTools() {
56
 
57
  for (const s of sources) {
58
  try {
59
- console.log(`🌟 Scraping Awesome List: ${s.owner}/${s.repo}`);
60
  const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
61
  const content = Buffer.from(data.content, 'base64').toString();
62
  const links = extractLinksFromMarkdown(content);
63
 
64
  links.forEach(link => {
65
- const parts = link.url.replace('https://github.com/', '').split('/');
66
- if (parts.length >= 2) {
67
- tools.push({
68
- id: `gh_ext_${parts[0]}_${parts[1]}`.toLowerCase(), // Normalized ID
69
- name: link.name,
70
- description: link.description,
71
- url: link.url,
72
- repo_url: link.url,
73
- install_command: `npx -y ${parts[1]}`,
74
- is_awesome_listed: true, // THE FLAG
75
- discovery_source: 'awesome_list'
76
- });
77
- }
78
  });
79
- } catch (e) { console.error(`Awesome List ${s.repo} failed`); }
80
  }
81
  return tools;
82
  }
83
 
84
- // PHASE C: Monorepo Deep Scan
85
  async function getMonorepoTools() {
86
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
87
  let tools = [];
88
-
89
  for (const m of monorepos) {
90
- const paths = ['src', 'servers', 'packages'];
91
- for (const path of paths) {
92
  try {
93
  const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
94
  if (Array.isArray(data)) {
@@ -100,7 +99,7 @@ async function getMonorepoTools() {
100
  url: dir.html_url,
101
  repo_url: `https://github.com/${m.owner}/${m.repo}`,
102
  install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
103
- is_awesome_listed: true, // Official tools count as Awesome
104
  discovery_source: 'official_monorepo'
105
  });
106
  });
@@ -111,30 +110,40 @@ async function getMonorepoTools() {
111
  return tools;
112
  }
113
 
114
- // MAIN EXECUTION
115
  async function runSmartScraper() {
116
  console.log("πŸš€ Aggressive Scraper Started...");
117
-
118
  try {
119
  const topicTools = await getTopicTools();
120
  const awesomeTools = await getAwesomeTools();
121
  const monoTools = await getMonorepoTools();
122
 
123
- // Combine all results
124
  const allResults = [...topicTools, ...awesomeTools, ...monoTools];
125
 
126
- // Deduplicate: If same tool is found in Search and Awesome, Awesome (with the flag) wins.
127
  const finalMap = new Map();
 
128
  allResults.forEach(item => {
129
- const existing = finalMap.get(item.url); // Use URL as the key for deduping
130
- if (!existing || (item.is_awesome_listed && !existing.is_awesome_listed)) {
131
- finalMap.set(item.url, item);
 
 
 
 
 
 
 
 
 
 
 
132
  }
133
  });
134
 
135
  const finalTools = Array.from(finalMap.values());
136
 
137
- // Upsert to Supabase
 
138
  const { error } = await supabase
139
  .from('mcp_tools')
140
  .upsert(finalTools, { onConflict: 'id' });
@@ -142,8 +151,6 @@ async function runSmartScraper() {
142
  if (error) throw error;
143
 
144
  console.log(`βœ… Success! Indexed ${finalTools.length} tools.`);
145
- console.log(`✨ Awesome tools found: ${finalTools.filter(t => t.is_awesome_listed).length}`);
146
-
147
  await hydrateMemory();
148
 
149
  } catch (err) {
 
7
  Accept: 'application/vnd.github.v3+json'
8
  };
9
 
10
+ // HELPER: Extract links from Markdown
11
  const extractLinksFromMarkdown = (text) => {
12
+ const regex = /\[([^\]]+)\]\((https:\/\/github\.com\/([^/)]+)\/([^/)]+))\)\s*(?:-|β€”)?\s*([^\n]+)?/g;
13
  const links = [];
14
  let match;
15
  while ((match = regex.exec(text)) !== null) {
16
+ links.push({
17
+ name: match[1],
18
+ url: match[2],
19
+ owner: match[3],
20
+ repo: match[4],
21
+ description: match[5] || ""
22
+ });
23
  }
24
  return links;
25
  };
26
 
27
+ // PHASE A: Search Topics
28
  async function getTopicTools() {
 
29
  let tools = [];
30
  for (let page = 1; page <= 2; page++) {
31
  try {
 
34
  { headers: HEADERS }
35
  );
36
  tools.push(...data.items.map(repo => ({
37
+ id: `gh_${repo.owner.login}_${repo.name}`.toLowerCase(), // Normalized ID
38
  name: repo.name,
39
  description: repo.description,
40
  url: repo.html_url,
 
43
  keywords: repo.topics || [],
44
  stars: repo.stargazers_count,
45
  last_updated: repo.pushed_at,
46
+ is_awesome_listed: false,
47
  discovery_source: 'github_topic'
48
  })));
49
+ } catch (e) { console.error("Topic search error"); }
50
  }
51
  return tools;
52
  }
53
 
54
+ // PHASE B: Awesome Lists
55
  async function getAwesomeTools() {
56
  const sources = [
57
  { owner: 'wong2', repo: 'awesome-mcp-servers' },
 
61
 
62
  for (const s of sources) {
63
  try {
 
64
  const { data } = await axios.get(`${GITHUB_API}/repos/${s.owner}/${s.repo}/readme`, { headers: HEADERS });
65
  const content = Buffer.from(data.content, 'base64').toString();
66
  const links = extractLinksFromMarkdown(content);
67
 
68
  links.forEach(link => {
69
+ tools.push({
70
+ id: `gh_${link.owner}_${link.repo}`.toLowerCase(), // Same ID format as Topic Search
71
+ name: link.name,
72
+ description: link.description,
73
+ url: link.url,
74
+ repo_url: link.url,
75
+ install_command: `npx -y ${link.repo}`,
76
+ is_awesome_listed: true,
77
+ discovery_source: 'awesome_list'
78
+ });
 
 
 
79
  });
80
+ } catch (e) { console.error(`List ${s.repo} error`); }
81
  }
82
  return tools;
83
  }
84
 
85
+ // PHASE C: Monorepos
86
  async function getMonorepoTools() {
87
  const monorepos = [{ owner: 'modelcontextprotocol', repo: 'servers' }];
88
  let tools = [];
 
89
  for (const m of monorepos) {
90
+ for (const path of ['src', 'servers', 'packages']) {
 
91
  try {
92
  const { data } = await axios.get(`${GITHUB_API}/repos/${m.owner}/${m.repo}/contents/${path}`, { headers: HEADERS });
93
  if (Array.isArray(data)) {
 
99
  url: dir.html_url,
100
  repo_url: `https://github.com/${m.owner}/${m.repo}`,
101
  install_command: `npx -y @modelcontextprotocol/server-${dir.name}`,
102
+ is_awesome_listed: true,
103
  discovery_source: 'official_monorepo'
104
  });
105
  });
 
110
  return tools;
111
  }
112
 
 
113
  async function runSmartScraper() {
114
  console.log("πŸš€ Aggressive Scraper Started...");
 
115
  try {
116
  const topicTools = await getTopicTools();
117
  const awesomeTools = await getAwesomeTools();
118
  const monoTools = await getMonorepoTools();
119
 
 
120
  const allResults = [...topicTools, ...awesomeTools, ...monoTools];
121
 
122
+ // --- THE FIX: DEDUPLICATE BY ID ---
123
  const finalMap = new Map();
124
+
125
  allResults.forEach(item => {
126
+ const existing = finalMap.get(item.id);
127
+
128
+ if (!existing) {
129
+ finalMap.set(item.id, item);
130
+ } else {
131
+ // If we find the same ID again, prefer the one that is Awesome Listed
132
+ if (item.is_awesome_listed && !existing.is_awesome_listed) {
133
+ finalMap.set(item.id, item);
134
+ }
135
+ // Also update stars if the new object has them
136
+ if (item.stars > (existing.stars || 0)) {
137
+ existing.stars = item.stars;
138
+ finalMap.set(item.id, existing);
139
+ }
140
  }
141
  });
142
 
143
  const finalTools = Array.from(finalMap.values());
144
 
145
+ console.log(`πŸ“Š Preparing to upsert ${finalTools.length} unique tools...`);
146
+
147
  const { error } = await supabase
148
  .from('mcp_tools')
149
  .upsert(finalTools, { onConflict: 'id' });
 
151
  if (error) throw error;
152
 
153
  console.log(`βœ… Success! Indexed ${finalTools.length} tools.`);
 
 
154
  await hydrateMemory();
155
 
156
  } catch (err) {