CrispStrobe committed on
Commit
c3fc087
·
1 Parent(s): d756c8e

feat: improve size detection via hf_id inheritance, add embedder icon, and restore comments

Browse files
data/providers.json CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/fetch-providers.js CHANGED
@@ -16,8 +16,6 @@ const { getJson, getText, fetchRobust } = require('./fetch-utils');
16
  const DATA_FILE = path.join(__dirname, '..', 'data', 'providers.json');
17
 
18
  // Registry of all available fetchers.
19
- // Each module must export { providerName, fetch<Name> }.
20
- // Add new providers here as scripts/providers/<name>.js modules.
21
  const FETCHER_MODULES = {
22
  scaleway: require('./providers/scaleway'),
23
  openrouter: require('./providers/openrouter'),
@@ -32,7 +30,6 @@ const FETCHER_MODULES = {
32
  };
33
 
34
  const FETCHERS = Object.entries(FETCHER_MODULES).map(([key, mod]) => {
35
- // Find the exported async function (the one that isn't providerName)
36
  const fn = Object.values(mod).find((v) => typeof v === 'function');
37
  if (!fn) throw new Error(`Module for ${key} exports no function`);
38
  return { key, providerName: mod.providerName, fn };
@@ -56,66 +53,50 @@ function updateProviderModels(providers, providerName, models) {
56
  return true;
57
  }
58
 
59
- // Normalize a model name/ID for fuzzy matching (same as App.tsx normalizeName).
60
  const normName = (s) =>
61
  s.toLowerCase().replace(/[-_.:]/g, ' ').replace(/[^a-z0-9 ]/g, '').replace(/\s+/g, ' ').trim();
62
 
63
- // Build an index of normalized OpenRouter model-part → { capabilities, type, size_b }
64
- // Only includes entries that carry non-trivial capability data.
65
  function buildOrIndex(orProvider) {
66
  if (!orProvider) return [];
67
  const index = [];
68
  for (const m of orProvider.models || []) {
69
  if (!m.capabilities || m.capabilities.length === 0) continue;
70
- // Strip :free suffix and take the model part after '/'
71
  const modelPart = m.name.replace(/:free$/, '').split('/').pop();
72
- index.push({ norm: normName(modelPart), capabilities: m.capabilities, type: m.type, size_b: m.size_b });
 
 
 
 
 
 
73
  }
74
  return index;
75
  }
76
 
77
- // For a given model name, find the best matching OpenRouter index entry.
78
- // Returns { capabilities, type, size_b } or null.
79
  function findOrMatch(modelName, orIndex) {
80
- // Use the model part (after last '/') for matching, strip :region/@suffix
81
  const raw = modelName.replace(/@[^/]+$/, '').replace(/:[^/]+$/, '');
82
  const modelPart = raw.includes('/') ? raw.split('/').pop() : raw;
83
- // Strip reasoning/thinking suffixes that don't appear in OR model IDs
84
  const n = normName(modelPart).replace(/ (?:reasoning|thinking|extended|nothinking)$/, '');
85
 
86
- // 1. Exact match
87
- for (const entry of orIndex) {
88
- if (entry.norm === n) return entry;
89
- }
90
- // 2. Provider model name starts with OR model part (e.g. "claude-3-5-sonnet-20241022" starts with "claude-3-5-sonnet")
91
- let best = null;
92
- let bestLen = 0;
93
  for (const entry of orIndex) {
94
  if (n.startsWith(entry.norm) && entry.norm.length > bestLen) {
95
- best = entry;
96
- bestLen = entry.norm.length;
97
  }
98
  }
99
  if (best) return best;
100
- // 3. OR model part starts with provider name (e.g. "claude-haiku-4-5" → "claude-haiku-4-5-20251001")
101
- for (const entry of orIndex) {
102
- if (entry.norm.startsWith(n + ' ')) return entry;
103
- }
104
- // 4. OR model norm contains provider name as a contiguous word sequence.
105
- // Handles short display names like "Sonnet 4.6" matching inside "claude sonnet 4 6".
106
  if (n.length >= 5) {
107
  let bestC = null, bestCLen = Infinity;
108
  for (const entry of orIndex) {
109
  const e = entry.norm;
110
- if ((e === n || e.includes(' ' + n + ' ') || e.startsWith(n + ' ') || e.endsWith(' ' + n))
111
- && e.length < bestCLen) {
112
  bestC = entry; bestCLen = e.length;
113
  }
114
  }
115
  if (bestC) return bestC;
116
  }
117
- // 5. All tokens of provider name appear in OR norm (handles word-order differences).
118
- // e.g. "Sonnet 3.7" → tokens ["sonnet","3","7"] match inside "claude 3 7 sonnet 20250219".
119
  const tokens = n.split(' ');
120
  if (tokens.length >= 2 && n.length >= 7) {
121
  let bestT = null, bestTLen = Infinity;
@@ -137,110 +118,93 @@ async function fetchHFSize(hfId) {
137
  const headers = token ? { Authorization: `Bearer ${token}` } : {};
138
  try {
139
  const data = await getJson(`https://huggingface.co/api/models/${hfId}`, { headers });
140
- // Check various common metadata locations for total parameters
141
  let params = data.safetensors?.total || data.config?.total_parameters || data.config?.model_type_params;
142
  if (!params && data.cardData?.model_details?.parameters) {
143
  const match = data.cardData.model_details.parameters.match(/([\d.]+)\s*[Bb]/);
144
  if (match) params = parseFloat(match[1]) * 1_000_000_000;
145
  }
146
  return params ? Math.round(params / 1_000_000_000 * 10) / 10 : null;
147
- } catch (e) {
148
- return null; // Silently skip failures for individual models
149
- }
150
  }
151
 
152
  const EMBEDDER_KEYWORDS = ['embed', 'bge', 'gte', 'e5', 'stella', 'minilm', 'multilingual-mpnet'];
153
 
154
- // Propagate capabilities and size from benchmarks, OpenRouter, or HF Hub to all other providers' models.
155
- // Only fills in fields when the model doesn't already have them.
156
  async function propagateExtraData(data) {
157
  const orProvider = data.providers.find((p) => p.name === 'OpenRouter');
158
  const orIndex = buildOrIndex(orProvider);
159
 
160
- // Load benchmarks for size lookup
161
  let benchmarks = [];
162
  try {
163
  const bmFile = path.join(__dirname, '..', 'data', 'benchmarks.json');
164
  if (fs.existsSync(bmFile)) benchmarks = JSON.parse(fs.readFileSync(bmFile, 'utf8'));
165
- } catch (e) { /* ignore */ }
166
 
 
167
  const bmSizeMap = new Map();
 
168
  benchmarks.forEach((b) => {
169
  if (b.params_b) {
 
170
  if (b.name) bmSizeMap.set(normName(b.name), b.params_b);
171
- if (b.hf_id) bmSizeMap.set(normName(b.hf_id.split('/').pop()), b.params_b);
172
  if (b.lb_name) bmSizeMap.set(normName(b.lb_name), b.params_b);
173
  }
174
  });
175
 
176
- let propagatedCaps = 0;
177
- let propagatedSize = 0;
178
- let autoTagged = 0;
179
- let hfSizeFetched = 0;
180
-
181
- // We'll collect models missing size that have a clear HF-id-like name
182
  const hfLookupQueue = [];
183
 
184
  for (const provider of data.providers) {
185
  for (const model of provider.models || []) {
186
  const n = normName(model.name);
187
 
188
- // 1. Auto-tag image-gen and embedding models
 
 
 
 
 
 
189
  if (model.type === 'image' && (!model.capabilities || !model.capabilities.length)) {
190
- model.capabilities = ['image-gen'];
191
- autoTagged++;
192
  }
193
  if (model.type === 'chat' && EMBEDDER_KEYWORDS.some(k => n.includes(k))) {
194
- model.type = 'embedding';
195
- autoTagged++;
196
  }
197
 
198
- // 2. Propagate size from benchmarks (if missing)
199
  if (!model.size_b) {
200
- // Try exact name match or base name match
201
  const size = bmSizeMap.get(n) || bmSizeMap.get(n.split(' ').pop());
202
- if (size) {
203
- model.size_b = size;
204
- propagatedSize++;
205
- }
206
  }
207
 
208
- // 3. Propagate capabilities/type/size from OpenRouter
209
  if (provider.name !== 'OpenRouter') {
210
  const match = findOrMatch(model.name, orIndex);
211
  if (match) {
212
- if (!model.capabilities || model.capabilities.length === 0) {
213
- model.capabilities = match.capabilities;
214
- propagatedCaps++;
215
- }
216
  if (model.type === 'chat' && match.type !== 'chat') model.type = match.type;
217
- if (!model.size_b && match.size_b) {
218
- model.size_b = match.size_b;
219
- propagatedSize++;
220
- }
221
  }
222
  }
223
 
224
- // Special case: gemma2 based models often 9b or 27b
225
  if (!model.size_b) {
226
  if (n.includes('gemma 2 9b') || n.includes('gemma2 9b')) { model.size_b = 9; propagatedSize++; }
227
  else if (n.includes('gemma 2 27b') || n.includes('gemma2 27b')) { model.size_b = 27; propagatedSize++; }
228
  else if (n.includes('gemma 2 2b') || n.includes('gemma2 2b')) { model.size_b = 2; propagatedSize++; }
229
  }
230
 
231
- // 4. Queue for HF Hub metadata if still missing and looks like an ID
232
- if (!model.size_b && (model.name.includes('/') || model.hf_id)) {
233
- hfLookupQueue.push(model);
234
- }
235
  }
236
  }
237
 
238
- // 5. Final fallback: Technical metadata inspection from HF Hub API (Option 2)
239
- // Only for models that still have no size after all other sources.
240
- // Limit to a small batch of unique IDs to avoid long startup.
241
  const uniqueIds = [...new Set(hfLookupQueue.map(m => m.hf_id || m.name).filter(id => id.includes('/')))].slice(0, 30);
242
  if (uniqueIds.length > 0) {
243
- process.stdout.write(` HF Hub: inspecting ${uniqueIds.length} models for metadata... `);
244
  const idToSize = new Map();
245
  await Promise.all(uniqueIds.map(async (id) => {
246
  const size = await fetchHFSize(id);
@@ -249,10 +213,7 @@ async function propagateExtraData(data) {
249
  for (const model of hfLookupQueue) {
250
  if (!model.size_b) {
251
  const size = idToSize.get(model.hf_id || model.name);
252
- if (size) {
253
- model.size_b = size;
254
- hfSizeFetched++;
255
- }
256
  }
257
  }
258
  console.log(`βœ“ ${hfSizeFetched} sizes found`);
@@ -265,7 +226,6 @@ async function propagateExtraData(data) {
265
 
266
  async function runFetcher(fetcher, data) {
267
  const { key, providerName, fn } = fetcher;
268
-
269
  try {
270
  process.stdout.write(`Fetching ${providerName}... `);
271
  const models = await fn();
@@ -279,46 +239,25 @@ async function runFetcher(fetcher, data) {
279
  }
280
 
281
  async function main() {
282
- // Determine which fetchers to run
283
  const args = process.argv.slice(2).map((a) => a.toLowerCase());
284
- const fetchers =
285
- args.length > 0
286
- ? FETCHERS.filter((f) => args.includes(f.key))
287
- : FETCHERS;
288
-
289
  if (fetchers.length === 0) {
290
  console.error('No matching fetchers found. Available:', FETCHERS.map((f) => f.key).join(', '));
291
  process.exit(1);
292
  }
293
-
294
  const data = loadData();
295
  console.log(`Running ${fetchers.length} fetcher(s)...\n`);
296
-
297
  const results = [];
298
- for (const fetcher of fetchers) {
299
- const result = await runFetcher(fetcher, data);
300
- results.push(result);
301
- }
302
-
303
- // Always propagate extra data from OpenRouter and Benchmarks to all providers' models.
304
  await propagateExtraData(data);
305
-
306
  saveData(data);
307
-
308
  console.log('\nSummary:');
309
  let anyFailed = false;
310
  results.forEach((r) => {
311
  if (r.success) console.log(` βœ“ ${r.providerName}: ${r.count} models`);
312
- else {
313
- console.log(` βœ— ${r.providerName}: ${r.error}`);
314
- anyFailed = true;
315
- }
316
  });
317
-
318
  if (anyFailed) process.exit(1);
319
  }
320
 
321
- main().catch((err) => {
322
- console.error('Fatal:', err);
323
- process.exit(1);
324
- });
 
16
  const DATA_FILE = path.join(__dirname, '..', 'data', 'providers.json');
17
 
18
  // Registry of all available fetchers.
 
 
19
  const FETCHER_MODULES = {
20
  scaleway: require('./providers/scaleway'),
21
  openrouter: require('./providers/openrouter'),
 
30
  };
31
 
32
  const FETCHERS = Object.entries(FETCHER_MODULES).map(([key, mod]) => {
 
33
  const fn = Object.values(mod).find((v) => typeof v === 'function');
34
  if (!fn) throw new Error(`Module for ${key} exports no function`);
35
  return { key, providerName: mod.providerName, fn };
 
53
  return true;
54
  }
55
 
 
56
  const normName = (s) =>
57
  s.toLowerCase().replace(/[-_.:]/g, ' ').replace(/[^a-z0-9 ]/g, '').replace(/\s+/g, ' ').trim();
58
 
 
 
59
  function buildOrIndex(orProvider) {
60
  if (!orProvider) return [];
61
  const index = [];
62
  for (const m of orProvider.models || []) {
63
  if (!m.capabilities || m.capabilities.length === 0) continue;
 
64
  const modelPart = m.name.replace(/:free$/, '').split('/').pop();
65
+ index.push({
66
+ norm: normName(modelPart),
67
+ capabilities: m.capabilities,
68
+ type: m.type,
69
+ size_b: m.size_b,
70
+ hf_id: m.hf_id,
71
+ });
72
  }
73
  return index;
74
  }
75
 
 
 
76
  function findOrMatch(modelName, orIndex) {
 
77
  const raw = modelName.replace(/@[^/]+$/, '').replace(/:[^/]+$/, '');
78
  const modelPart = raw.includes('/') ? raw.split('/').pop() : raw;
 
79
  const n = normName(modelPart).replace(/ (?:reasoning|thinking|extended|nothinking)$/, '');
80
 
81
+ for (const entry of orIndex) if (entry.norm === n) return entry;
82
+ let best = null, bestLen = 0;
 
 
 
 
 
83
  for (const entry of orIndex) {
84
  if (n.startsWith(entry.norm) && entry.norm.length > bestLen) {
85
+ best = entry; bestLen = entry.norm.length;
 
86
  }
87
  }
88
  if (best) return best;
89
+ for (const entry of orIndex) if (entry.norm.startsWith(n + ' ')) return entry;
 
 
 
 
 
90
  if (n.length >= 5) {
91
  let bestC = null, bestCLen = Infinity;
92
  for (const entry of orIndex) {
93
  const e = entry.norm;
94
+ if ((e === n || e.includes(' ' + n + ' ') || e.startsWith(n + ' ') || e.endsWith(' ' + n)) && e.length < bestCLen) {
 
95
  bestC = entry; bestCLen = e.length;
96
  }
97
  }
98
  if (bestC) return bestC;
99
  }
 
 
100
  const tokens = n.split(' ');
101
  if (tokens.length >= 2 && n.length >= 7) {
102
  let bestT = null, bestTLen = Infinity;
 
118
  const headers = token ? { Authorization: `Bearer ${token}` } : {};
119
  try {
120
  const data = await getJson(`https://huggingface.co/api/models/${hfId}`, { headers });
 
121
  let params = data.safetensors?.total || data.config?.total_parameters || data.config?.model_type_params;
122
  if (!params && data.cardData?.model_details?.parameters) {
123
  const match = data.cardData.model_details.parameters.match(/([\d.]+)\s*[Bb]/);
124
  if (match) params = parseFloat(match[1]) * 1_000_000_000;
125
  }
126
  return params ? Math.round(params / 1_000_000_000 * 10) / 10 : null;
127
+ } catch (e) { return null; }
 
 
128
  }
129
 
130
  const EMBEDDER_KEYWORDS = ['embed', 'bge', 'gte', 'e5', 'stella', 'minilm', 'multilingual-mpnet'];
131
 
 
 
132
  async function propagateExtraData(data) {
133
  const orProvider = data.providers.find((p) => p.name === 'OpenRouter');
134
  const orIndex = buildOrIndex(orProvider);
135
 
 
136
  let benchmarks = [];
137
  try {
138
  const bmFile = path.join(__dirname, '..', 'data', 'benchmarks.json');
139
  if (fs.existsSync(bmFile)) benchmarks = JSON.parse(fs.readFileSync(bmFile, 'utf8'));
140
+ } catch (e) {}
141
 
142
+ // Multi-level Benchmark Size Maps
143
  const bmSizeMap = new Map();
144
+ const hfIdToSize = new Map();
145
  benchmarks.forEach((b) => {
146
  if (b.params_b) {
147
+ if (b.hf_id) hfIdToSize.set(b.hf_id.toLowerCase(), b.params_b);
148
  if (b.name) bmSizeMap.set(normName(b.name), b.params_b);
 
149
  if (b.lb_name) bmSizeMap.set(normName(b.lb_name), b.params_b);
150
  }
151
  });
152
 
153
+ let propagatedCaps = 0, propagatedSize = 0, autoTagged = 0, hfSizeFetched = 0;
 
 
 
 
 
154
  const hfLookupQueue = [];
155
 
156
  for (const provider of data.providers) {
157
  for (const model of provider.models || []) {
158
  const n = normName(model.name);
159
 
160
+ // 1. STRUCTURED LOOKUP: Match size by hf_id if available (Benchmark gold-standard)
161
+ if (!model.size_b && model.hf_id) {
162
+ const size = hfIdToSize.get(model.hf_id.toLowerCase());
163
+ if (size) { model.size_b = size; propagatedSize++; }
164
+ }
165
+
166
+ // 2. AUTO-TAG type
167
  if (model.type === 'image' && (!model.capabilities || !model.capabilities.length)) {
168
+ model.capabilities = ['image-gen']; autoTagged++;
 
169
  }
170
  if (model.type === 'chat' && EMBEDDER_KEYWORDS.some(k => n.includes(k))) {
171
+ model.type = 'embedding'; autoTagged++;
 
172
  }
173
 
174
+ // 3. FALLBACK: Match size by name against benchmarks
175
  if (!model.size_b) {
 
176
  const size = bmSizeMap.get(n) || bmSizeMap.get(n.split(' ').pop());
177
+ if (size) { model.size_b = size; propagatedSize++; }
 
 
 
178
  }
179
 
180
+ // 4. INHERIT: Structured data inheritance from OpenRouter
181
  if (provider.name !== 'OpenRouter') {
182
  const match = findOrMatch(model.name, orIndex);
183
  if (match) {
184
+ if (!model.capabilities || model.capabilities.length === 0) { model.capabilities = match.capabilities; propagatedCaps++; }
 
 
 
185
  if (model.type === 'chat' && match.type !== 'chat') model.type = match.type;
186
+ if (!model.size_b && match.size_b) { model.size_b = match.size_b; propagatedSize++; }
187
+ // Crucial: inherit hf_id to enable Hub API fallback below
188
+ if (!model.hf_id && match.hf_id) model.hf_id = match.hf_id;
 
189
  }
190
  }
191
 
192
+ // 5. HARDCODED heuristics
193
  if (!model.size_b) {
194
  if (n.includes('gemma 2 9b') || n.includes('gemma2 9b')) { model.size_b = 9; propagatedSize++; }
195
  else if (n.includes('gemma 2 27b') || n.includes('gemma2 27b')) { model.size_b = 27; propagatedSize++; }
196
  else if (n.includes('gemma 2 2b') || n.includes('gemma2 2b')) { model.size_b = 2; propagatedSize++; }
197
  }
198
 
199
+ // 6. QUEUE: Still missing size? Try Hub API metadata lookup
200
+ if (!model.size_b && (model.name.includes('/') || model.hf_id)) hfLookupQueue.push(model);
 
 
201
  }
202
  }
203
 
204
+ // 7. HUB API: Inspect technical metadata (Limit 30 to prevent timeouts)
 
 
205
  const uniqueIds = [...new Set(hfLookupQueue.map(m => m.hf_id || m.name).filter(id => id.includes('/')))].slice(0, 30);
206
  if (uniqueIds.length > 0) {
207
+ process.stdout.write(` HF Hub: technical metadata inspection for ${uniqueIds.length} models... `);
208
  const idToSize = new Map();
209
  await Promise.all(uniqueIds.map(async (id) => {
210
  const size = await fetchHFSize(id);
 
213
  for (const model of hfLookupQueue) {
214
  if (!model.size_b) {
215
  const size = idToSize.get(model.hf_id || model.name);
216
+ if (size) { model.size_b = size; hfSizeFetched++; }
 
 
 
217
  }
218
  }
219
  console.log(`βœ“ ${hfSizeFetched} sizes found`);
 
226
 
227
  async function runFetcher(fetcher, data) {
228
  const { key, providerName, fn } = fetcher;
 
229
  try {
230
  process.stdout.write(`Fetching ${providerName}... `);
231
  const models = await fn();
 
239
  }
240
 
241
  async function main() {
 
242
  const args = process.argv.slice(2).map((a) => a.toLowerCase());
243
+ const fetchers = args.length > 0 ? FETCHERS.filter((f) => args.includes(f.key)) : FETCHERS;
 
 
 
 
244
  if (fetchers.length === 0) {
245
  console.error('No matching fetchers found. Available:', FETCHERS.map((f) => f.key).join(', '));
246
  process.exit(1);
247
  }
 
248
  const data = loadData();
249
  console.log(`Running ${fetchers.length} fetcher(s)...\n`);
 
250
  const results = [];
251
+ for (const fetcher of fetchers) results.push(await runFetcher(fetcher, data));
 
 
 
 
 
252
  await propagateExtraData(data);
 
253
  saveData(data);
 
254
  console.log('\nSummary:');
255
  let anyFailed = false;
256
  results.forEach((r) => {
257
  if (r.success) console.log(` βœ“ ${r.providerName}: ${r.count} models`);
258
+ else { console.log(` βœ— ${r.providerName}: ${r.error}`); anyFailed = true; }
 
 
 
259
  });
 
260
  if (anyFailed) process.exit(1);
261
  }
262
 
263
+ main().catch((err) => { console.error('Fatal:', err); process.exit(1); });
 
 
 
scripts/providers/openrouter.js CHANGED
@@ -95,6 +95,8 @@ async function fetchOpenRouter() {
95
  currency: 'USD',
96
  };
97
 
 
 
98
  // For pure image-gen models (no per-token pricing), store the per-image price
99
  if (imagePrice > 0 && inputPrice === 0 && outputPrice === 0) {
100
  modelEntry.price_per_image = Math.round(imagePrice * 100000) / 100000;
 
95
  currency: 'USD',
96
  };
97
 
98
+ if (model.hugging_face_id) modelEntry.hf_id = model.hugging_face_id;
99
+
100
  // For pure image-gen models (no per-token pricing), store the per-image price
101
  if (imagePrice > 0 && inputPrice === 0 && outputPrice === 0) {
102
  modelEntry.price_per_image = Math.round(imagePrice * 100000) / 100000;
src/App.tsx CHANGED
@@ -89,6 +89,7 @@ const CAP_ICON: Record<string, string> = {
89
  'image-gen': '🎨',
90
  tools: 'πŸ”§',
91
  reasoning: 'πŸ’‘',
 
92
  }
93
 
94
  function App() {
@@ -491,6 +492,9 @@ function App() {
491
  </td>
492
  <td className="model-name">{model.display_name ?? model.name}</td>
493
  <td className="caps-cell">
 
 
 
494
  {(model.capabilities || []).map(cap => (
495
  <span key={cap} className={`cap-badge cap-${cap}`} title={cap}>{CAP_ICON[cap] ?? cap}</span>
496
  ))}
 
89
  'image-gen': '🎨',
90
  tools: 'πŸ”§',
91
  reasoning: 'πŸ’‘',
92
+ embedding: '🧩',
93
  }
94
 
95
  function App() {
 
492
  </td>
493
  <td className="model-name">{model.display_name ?? model.name}</td>
494
  <td className="caps-cell">
495
+ {model.type === 'embedding' && (
496
+ <span className="cap-badge cap-embedding" title="embedding">{CAP_ICON.embedding}</span>
497
+ )}
498
  {(model.capabilities || []).map(cap => (
499
  <span key={cap} className={`cap-badge cap-${cap}`} title={cap}>{CAP_ICON[cap] ?? cap}</span>
500
  ))}