CrispStrobe committed on
Commit
02e6e85
·
1 Parent(s): 3a30924

feat: improve size precision for small models and add manual HF mapping for Infomaniak

Browse files
Files changed (2) hide show
  1. data/providers.json +0 -0
  2. scripts/fetch-providers.js +112 -22
data/providers.json CHANGED
The diff for this file is too large to render. See raw diff
 
scripts/fetch-providers.js CHANGED
@@ -16,6 +16,8 @@ const { getJson, getText, fetchRobust } = require('./fetch-utils');
16
  const DATA_FILE = path.join(__dirname, '..', 'data', 'providers.json');
17
 
18
  // Registry of all available fetchers.
 
 
19
  const FETCHER_MODULES = {
20
  scaleway: require('./providers/scaleway'),
21
  openrouter: require('./providers/openrouter'),
@@ -30,6 +32,7 @@ const FETCHER_MODULES = {
30
  };
31
 
32
  const FETCHERS = Object.entries(FETCHER_MODULES).map(([key, mod]) => {
 
33
  const fn = Object.values(mod).find((v) => typeof v === 'function');
34
  if (!fn) throw new Error(`Module for ${key} exports no function`);
35
  return { key, providerName: mod.providerName, fn };
@@ -53,14 +56,18 @@ function updateProviderModels(providers, providerName, models) {
53
  return true;
54
  }
55
 
 
56
  const normName = (s) =>
57
  s.toLowerCase().replace(/[-_.:]/g, ' ').replace(/[^a-z0-9 ]/g, '').replace(/\s+/g, ' ').trim();
58
 
 
 
59
  function buildOrIndex(orProvider) {
60
  if (!orProvider) return [];
61
  const index = [];
62
  for (const m of orProvider.models || []) {
63
  if (!m.capabilities || m.capabilities.length === 0) continue;
 
64
  const modelPart = m.name.replace(/:free$/, '').split('/').pop();
65
  index.push({
66
  norm: normName(modelPart),
@@ -73,30 +80,48 @@ function buildOrIndex(orProvider) {
73
  return index;
74
  }
75
 
 
 
76
  function findOrMatch(modelName, orIndex) {
 
77
  const raw = modelName.replace(/@[^/]+$/, '').replace(/:[^/]+$/, '');
78
  const modelPart = raw.includes('/') ? raw.split('/').pop() : raw;
 
79
  const n = normName(modelPart).replace(/ (?:reasoning|thinking|extended|nothinking)$/, '');
80
 
81
- for (const entry of orIndex) if (entry.norm === n) return entry;
82
- let best = null, bestLen = 0;
 
 
 
 
 
83
  for (const entry of orIndex) {
84
  if (n.startsWith(entry.norm) && entry.norm.length > bestLen) {
85
- best = entry; bestLen = entry.norm.length;
 
86
  }
87
  }
88
  if (best) return best;
89
- for (const entry of orIndex) if (entry.norm.startsWith(n + ' ')) return entry;
 
 
 
 
 
90
  if (n.length >= 5) {
91
  let bestC = null, bestCLen = Infinity;
92
  for (const entry of orIndex) {
93
  const e = entry.norm;
94
- if ((e === n || e.includes(' ' + n + ' ') || e.startsWith(n + ' ') || e.endsWith(' ' + n)) && e.length < bestCLen) {
 
95
  bestC = entry; bestCLen = e.length;
96
  }
97
  }
98
  if (bestC) return bestC;
99
  }
 
 
100
  const tokens = n.split(' ');
101
  if (tokens.length >= 2 && n.length >= 7) {
102
  let bestT = null, bestTLen = Infinity;
@@ -118,26 +143,42 @@ async function fetchHFSize(hfId) {
118
  const headers = token ? { Authorization: `Bearer ${token}` } : {};
119
  try {
120
  const data = await getJson(`https://huggingface.co/api/models/${hfId}`, { headers });
 
121
  let params = data.safetensors?.total || data.config?.total_parameters || data.config?.model_type_params;
122
  if (!params && data.cardData?.model_details?.parameters) {
123
  const match = data.cardData.model_details.parameters.match(/([\d.]+)\s*[Bb]/);
124
  if (match) params = parseFloat(match[1]) * 1_000_000_000;
125
  }
126
- return params ? Math.round(params / 1_000_000_000 * 10) / 10 : null;
127
- } catch (e) { return null; }
 
 
 
 
 
128
  }
129
 
130
  const EMBEDDER_KEYWORDS = ['embed', 'bge', 'gte', 'e5', 'stella', 'minilm', 'multilingual-mpnet'];
131
 
 
 
 
 
 
 
 
 
 
132
  async function propagateExtraData(data) {
133
  const orProvider = data.providers.find((p) => p.name === 'OpenRouter');
134
  const orIndex = buildOrIndex(orProvider);
135
 
 
136
  let benchmarks = [];
137
  try {
138
  const bmFile = path.join(__dirname, '..', 'data', 'benchmarks.json');
139
  if (fs.existsSync(bmFile)) benchmarks = JSON.parse(fs.readFileSync(bmFile, 'utf8'));
140
- } catch (e) {}
141
 
142
  // Multi-level Benchmark Size Maps
143
  const bmSizeMap = new Map();
@@ -150,31 +191,50 @@ async function propagateExtraData(data) {
150
  }
151
  });
152
 
153
- let propagatedCaps = 0, propagatedSize = 0, autoTagged = 0, hfSizeFetched = 0;
 
 
 
 
 
154
  const hfLookupQueue = [];
155
 
156
  for (const provider of data.providers) {
157
  for (const model of provider.models || []) {
158
  const n = normName(model.name);
159
 
 
 
 
 
 
160
  // 1. STRUCTURED LOOKUP: Match size by hf_id if available (Benchmark gold-standard)
161
  if (!model.size_b && model.hf_id) {
162
  const size = hfIdToSize.get(model.hf_id.toLowerCase());
163
- if (size) { model.size_b = size; propagatedSize++; }
 
 
 
164
  }
165
 
166
- // 2. AUTO-TAG type
167
  if (model.type === 'image' && (!model.capabilities || !model.capabilities.length)) {
168
- model.capabilities = ['image-gen']; autoTagged++;
 
169
  }
170
  if (model.type === 'chat' && EMBEDDER_KEYWORDS.some(k => n.includes(k))) {
171
- model.type = 'embedding'; autoTagged++;
 
172
  }
173
 
174
  // 3. FALLBACK: Match size by name against benchmarks
175
  if (!model.size_b) {
 
176
  const size = bmSizeMap.get(n) || bmSizeMap.get(n.split(' ').pop());
177
- if (size) { model.size_b = size; propagatedSize++; }
 
 
 
178
  }
179
 
180
  // 4. INHERIT: Structured data inheritance from OpenRouter
@@ -188,7 +248,10 @@ async function propagateExtraData(data) {
188
  }
189
  if (model.type === 'chat' && match.type !== 'chat') model.type = match.type;
190
 
191
- if (!model.size_b && match.size_b) { model.size_b = match.size_b; propagatedSize++; }
 
 
 
192
  // Crucial: inherit hf_id to enable Hub API fallback below
193
  if (!model.hf_id && match.hf_id) model.hf_id = match.hf_id;
194
  }
@@ -202,11 +265,13 @@ async function propagateExtraData(data) {
202
  }
203
 
204
  // 6. QUEUE: Still missing size? Try Hub API metadata lookup
205
- if (!model.size_b && (model.name.includes('/') || model.hf_id)) hfLookupQueue.push(model);
 
 
206
  }
207
  }
208
 
209
- // 7. HUB API: Inspect technical metadata (Limit 30 to prevent timeouts)
210
  const uniqueIds = [...new Set(hfLookupQueue.map(m => m.hf_id || m.name).filter(id => id.includes('/')))].slice(0, 30);
211
  if (uniqueIds.length > 0) {
212
  process.stdout.write(` HF Hub: technical metadata inspection for ${uniqueIds.length} models... `);
@@ -218,7 +283,10 @@ async function propagateExtraData(data) {
218
  for (const model of hfLookupQueue) {
219
  if (!model.size_b) {
220
  const size = idToSize.get(model.hf_id || model.name);
221
- if (size) { model.size_b = size; hfSizeFetched++; }
 
 
 
222
  }
223
  }
224
  console.log(`✓ ${hfSizeFetched} sizes found`);
@@ -231,6 +299,7 @@ async function propagateExtraData(data) {
231
 
232
  async function runFetcher(fetcher, data) {
233
  const { key, providerName, fn } = fetcher;
 
234
  try {
235
  process.stdout.write(`Fetching ${providerName}... `);
236
  const models = await fn();
@@ -244,25 +313,46 @@ async function runFetcher(fetcher, data) {
244
  }
245
 
246
  async function main() {
 
247
  const args = process.argv.slice(2).map((a) => a.toLowerCase());
248
- const fetchers = args.length > 0 ? FETCHERS.filter((f) => args.includes(f.key)) : FETCHERS;
 
 
 
 
249
  if (fetchers.length === 0) {
250
  console.error('No matching fetchers found. Available:', FETCHERS.map((f) => f.key).join(', '));
251
  process.exit(1);
252
  }
 
253
  const data = loadData();
254
  console.log(`Running ${fetchers.length} fetcher(s)...\n`);
 
255
  const results = [];
256
- for (const fetcher of fetchers) results.push(await runFetcher(fetcher, data));
 
 
 
 
 
257
  await propagateExtraData(data);
 
258
  saveData(data);
 
259
  console.log('\nSummary:');
260
  let anyFailed = false;
261
  results.forEach((r) => {
262
  if (r.success) console.log(` ✓ ${r.providerName}: ${r.count} models`);
263
- else { console.log(` ✗ ${r.providerName}: ${r.error}`); anyFailed = true; }
 
 
 
264
  });
 
265
  if (anyFailed) process.exit(1);
266
  }
267
 
268
- main().catch((err) => { console.error('Fatal:', err); process.exit(1); });
 
 
 
 
16
  const DATA_FILE = path.join(__dirname, '..', 'data', 'providers.json');
17
 
18
  // Registry of all available fetchers.
19
+ // Each module must export { providerName, fetch<Name> }.
20
+ // Add new providers here as scripts/providers/<name>.js modules.
21
  const FETCHER_MODULES = {
22
  scaleway: require('./providers/scaleway'),
23
  openrouter: require('./providers/openrouter'),
 
32
  };
33
 
34
  const FETCHERS = Object.entries(FETCHER_MODULES).map(([key, mod]) => {
35
+ // Find the exported async function (the one that isn't providerName)
36
  const fn = Object.values(mod).find((v) => typeof v === 'function');
37
  if (!fn) throw new Error(`Module for ${key} exports no function`);
38
  return { key, providerName: mod.providerName, fn };
 
56
  return true;
57
  }
58
 
59
+ // Normalize a model name/ID for fuzzy matching (same as App.tsx normalizeName).
60
  const normName = (s) =>
61
  s.toLowerCase().replace(/[-_.:]/g, ' ').replace(/[^a-z0-9 ]/g, '').replace(/\s+/g, ' ').trim();
62
 
63
+ // Build an index of normalized OpenRouter model-part → { capabilities, type, size_b, hf_id }
64
+ // Only includes entries that carry non-trivial capability data.
65
  function buildOrIndex(orProvider) {
66
  if (!orProvider) return [];
67
  const index = [];
68
  for (const m of orProvider.models || []) {
69
  if (!m.capabilities || m.capabilities.length === 0) continue;
70
+ // Strip :free suffix and take the model part after '/'
71
  const modelPart = m.name.replace(/:free$/, '').split('/').pop();
72
  index.push({
73
  norm: normName(modelPart),
 
80
  return index;
81
  }
82
 
83
+ // For a given model name, find the best matching OpenRouter index entry.
84
+ // Returns { capabilities, type, size_b, hf_id } or null.
85
  function findOrMatch(modelName, orIndex) {
86
+ // Use the model part (after last '/') for matching, strip :region/@suffix
87
  const raw = modelName.replace(/@[^/]+$/, '').replace(/:[^/]+$/, '');
88
  const modelPart = raw.includes('/') ? raw.split('/').pop() : raw;
89
+ // Strip reasoning/thinking suffixes that don't appear in OR model IDs
90
  const n = normName(modelPart).replace(/ (?:reasoning|thinking|extended|nothinking)$/, '');
91
 
92
+ // 1. Exact match
93
+ for (const entry of orIndex) {
94
+ if (entry.norm === n) return entry;
95
+ }
96
+ // 2. Provider model name starts with OR model part (e.g. "claude-3-5-sonnet-20241022" starts with "claude-3-5-sonnet")
97
+ let best = null;
98
+ let bestLen = 0;
99
  for (const entry of orIndex) {
100
  if (n.startsWith(entry.norm) && entry.norm.length > bestLen) {
101
+ best = entry;
102
+ bestLen = entry.norm.length;
103
  }
104
  }
105
  if (best) return best;
106
+ // 3. OR model part starts with provider name (e.g. "claude-haiku-4-5" matches "claude-haiku-4-5-20251001")
107
+ for (const entry of orIndex) {
108
+ if (entry.norm.startsWith(n + ' ')) return entry;
109
+ }
110
+ // 4. OR model norm contains provider name as a contiguous word sequence.
111
+ // Handles short display names like "Sonnet 4.6" matching inside "claude sonnet 4 6".
112
  if (n.length >= 5) {
113
  let bestC = null, bestCLen = Infinity;
114
  for (const entry of orIndex) {
115
  const e = entry.norm;
116
+ if ((e === n || e.includes(' ' + n + ' ') || e.startsWith(n + ' ') || e.endsWith(' ' + n))
117
+ && e.length < bestCLen) {
118
  bestC = entry; bestCLen = e.length;
119
  }
120
  }
121
  if (bestC) return bestC;
122
  }
123
+ // 5. All tokens of provider name appear in OR norm (handles word-order differences).
124
+ // e.g. "Sonnet 3.7" → tokens ["sonnet","3","7"] match inside "claude 3 7 sonnet 20250219".
125
  const tokens = n.split(' ');
126
  if (tokens.length >= 2 && n.length >= 7) {
127
  let bestT = null, bestTLen = Infinity;
 
143
  const headers = token ? { Authorization: `Bearer ${token}` } : {};
144
  try {
145
  const data = await getJson(`https://huggingface.co/api/models/${hfId}`, { headers });
146
+ // Check various common metadata locations for total parameters
147
  let params = data.safetensors?.total || data.config?.total_parameters || data.config?.model_type_params;
148
  if (!params && data.cardData?.model_details?.parameters) {
149
  const match = data.cardData.model_details.parameters.match(/([\d.]+)\s*[Bb]/);
150
  if (match) params = parseFloat(match[1]) * 1_000_000_000;
151
  }
152
+ if (!params) return null;
153
+ const b = params / 1_000_000_000;
154
+ // Keep 2 decimals for small models (<1B), 1 decimal for others
155
+ return b < 1 ? Math.round(b * 100) / 100 : Math.round(b * 10) / 10;
156
+ } catch (e) {
157
+ return null; // Silently skip failures for individual models
158
+ }
159
  }
160
 
161
  const EMBEDDER_KEYWORDS = ['embed', 'bge', 'gte', 'e5', 'stella', 'minilm', 'multilingual-mpnet'];
162
 
163
// Link common models to their HF IDs when naming is non-standard.
// Keys are looked up with normName(model.name), so each key must be written in
// normalized form: lower-case, with '-', '_', '.' and ':' replaced by spaces.
// (A hyphenated key such as 'whisper-large-v3' can never be hit.)
const MANUAL_HF_ID_MAP = {
  'all minilm l12 v2': 'sentence-transformers/all-MiniLM-L12-v2',
  'whisper v3': 'openai/whisper-large-v3',
  'whisper large v3': 'openai/whisper-large-v3',
};
169
+
170
+ // Propagate capabilities and size from benchmarks, OpenRouter, or HF Hub to all other providers' models.
171
+ // Only fills in fields when the model doesn't already have them.
172
  async function propagateExtraData(data) {
173
  const orProvider = data.providers.find((p) => p.name === 'OpenRouter');
174
  const orIndex = buildOrIndex(orProvider);
175
 
176
+ // Load benchmarks for size lookup
177
  let benchmarks = [];
178
  try {
179
  const bmFile = path.join(__dirname, '..', 'data', 'benchmarks.json');
180
  if (fs.existsSync(bmFile)) benchmarks = JSON.parse(fs.readFileSync(bmFile, 'utf8'));
181
+ } catch (e) { /* ignore */ }
182
 
183
  // Multi-level Benchmark Size Maps
184
  const bmSizeMap = new Map();
 
191
  }
192
  });
193
 
194
+ let propagatedCaps = 0;
195
+ let propagatedSize = 0;
196
+ let autoTagged = 0;
197
+ let hfSizeFetched = 0;
198
+
199
+ // We'll collect models missing size that have a clear HF-id-like name
200
  const hfLookupQueue = [];
201
 
202
  for (const provider of data.providers) {
203
  for (const model of provider.models || []) {
204
  const n = normName(model.name);
205
 
206
+ // 0. MANUAL OVERRIDE: Link common models to their HF IDs
207
+ if (!model.hf_id && MANUAL_HF_ID_MAP[n]) {
208
+ model.hf_id = MANUAL_HF_ID_MAP[n];
209
+ }
210
+
211
  // 1. STRUCTURED LOOKUP: Match size by hf_id if available (Benchmark gold-standard)
212
  if (!model.size_b && model.hf_id) {
213
  const size = hfIdToSize.get(model.hf_id.toLowerCase());
214
+ if (size) {
215
+ model.size_b = size;
216
+ propagatedSize++;
217
+ }
218
  }
219
 
220
+ // 2. AUTO-TAG image-gen and embedding models
221
  if (model.type === 'image' && (!model.capabilities || !model.capabilities.length)) {
222
+ model.capabilities = ['image-gen'];
223
+ autoTagged++;
224
  }
225
  if (model.type === 'chat' && EMBEDDER_KEYWORDS.some(k => n.includes(k))) {
226
+ model.type = 'embedding';
227
+ autoTagged++;
228
  }
229
 
230
  // 3. FALLBACK: Match size by name against benchmarks
231
  if (!model.size_b) {
232
+ // Try exact name match or base name match
233
  const size = bmSizeMap.get(n) || bmSizeMap.get(n.split(' ').pop());
234
+ if (size) {
235
+ model.size_b = size;
236
+ propagatedSize++;
237
+ }
238
  }
239
 
240
  // 4. INHERIT: Structured data inheritance from OpenRouter
 
248
  }
249
  if (model.type === 'chat' && match.type !== 'chat') model.type = match.type;
250
 
251
+ if (!model.size_b && match.size_b) {
252
+ model.size_b = match.size_b;
253
+ propagatedSize++;
254
+ }
255
  // Crucial: inherit hf_id to enable Hub API fallback below
256
  if (!model.hf_id && match.hf_id) model.hf_id = match.hf_id;
257
  }
 
265
  }
266
 
267
  // 6. QUEUE: Still missing size? Try Hub API metadata lookup
268
+ if (!model.size_b && (model.name.includes('/') || model.hf_id)) {
269
+ hfLookupQueue.push(model);
270
+ }
271
  }
272
  }
273
 
274
+ // 7. HUB API: Inspect technical metadata (Limit 30 unique IDs to avoid long startup)
275
  const uniqueIds = [...new Set(hfLookupQueue.map(m => m.hf_id || m.name).filter(id => id.includes('/')))].slice(0, 30);
276
  if (uniqueIds.length > 0) {
277
  process.stdout.write(` HF Hub: technical metadata inspection for ${uniqueIds.length} models... `);
 
283
  for (const model of hfLookupQueue) {
284
  if (!model.size_b) {
285
  const size = idToSize.get(model.hf_id || model.name);
286
+ if (size) {
287
+ model.size_b = size;
288
+ hfSizeFetched++;
289
+ }
290
  }
291
  }
292
  console.log(`✓ ${hfSizeFetched} sizes found`);
 
299
 
300
  async function runFetcher(fetcher, data) {
301
  const { key, providerName, fn } = fetcher;
302
+
303
  try {
304
  process.stdout.write(`Fetching ${providerName}... `);
305
  const models = await fn();
 
313
  }
314
 
315
  async function main() {
316
+ // Determine which fetchers to run
317
  const args = process.argv.slice(2).map((a) => a.toLowerCase());
318
+ const fetchers =
319
+ args.length > 0
320
+ ? FETCHERS.filter((f) => args.includes(f.key))
321
+ : FETCHERS;
322
+
323
  if (fetchers.length === 0) {
324
  console.error('No matching fetchers found. Available:', FETCHERS.map((f) => f.key).join(', '));
325
  process.exit(1);
326
  }
327
+
328
  const data = loadData();
329
  console.log(`Running ${fetchers.length} fetcher(s)...\n`);
330
+
331
  const results = [];
332
+ for (const fetcher of fetchers) {
333
+ const result = await runFetcher(fetcher, data);
334
+ results.push(result);
335
+ }
336
+
337
+ // Always propagate extra data from OpenRouter and Benchmarks to all providers' models.
338
  await propagateExtraData(data);
339
+
340
  saveData(data);
341
+
342
  console.log('\nSummary:');
343
  let anyFailed = false;
344
  results.forEach((r) => {
345
  if (r.success) console.log(` ✓ ${r.providerName}: ${r.count} models`);
346
+ else {
347
+ console.log(` ✗ ${r.providerName}: ${r.error}`);
348
+ anyFailed = true;
349
+ }
350
  });
351
+
352
  if (anyFailed) process.exit(1);
353
  }
354
 
355
// Script entry point: a rejection that escapes main() is logged and the process
// terminates with exit status 1.
function reportFatal(fatalErr) {
  console.error('Fatal:', fatalErr);
  process.exit(1);
}

main().catch(reportFatal);