File size: 2,453 Bytes
da8db3e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import "./server-only-shim.mjs"
import fs from "fs"

// Local copy of normalizeDeveloperName (and its lookup table) from
// lib/model-data.ts:217-244. The audit below reports what this logic does to
// real data, so the behavior here is intentionally left exactly as copied.

// Lower-cased developer identifier → display name. "deepseek-ai" and "x-ai"
// are alternate spellings that share a display name with "deepseek" / "xai".
const KNOWN_DEVELOPER_NAMES = {
  openai: "OpenAI",
  google: "Google",
  anthropic: "Anthropic",
  meta: "Meta",
  microsoft: "Microsoft",
  mistralai: "Mistral AI",
  deepseek: "DeepSeek",
  "deepseek-ai": "DeepSeek",
  cohere: "Cohere",
  nvidia: "NVIDIA",
  alibaba: "Alibaba",
  amazon: "Amazon",
  apple: "Apple",
  ibm: "IBM",
  xai: "xAI",
  "x-ai": "xAI",
}

/**
 * Turn a raw developer identifier into a display name.
 *
 * Resolution order:
 *   1. The trimmed, lower-cased name is looked up in KNOWN_DEVELOPER_NAMES.
 *   2. Otherwise, a name that is entirely lowercase and starts with a-z gets
 *      its first character upper-cased.
 *   3. Otherwise the name is returned untouched.
 *
 * NOTE(review): steps 2 and 3 operate on the *untrimmed* input, so a value
 * with surrounding whitespace that misses the table is returned as-is.
 * NOTE(review): the table is a plain object, so inherited keys such as
 * "constructor" also count as hits and yield a non-string value.
 *
 * @param {string} name - Raw developer string.
 * @returns {string} Display name (see the review notes for the odd cases).
 */
function normalizeDeveloperName(name) {
  const canonical = KNOWN_DEVELOPER_NAMES[name.trim().toLowerCase()]
  if (canonical) {
    return canonical
  }

  const isAllLowercase = name.toLowerCase() === name
  const startsWithAsciiLetter = /^[a-z]/.test(name)
  if (!isAllLowercase || !startsWithAsciiLetter) {
    return name
  }

  // Non-empty is guaranteed here because the regex matched a first character.
  return `${name[0].toUpperCase()}${name.slice(1)}`
}

// Developer records to audit; each entry is read via its `developer` field.
const devs = JSON.parse(fs.readFileSync(".cache/hf-data/developers.json", "utf8"))

console.log(`=== Audit: normalizeDeveloperName across ${devs.length} developers ===`)

// Tally which branch of normalizeDeveloperName each name falls into, and keep
// up to five raw → normalized samples per branch for the printout below.
const buckets = { mapHit: 0, titleCase: 0, passthrough: 0 }
const examples = { mapHit: [], titleCase: [], passthrough: [] }
for (const entry of devs) {
  const original = entry.developer
  const normalized = normalizeDeveloperName(original)

  // Re-derive the branch using the same conditions the normalizer checks.
  const lookupKey = original.trim().toLowerCase()
  const looksLowercase = original === original.toLowerCase() && /^[a-z]/.test(original)
  const bucket = KNOWN_DEVELOPER_NAMES[lookupKey]
    ? "mapHit"
    : looksLowercase
      ? "titleCase"
      : "passthrough"

  buckets[bucket] += 1
  const samples = examples[bucket]
  if (samples.length < 5) {
    samples.push({ raw: original, normalized })
  }
}

console.log(buckets)
console.log()
Object.entries(examples).forEach(([label, samples]) => {
  console.log(`--- ${label} ---`)
  samples.forEach(({ raw, normalized }) => {
    console.log(`  '${raw}' → '${normalized}'`)
  })
})

// Also check model-cards.json — `developer` field there.
// The file is loaded before the banner is printed so the banner can report the
// real number of entries instead of a hard-coded figure that goes stale
// whenever the cached data changes.
const cards = JSON.parse(fs.readFileSync(".cache/hf-data/model-cards.json", "utf8"))
console.log(`\n=== Audit: across ${cards.length} model-cards.json entries ===`)

// Count how many cards fall into each branch of normalizeDeveloperName, using
// the same conditions that function checks.
const cardBuckets = { mapHit: 0, titleCase: 0, passthrough: 0 }
for (const c of cards) {
  const raw = c.developer
  const key = raw.trim().toLowerCase()
  let bucket
  if (KNOWN_DEVELOPER_NAMES[key]) bucket = "mapHit"
  else if (raw === raw.toLowerCase() && /^[a-z]/.test(raw)) bucket = "titleCase"
  else bucket = "passthrough"
  cardBuckets[bucket]++
}
console.log(cardBuckets)

// Count how many different raw `developer` strings appear in developers.json.
const distinctDevs = new Set()
for (const { developer } of devs) {
  distinctDevs.add(developer)
}
console.log("\n=== Distinct developer name strings:", distinctDevs.size, "===")