midah committed on
Commit
42ab226
·
1 Parent(s): 0efb0d1

Add filtering, sorting, and grouping to linkages page and implement comprehensive model enrichment pipeline

Browse files
app/analytics/page.tsx CHANGED
@@ -104,15 +104,15 @@ export default async function AnalyticsPage() {
104
  return (
105
  <div className="min-h-screen bg-background">
106
  <div className="container-content section-padding">
107
- <div className="mb-8">
108
- <h1 className="text-4xl font-semibold mb-4">Market Analytics</h1>
109
- <p className="text-text-muted text-lg">
110
  Market structure, concentration, and trends
111
  </p>
112
  </div>
113
 
114
  {/* Key Stats */}
115
- <div className="grid grid-cols-1 md:grid-cols-4 gap-6 mb-12">
116
  <div className="stat-card">
117
  <div className="stat-value">{analytics.totalDeals}</div>
118
  <div className="stat-label">Total Deals</div>
@@ -132,8 +132,8 @@ export default async function AnalyticsPage() {
132
  </div>
133
 
134
  {/* Modality Breakdown */}
135
- <div className="card mb-8">
136
- <h2 className="text-2xl font-semibold mb-6">Deals by Modality</h2>
137
  <div className="space-y-4">
138
  {Object.entries(analytics.modalityCounts)
139
  .sort(([, a], [, b]) => b - a)
@@ -152,9 +152,9 @@ export default async function AnalyticsPage() {
152
  </div>
153
 
154
  {/* Top Buyers */}
155
- <div className="grid md:grid-cols-2 gap-8 mb-8">
156
  <div className="card">
157
- <h2 className="text-2xl font-semibold mb-6">Top Buyers by Spend</h2>
158
  <div className="space-y-4">
159
  {analytics.topBuyers.map((buyer, idx) => (
160
  <div key={buyer.name} className="flex items-center justify-between py-2">
@@ -173,7 +173,7 @@ export default async function AnalyticsPage() {
173
 
174
  {/* Top Providers */}
175
  <div className="card">
176
- <h2 className="text-2xl font-semibold mb-6">Top Providers by Spend</h2>
177
  <div className="space-y-4">
178
  {analytics.topProviders.map((provider, idx) => (
179
  <div key={provider.name} className="flex items-center justify-between py-2">
 
104
  return (
105
  <div className="min-h-screen bg-background">
106
  <div className="container-content section-padding">
107
+ <div className="mb-4">
108
+ <h1 className="text-3xl font-semibold mb-1">Market Analytics</h1>
109
+ <p className="text-text-muted text-sm">
110
  Market structure, concentration, and trends
111
  </p>
112
  </div>
113
 
114
  {/* Key Stats */}
115
+ <div className="grid grid-cols-1 md:grid-cols-4 gap-4 mb-8">
116
  <div className="stat-card">
117
  <div className="stat-value">{analytics.totalDeals}</div>
118
  <div className="stat-label">Total Deals</div>
 
132
  </div>
133
 
134
  {/* Modality Breakdown */}
135
+ <div className="card mb-6">
136
+ <h2 className="text-xl font-semibold mb-4">Deals by Modality</h2>
137
  <div className="space-y-4">
138
  {Object.entries(analytics.modalityCounts)
139
  .sort(([, a], [, b]) => b - a)
 
152
  </div>
153
 
154
  {/* Top Buyers */}
155
+ <div className="grid md:grid-cols-2 gap-4 mb-6">
156
  <div className="card">
157
+ <h2 className="text-xl font-semibold mb-4">Top Buyers by Spend</h2>
158
  <div className="space-y-4">
159
  {analytics.topBuyers.map((buyer, idx) => (
160
  <div key={buyer.name} className="flex items-center justify-between py-2">
 
173
 
174
  {/* Top Providers */}
175
  <div className="card">
176
+ <h2 className="text-xl font-semibold mb-4">Top Providers by Spend</h2>
177
  <div className="space-y-4">
178
  {analytics.topProviders.map((provider, idx) => (
179
  <div key={provider.name} className="flex items-center justify-between py-2">
app/api/models/enrich-web/route.ts ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { NextResponse } from 'next/server'
2
+ import { exec } from 'child_process'
3
+ import { promisify } from 'util'
4
+ import path from 'path'
5
+ import { existsSync } from 'fs'
6
+
7
+ const execAsync = promisify(exec)
8
+
9
+ export const dynamic = 'force-dynamic'
10
+ export const maxDuration = 600 // 10 minutes for web enrichment
11
+
12
/**
 * POST /api/models/enrich-web - Enrich models using web search and LLM extraction
 *
 * Runs `registry/enrich_all_models.py` as a child process and returns its
 * stdout/stderr together with counts parsed from the script's output.
 *
 * Optional query params:
 * - limit: number of models to enrich (default: all); must consist of digits only
 * - no_web: disable web search (default: false)
 * - no_llm: disable LLM extraction (default: false)
 *
 * Responses:
 * - 200: summary object (`success: true`, raw `output`/`errors`, parsed counts when found)
 * - 400: `limit` was supplied but is not a non-negative integer
 * - 500: the script failed, timed out, or another error was thrown
 */
export async function POST(request: Request) {
  try {
    const { searchParams } = new URL(request.url)
    const limit = searchParams.get('limit')
    const noWeb = searchParams.get('no_web') === 'true'
    const noLlm = searchParams.get('no_llm') === 'true'

    // `limit` comes straight from the request URL and is interpolated into a
    // shell command below, so anything other than plain digits is rejected
    // here to rule out shell injection (e.g. `?limit=1;rm -rf .`).
    if (limit && !/^\d+$/.test(limit)) {
      return NextResponse.json(
        {
          success: false,
          error: 'Invalid "limit" parameter: expected a non-negative integer',
          timestamp: new Date().toISOString(),
        },
        { status: 400 }
      )
    }

    // Path to the enrichment script
    const enrichScript = path.join(process.cwd(), 'registry', 'enrich_all_models.py')
    // Interpreter preference: PYTHON_PATH env var, then the project venv, then python3 from PATH
    const venvPython = path.join(process.cwd(), 'venv', 'bin', 'python3')
    const pythonPath = process.env.PYTHON_PATH || (existsSync(venvPython) ? venvPython : 'python3')

    console.log(`Starting web enrichment: limit=${limit || 'all'}, web=${!noWeb}, llm=${!noLlm}`)

    // Build command
    let command = `${pythonPath} ${enrichScript}`
    if (limit) {
      // Safe to interpolate: validated above to contain digits only
      command += ` --limit ${limit}`
    }
    if (noWeb) {
      command += ` --no-web`
    }
    if (noLlm) {
      command += ` --no-llm`
    }

    const { stdout, stderr } = await execAsync(command, {
      cwd: process.cwd(),
      timeout: 540000, // 9 minutes timeout
      env: {
        ...process.env,
        PYTHONUNBUFFERED: '1',
      },
    })

    // Parse output to extract summary
    const summary: {
      success: boolean
      limit?: number
      no_web: boolean
      no_llm: boolean
      output: string
      errors: string
      timestamp: string
      models_enriched?: number
      errors_count?: number
    } = {
      success: true,
      no_web: noWeb,
      no_llm: noLlm,
      output: stdout,
      errors: stderr,
      timestamp: new Date().toISOString(),
    }

    if (limit) {
      summary.limit = parseInt(limit, 10)
    }

    // Try to extract numbers from output
    const enrichedMatch = stdout.match(/Successfully enriched: (\d+)\/(\d+)/)
    const errorsMatch = stdout.match(/Errors: (\d+)/)

    if (enrichedMatch) {
      summary.models_enriched = parseInt(enrichedMatch[1], 10)
    }
    if (errorsMatch) {
      summary.errors_count = parseInt(errorsMatch[1], 10)
    }

    return NextResponse.json(summary)
  } catch (error: unknown) {
    console.error('Web enrichment error:', error)
    // Only Error instances are trusted to carry a usable message
    const message =
      error instanceof Error && error.message
        ? error.message
        : 'Failed to enrich models with web search'
    return NextResponse.json(
      {
        success: false,
        error: message,
        timestamp: new Date().toISOString(),
      },
      { status: 500 }
    )
  }
}
103
+
app/components/Sidebar.tsx CHANGED
@@ -10,37 +10,31 @@ export default function Sidebar() {
10
  {
11
  href: '/',
12
  label: 'Deals',
13
- icon: 'πŸ“Š',
14
  },
15
  {
16
  href: '/timeline',
17
  label: 'Timeline',
18
- icon: 'πŸ“…',
19
  },
20
  {
21
  href: '/models',
22
  label: 'Models',
23
- icon: 'πŸ€–',
24
  },
25
  {
26
  href: '/linkages',
27
  label: 'Linkages',
28
- icon: 'πŸ”—',
29
  },
30
  {
31
  href: '/analytics',
32
  label: 'Analytics',
33
- icon: 'πŸ“ˆ',
34
  },
35
  ]
36
 
37
  return (
38
  <aside className="w-64 bg-surface border-r border-border flex-shrink-0 min-h-screen sticky top-0">
39
- <div className="p-6 border-b border-border">
40
- <h2 className="text-lg font-semibold mb-1">AI Training Data</h2>
41
- <p className="text-xs text-text-muted">Deals Dashboard</p>
42
  </div>
43
- <nav className="p-4">
44
  <ul className="space-y-1">
45
  {navItems.map((item) => {
46
  const isActive = pathname === item.href
@@ -48,14 +42,13 @@ export default function Sidebar() {
48
  <li key={item.href}>
49
  <Link
50
  href={item.href}
51
- className={`flex items-center gap-3 px-3 py-2 rounded-sm text-sm transition-colors ${
52
  isActive
53
  ? 'bg-accent/10 text-accent font-medium'
54
  : 'text-text-muted hover:bg-border-subtle hover:text-text'
55
  }`}
56
  >
57
- <span className="text-base">{item.icon}</span>
58
- <span>{item.label}</span>
59
  </Link>
60
  </li>
61
  )
 
10
  {
11
  href: '/',
12
  label: 'Deals',
 
13
  },
14
  {
15
  href: '/timeline',
16
  label: 'Timeline',
 
17
  },
18
  {
19
  href: '/models',
20
  label: 'Models',
 
21
  },
22
  {
23
  href: '/linkages',
24
  label: 'Linkages',
 
25
  },
26
  {
27
  href: '/analytics',
28
  label: 'Analytics',
 
29
  },
30
  ]
31
 
32
  return (
33
  <aside className="w-64 bg-surface border-r border-border flex-shrink-0 min-h-screen sticky top-0">
34
+ <div className="p-4 border-b border-border">
35
+ <h2 className="text-lg font-semibold">AI Training Data</h2>
 
36
  </div>
37
+ <nav className="p-2">
38
  <ul className="space-y-1">
39
  {navItems.map((item) => {
40
  const isActive = pathname === item.href
 
42
  <li key={item.href}>
43
  <Link
44
  href={item.href}
45
+ className={`px-3 py-2 rounded-none text-sm transition-colors ${
46
  isActive
47
  ? 'bg-accent/10 text-accent font-medium'
48
  : 'text-text-muted hover:bg-border-subtle hover:text-text'
49
  }`}
50
  >
51
+ {item.label}
 
52
  </Link>
53
  </li>
54
  )
app/components/Tooltip.tsx CHANGED
@@ -33,7 +33,7 @@ export default function Tooltip({
33
  {children}
34
  {isVisible && (
35
  <div
36
- className={`absolute z-50 ${positionClasses[position]} w-64 p-3 bg-surface border border-border rounded-sm shadow-lg text-xs text-text leading-relaxed pointer-events-none`}
37
  >
38
  {content}
39
  {/* Arrow */}
 
33
  {children}
34
  {isVisible && (
35
  <div
36
+ className={`absolute z-50 ${positionClasses[position]} w-64 p-3 bg-surface border border-border rounded-none shadow-lg text-xs text-text leading-relaxed pointer-events-none`}
37
  >
38
  {content}
39
  {/* Arrow */}
app/components/deals/DealFeed.tsx CHANGED
@@ -104,17 +104,17 @@ function FeedItemCard({ item }: { item: FeedItem }) {
104
  const isArticle = item.type === 'article' || (!isTwitter && item.url.startsWith('http'))
105
 
106
  return (
107
- <div className="border border-border-subtle rounded-sm p-4 hover:border-border transition-colors">
108
  <div className="flex items-start justify-between gap-3">
109
  <div className="flex-1 min-w-0">
110
  <div className="flex items-center gap-2 mb-2">
111
  {isTwitter && (
112
- <span className="text-[10px] px-1.5 py-0.5 bg-blue-500/20 text-blue-500 rounded-sm font-mono">
113
  TWITTER
114
  </span>
115
  )}
116
  {isArticle && (
117
- <span className="text-[10px] px-1.5 py-0.5 bg-accent/20 text-accent rounded-sm font-mono">
118
  ARTICLE
119
  </span>
120
  )}
@@ -192,7 +192,7 @@ function FeedItemCard({ item }: { item: FeedItem }) {
192
  <div className="mt-3 pt-3 border-t border-border-subtle">
193
  <iframe
194
  src={item.url}
195
- className="w-full h-64 border border-border-subtle rounded-sm"
196
  title={item.title || 'Article preview'}
197
  sandbox="allow-same-origin allow-scripts"
198
  />
 
104
  const isArticle = item.type === 'article' || (!isTwitter && item.url.startsWith('http'))
105
 
106
  return (
107
+ <div className="border border-border-subtle rounded-none p-4 hover:border-border transition-colors">
108
  <div className="flex items-start justify-between gap-3">
109
  <div className="flex-1 min-w-0">
110
  <div className="flex items-center gap-2 mb-2">
111
  {isTwitter && (
112
+ <span className="text-[10px] px-1.5 py-0.5 bg-blue-500/20 text-blue-500 rounded-none font-mono">
113
  TWITTER
114
  </span>
115
  )}
116
  {isArticle && (
117
+ <span className="text-[10px] px-1.5 py-0.5 bg-accent/20 text-accent rounded-none font-mono">
118
  ARTICLE
119
  </span>
120
  )}
 
192
  <div className="mt-3 pt-3 border-t border-border-subtle">
193
  <iframe
194
  src={item.url}
195
+ className="w-full h-64 border border-border-subtle rounded-none"
196
  title={item.title || 'Article preview'}
197
  sandbox="allow-same-origin allow-scripts"
198
  />
app/components/deals/DiscoveryButton.tsx CHANGED
@@ -69,7 +69,7 @@ export default function DiscoveryButton() {
69
  </button>
70
 
71
  {status && (
72
- <div className="absolute top-full right-0 mt-2 z-50 bg-surface border border-border rounded-sm shadow-lg p-3 min-w-[300px]">
73
  <div className="text-xs font-semibold mb-2 text-text">{status}</div>
74
  {results && (
75
  <div className="text-xs text-text-muted space-y-1">
 
69
  </button>
70
 
71
  {status && (
72
+ <div className="absolute top-full right-0 mt-2 z-50 bg-surface border border-border rounded-none shadow-lg p-3 min-w-[300px]">
73
  <div className="text-xs font-semibold mb-2 text-text">{status}</div>
74
  {results && (
75
  <div className="text-xs text-text-muted space-y-1">
app/components/linkages/CreateLinkagesButton.tsx CHANGED
@@ -54,7 +54,7 @@ export default function CreateLinkagesButton() {
54
  </button>
55
 
56
  {status && (
57
- <div className="absolute top-full right-0 mt-2 z-50 bg-surface border border-border rounded-sm shadow-lg p-3 min-w-[300px]">
58
  <div className="text-xs font-semibold mb-2 text-text">{status}</div>
59
  {results && (
60
  <div className="text-xs text-text-muted space-y-1">
 
54
  </button>
55
 
56
  {status && (
57
+ <div className="absolute top-full right-0 mt-2 z-50 bg-surface border border-border rounded-none shadow-lg p-3 min-w-[300px]">
58
  <div className="text-xs font-semibold mb-2 text-text">{status}</div>
59
  {results && (
60
  <div className="text-xs text-text-muted space-y-1">
app/components/models/IngestModelsButton.tsx CHANGED
@@ -78,7 +78,7 @@ export default function IngestModelsButton() {
78
  </button>
79
 
80
  {status && (
81
- <div className="absolute top-full right-0 mt-2 z-50 bg-surface border border-border rounded-sm shadow-lg p-3 min-w-[300px]">
82
  <div className="text-xs font-semibold mb-2 text-text">{status}</div>
83
  {results && (
84
  <div className="text-xs text-text-muted space-y-1">
 
78
  </button>
79
 
80
  {status && (
81
+ <div className="absolute top-full right-0 mt-2 z-50 bg-surface border border-border rounded-none shadow-lg p-3 min-w-[300px]">
82
  <div className="text-xs font-semibold mb-2 text-text">{status}</div>
83
  {results && (
84
  <div className="text-xs text-text-muted space-y-1">
app/components/models/TokenCalculationCard.tsx CHANGED
@@ -52,7 +52,7 @@ export default function TokenCalculationCard({
52
 
53
  <div className="pt-3 border-t border-border-subtle">
54
  <strong className="text-text">Step-by-Step Calculation:</strong>
55
- <div className="mt-2 space-y-2 font-mono text-xs bg-[rgba(139,111,71,0.05)] p-3 rounded-sm">
56
  <div>
57
  <span className="text-text-muted">Min estimate:</span> {paramsAbs.toLocaleString()} params Γ— {ratioMin} = {tokensEstMin ? (tokensEstMin / 1e9).toFixed(1) : 'β€”'}B tokens
58
  </div>
 
52
 
53
  <div className="pt-3 border-t border-border-subtle">
54
  <strong className="text-text">Step-by-Step Calculation:</strong>
55
+ <div className="mt-2 space-y-2 font-mono text-xs bg-[rgba(139,111,71,0.05)] p-3 rounded-none">
56
  <div>
57
  <span className="text-text-muted">Min estimate:</span> {paramsAbs.toLocaleString()} params Γ— {ratioMin} = {tokensEstMin ? (tokensEstMin / 1e9).toFixed(1) : 'β€”'}B tokens
58
  </div>
app/components/models/TokenEstimateTooltip.tsx CHANGED
@@ -45,7 +45,7 @@ export default function TokenEstimateTooltip({
45
 
46
  {isOpen && (
47
  <div
48
- className="absolute z-50 left-0 top-full mt-2 w-80 bg-surface border border-border rounded-sm shadow-lg p-4 text-xs"
49
  onClick={(e) => e.stopPropagation()}
50
  >
51
  <div className="font-semibold text-text mb-2">Token Estimate Calculation</div>
 
45
 
46
  {isOpen && (
47
  <div
48
+ className="absolute z-50 left-0 top-full mt-2 w-80 bg-surface border border-border rounded-none shadow-lg p-4 text-xs"
49
  onClick={(e) => e.stopPropagation()}
50
  >
51
  <div className="font-semibold text-text mb-2">Token Estimate Calculation</div>
app/deals/DealModal.tsx CHANGED
@@ -51,11 +51,11 @@ export default function DealModal({ deal, isOpen, onClose }: DealModalProps) {
51
  onClick={onClose}
52
  >
53
  <div
54
- className="bg-surface rounded-sm shadow-xl max-w-4xl w-full max-h-[90vh] overflow-y-auto"
55
  onClick={(e) => e.stopPropagation()}
56
  >
57
  {/* Header */}
58
- <div className="sticky top-0 bg-surface border-b border-border px-6 py-4 flex items-start justify-between">
59
  <div className="flex-1">
60
  <h2 className="text-2xl font-semibold mb-1">
61
  {deal.provider} β†’ {deal.buyer}
@@ -249,7 +249,7 @@ export default function DealModal({ deal, isOpen, onClose }: DealModalProps) {
249
  <h3 className="text-lg font-semibold mb-4">Pricing Normalizations</h3>
250
  <div className="grid grid-cols-2 md:grid-cols-4 gap-4">
251
  {normalizations.map((norm, idx) => (
252
- <div key={idx} className="border border-border-subtle rounded-sm p-3">
253
  <div className="text-xs text-text-muted mb-1">Per {norm.unitType}</div>
254
  <div className="font-medium">
255
  {norm.normalizedCostPerUnit < 0.001
 
51
  onClick={onClose}
52
  >
53
  <div
54
+ className="bg-surface rounded-none shadow-xl max-w-4xl w-full max-h-[90vh] overflow-y-auto"
55
  onClick={(e) => e.stopPropagation()}
56
  >
57
  {/* Header */}
58
+ <div className="sticky top-0 bg-surface border-b border-border px-4 py-3 flex items-start justify-between">
59
  <div className="flex-1">
60
  <h2 className="text-2xl font-semibold mb-1">
61
  {deal.provider} β†’ {deal.buyer}
 
249
  <h3 className="text-lg font-semibold mb-4">Pricing Normalizations</h3>
250
  <div className="grid grid-cols-2 md:grid-cols-4 gap-4">
251
  {normalizations.map((norm, idx) => (
252
+ <div key={idx} className="border border-border-subtle rounded-none p-3">
253
  <div className="text-xs text-text-muted mb-1">Per {norm.unitType}</div>
254
  <div className="font-medium">
255
  {norm.normalizedCostPerUnit < 0.001
app/deals/DealsClient.tsx CHANGED
@@ -391,7 +391,7 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
391
  <thead>
392
  <tr className="bg-border-subtle">
393
  <th
394
- className="cursor-pointer hover:bg-border select-none px-4 py-3"
395
  onClick={() => handleSort('provider')}
396
  title="Click to sort by provider"
397
  >
@@ -508,7 +508,7 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
508
  onClick={() => toggleGroup(groupKey)}
509
  className="cursor-pointer bg-border-subtle hover:bg-border transition-colors"
510
  >
511
- <td colSpan={7} className="px-4 py-3">
512
  <div className="flex items-center justify-between">
513
  <div className="flex items-center gap-3">
514
  <span className="text-xs text-text-muted">{isExpanded ? 'β–Ό' : 'β–Ά'}</span>
@@ -539,14 +539,14 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
539
  onClick={() => handleDealClick(deal)}
540
  className="cursor-pointer transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]"
541
  >
542
- <td className="px-4 py-3 pl-8">
543
  <div className="flex items-center gap-2">
544
  <div className="font-medium text-accent hover:text-accent-hover">
545
  {deal.provider}
546
  </div>
547
  {deal.discoveredVia === 'exa' && (
548
  <span
549
- className="text-[10px] px-1.5 py-0.5 bg-accent/20 text-accent rounded-sm font-mono"
550
  title={`Source: Exa${deal.exaQuery ? ` (${deal.exaQuery})` : ''}${deal.exaScore ? ` - Score: ${deal.exaScore.toFixed(2)}` : ''}`}
551
  >
552
  ARTICLE
@@ -554,7 +554,7 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
554
  )}
555
  {deal.discoveredVia && deal.discoveredVia !== 'exa' && (
556
  <span
557
- className="text-[10px] px-1.5 py-0.5 bg-border-subtle text-text-muted rounded-sm"
558
  title={`Discovered via ${deal.discoveredVia}`}
559
  >
560
  {deal.discoveredVia.toUpperCase()}
@@ -562,10 +562,10 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
562
  )}
563
  </div>
564
  </td>
565
- <td className="px-4 py-3">
566
  <div className="text-sm">{deal.buyer}</div>
567
  </td>
568
- <td className="px-4 py-3">
569
  <span className="badge badge-secondary text-xs">{deal.modality}</span>
570
  </td>
571
  <td className="px-4 py-3 text-right">
@@ -589,7 +589,7 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
589
  <span className="text-text-muted/40 text-xs">β€”</span>
590
  )}
591
  </td>
592
- <td className="px-4 py-3">
593
  <div className="text-sm text-text-muted/80">{formatDate(deal.date)}</div>
594
  </td>
595
  </tr>
@@ -605,14 +605,14 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
605
  onClick={() => handleDealClick(deal)}
606
  className="cursor-pointer transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]"
607
  >
608
- <td className="px-4 py-3">
609
  <div className="flex items-center gap-2">
610
  <div className="font-medium text-accent hover:text-accent-hover">
611
  {deal.provider}
612
  </div>
613
  {deal.discoveredVia === 'exa' && (
614
  <span
615
- className="text-[10px] px-1.5 py-0.5 bg-accent/20 text-accent rounded-sm font-mono"
616
  title={`Source: Exa${deal.exaQuery ? ` (${deal.exaQuery})` : ''}${deal.exaScore ? ` - Score: ${deal.exaScore.toFixed(2)}` : ''}`}
617
  >
618
  ARTICLE
@@ -620,7 +620,7 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
620
  )}
621
  {deal.discoveredVia && deal.discoveredVia !== 'exa' && (
622
  <span
623
- className="text-[10px] px-1.5 py-0.5 bg-border-subtle text-text-muted rounded-sm"
624
  title={`Discovered via ${deal.discoveredVia}`}
625
  >
626
  {deal.discoveredVia.toUpperCase()}
@@ -628,10 +628,10 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
628
  )}
629
  </div>
630
  </td>
631
- <td className="px-4 py-3">
632
  <div className="text-sm">{deal.buyer}</div>
633
  </td>
634
- <td className="px-4 py-3">
635
  <span className="badge badge-secondary text-xs">{deal.modality}</span>
636
  </td>
637
  <td className="px-4 py-3 text-right">
@@ -655,7 +655,7 @@ export default function DealsClient({ initialDeals }: DealsClientProps) {
655
  <span className="text-text-muted/40 text-xs">β€”</span>
656
  )}
657
  </td>
658
- <td className="px-4 py-3">
659
  <div className="text-sm text-text-muted/80">{formatDate(deal.date)}</div>
660
  </td>
661
  </tr>
@@ -712,7 +712,7 @@ function PriceCellWithTooltip({ deal }: { deal: Deal }) {
712
  </div>
713
  </div>
714
  {showTooltip && (
715
- <div className="absolute right-0 top-full mt-2 z-50 bg-surface border border-border rounded-sm shadow-lg p-3 min-w-[280px]">
716
  <div className="text-xs font-semibold mb-2 text-text">Normalized Pricing</div>
717
  <div className="space-y-2">
718
  {normalizations.map((norm, idx) => (
 
391
  <thead>
392
  <tr className="bg-border-subtle">
393
  <th
394
+ className="cursor-pointer hover:bg-border select-none"
395
  onClick={() => handleSort('provider')}
396
  title="Click to sort by provider"
397
  >
 
508
  onClick={() => toggleGroup(groupKey)}
509
  className="cursor-pointer bg-border-subtle hover:bg-border transition-colors"
510
  >
511
+ <td colSpan={7}>
512
  <div className="flex items-center justify-between">
513
  <div className="flex items-center gap-3">
514
  <span className="text-xs text-text-muted">{isExpanded ? 'β–Ό' : 'β–Ά'}</span>
 
539
  onClick={() => handleDealClick(deal)}
540
  className="cursor-pointer transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]"
541
  >
542
+ <td className="pl-6">
543
  <div className="flex items-center gap-2">
544
  <div className="font-medium text-accent hover:text-accent-hover">
545
  {deal.provider}
546
  </div>
547
  {deal.discoveredVia === 'exa' && (
548
  <span
549
+ className="text-[10px] px-1.5 py-0.5 bg-accent/20 text-accent rounded-none font-mono"
550
  title={`Source: Exa${deal.exaQuery ? ` (${deal.exaQuery})` : ''}${deal.exaScore ? ` - Score: ${deal.exaScore.toFixed(2)}` : ''}`}
551
  >
552
  ARTICLE
 
554
  )}
555
  {deal.discoveredVia && deal.discoveredVia !== 'exa' && (
556
  <span
557
+ className="text-[10px] px-1.5 py-0.5 bg-border-subtle text-text-muted rounded-none"
558
  title={`Discovered via ${deal.discoveredVia}`}
559
  >
560
  {deal.discoveredVia.toUpperCase()}
 
562
  )}
563
  </div>
564
  </td>
565
+ <td>
566
  <div className="text-sm">{deal.buyer}</div>
567
  </td>
568
+ <td>
569
  <span className="badge badge-secondary text-xs">{deal.modality}</span>
570
  </td>
571
  <td className="px-4 py-3 text-right">
 
589
  <span className="text-text-muted/40 text-xs">β€”</span>
590
  )}
591
  </td>
592
+ <td>
593
  <div className="text-sm text-text-muted/80">{formatDate(deal.date)}</div>
594
  </td>
595
  </tr>
 
605
  onClick={() => handleDealClick(deal)}
606
  className="cursor-pointer transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]"
607
  >
608
+ <td>
609
  <div className="flex items-center gap-2">
610
  <div className="font-medium text-accent hover:text-accent-hover">
611
  {deal.provider}
612
  </div>
613
  {deal.discoveredVia === 'exa' && (
614
  <span
615
+ className="text-[10px] px-1.5 py-0.5 bg-accent/20 text-accent rounded-none font-mono"
616
  title={`Source: Exa${deal.exaQuery ? ` (${deal.exaQuery})` : ''}${deal.exaScore ? ` - Score: ${deal.exaScore.toFixed(2)}` : ''}`}
617
  >
618
  ARTICLE
 
620
  )}
621
  {deal.discoveredVia && deal.discoveredVia !== 'exa' && (
622
  <span
623
+ className="text-[10px] px-1.5 py-0.5 bg-border-subtle text-text-muted rounded-none"
624
  title={`Discovered via ${deal.discoveredVia}`}
625
  >
626
  {deal.discoveredVia.toUpperCase()}
 
628
  )}
629
  </div>
630
  </td>
631
+ <td>
632
  <div className="text-sm">{deal.buyer}</div>
633
  </td>
634
+ <td>
635
  <span className="badge badge-secondary text-xs">{deal.modality}</span>
636
  </td>
637
  <td className="px-4 py-3 text-right">
 
655
  <span className="text-text-muted/40 text-xs">β€”</span>
656
  )}
657
  </td>
658
+ <td>
659
  <div className="text-sm text-text-muted/80">{formatDate(deal.date)}</div>
660
  </td>
661
  </tr>
 
712
  </div>
713
  </div>
714
  {showTooltip && (
715
+ <div className="absolute right-0 top-full mt-2 z-50 bg-surface border border-border rounded-none shadow-lg p-3 min-w-[280px]">
716
  <div className="text-xs font-semibold mb-2 text-text">Normalized Pricing</div>
717
  <div className="space-y-2">
718
  {normalizations.map((norm, idx) => (
app/deals/[id]/page.tsx CHANGED
@@ -62,13 +62,13 @@ export default async function DealDetailPage({
62
 
63
  <div className="max-w-4xl">
64
  {/* Header Card */}
65
- <div className="card mb-8">
66
- <div className="flex items-start justify-between mb-6">
67
  <div>
68
- <h1 className="text-3xl font-semibold mb-2">
69
  {deal.provider} β†’ {deal.buyer}
70
  </h1>
71
- <p className="text-text-muted">{deal.dataType}</p>
72
  </div>
73
  <div className="flex gap-2">
74
  <span className="badge badge-secondary">{deal.modality}</span>
@@ -78,7 +78,7 @@ export default async function DealDetailPage({
78
  </div>
79
  </div>
80
 
81
- <div className="grid grid-cols-2 md:grid-cols-4 gap-4 pt-6 border-t border-border">
82
  <div>
83
  <div className="text-sm text-text-muted mb-1">Price</div>
84
  <div className="text-xl font-semibold">{formatPrice(deal)}</div>
@@ -115,16 +115,16 @@ export default async function DealDetailPage({
115
 
116
  {/* Reported Terms */}
117
  {deal.reportedTerms && (
118
- <div className="card mb-8">
119
- <h2 className="text-xl font-semibold mb-4">Reported Terms</h2>
120
- <p className="text-lg leading-relaxed">{deal.reportedTerms}</p>
121
  </div>
122
  )}
123
 
124
  {/* Deal Details & Compensation */}
125
- <div className="grid md:grid-cols-2 gap-6 mb-8">
126
  <div className="card">
127
- <h3 className="text-lg font-semibold mb-4">Deal Details</h3>
128
  <div className="space-y-2">
129
  <div className="flex justify-between">
130
  <span className="text-text-muted">Deal Type</span>
@@ -158,7 +158,7 @@ export default async function DealDetailPage({
158
  </div>
159
 
160
  <div className="card">
161
- <h3 className="text-lg font-semibold mb-4">Creator Compensation</h3>
162
  <div className="space-y-2">
163
  <div className="flex justify-between">
164
  <span className="text-text-muted">Compensated</span>
@@ -190,15 +190,15 @@ export default async function DealDetailPage({
190
 
191
  {/* Notes */}
192
  {deal.notes && (
193
- <div className="card mb-8">
194
- <h3 className="text-lg font-semibold mb-4">Notes</h3>
195
- <p className="leading-relaxed">{deal.notes}</p>
196
  </div>
197
  )}
198
 
199
  {/* Sources - Hyperlinked */}
200
- <div className="card mb-8">
201
- <h3 className="text-lg font-semibold mb-4">Sources</h3>
202
  <div className="space-y-2">
203
  {deal.sourcePrimary && (
204
  <div className="text-text-muted">
 
62
 
63
  <div className="max-w-4xl">
64
  {/* Header Card */}
65
+ <div className="card mb-6">
66
+ <div className="flex items-start justify-between mb-4">
67
  <div>
68
+ <h1 className="text-2xl font-semibold mb-1">
69
  {deal.provider} β†’ {deal.buyer}
70
  </h1>
71
+ <p className="text-text-muted text-sm">{deal.dataType}</p>
72
  </div>
73
  <div className="flex gap-2">
74
  <span className="badge badge-secondary">{deal.modality}</span>
 
78
  </div>
79
  </div>
80
 
81
+ <div className="grid grid-cols-2 md:grid-cols-4 gap-4 pt-4 border-t border-border">
82
  <div>
83
  <div className="text-sm text-text-muted mb-1">Price</div>
84
  <div className="text-xl font-semibold">{formatPrice(deal)}</div>
 
115
 
116
  {/* Reported Terms */}
117
  {deal.reportedTerms && (
118
+ <div className="card mb-6">
119
+ <h2 className="text-lg font-semibold mb-3">Reported Terms</h2>
120
+ <p className="text-sm leading-relaxed">{deal.reportedTerms}</p>
121
  </div>
122
  )}
123
 
124
  {/* Deal Details & Compensation */}
125
+ <div className="grid md:grid-cols-2 gap-4 mb-6">
126
  <div className="card">
127
+ <h3 className="text-base font-semibold mb-3">Deal Details</h3>
128
  <div className="space-y-2">
129
  <div className="flex justify-between">
130
  <span className="text-text-muted">Deal Type</span>
 
158
  </div>
159
 
160
  <div className="card">
161
+ <h3 className="text-base font-semibold mb-3">Creator Compensation</h3>
162
  <div className="space-y-2">
163
  <div className="flex justify-between">
164
  <span className="text-text-muted">Compensated</span>
 
190
 
191
  {/* Notes */}
192
  {deal.notes && (
193
+ <div className="card mb-6">
194
+ <h3 className="text-base font-semibold mb-3">Notes</h3>
195
+ <p className="text-sm leading-relaxed">{deal.notes}</p>
196
  </div>
197
  )}
198
 
199
  {/* Sources - Hyperlinked */}
200
+ <div className="card mb-6">
201
+ <h3 className="text-base font-semibold mb-3">Sources</h3>
202
  <div className="space-y-2">
203
  {deal.sourcePrimary && (
204
  <div className="text-text-muted">
app/globals.css CHANGED
@@ -120,21 +120,21 @@
120
  }
121
 
122
  @layer components {
123
- /* Container - Max Width with Generous Padding */
124
  .container-content {
125
- @apply px-6;
126
  max-width: var(--max-width-content);
127
  margin: 0 auto;
128
  }
129
 
130
  .container-narrow {
131
- @apply mx-auto px-6;
132
  max-width: var(--max-width-narrow);
133
  }
134
 
135
- /* Cards - Minimalist, Sharp */
136
  .card {
137
- @apply bg-surface border border-border rounded-sm p-6;
138
  box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.05);
139
  }
140
 
@@ -147,9 +147,9 @@
147
  transform: translateY(-1px);
148
  }
149
 
150
- /* Buttons - Minimalist, Sharp */
151
  .btn {
152
- @apply px-4 py-2 rounded-sm font-medium transition-all duration-200;
153
  @apply focus:outline-none focus:ring-2 focus:ring-offset-2;
154
  }
155
 
@@ -177,11 +177,11 @@
177
 
178
  .table th {
179
  @apply text-left font-semibold text-xs text-text-muted uppercase tracking-wide;
180
- @apply select-none;
181
  }
182
 
183
  .table td {
184
- @apply border-b border-border-subtle;
185
  }
186
 
187
  .table tbody tr:hover {
@@ -192,9 +192,9 @@
192
  @apply border-b-0;
193
  }
194
 
195
- /* Badges/Tags - Subtle, Sharp */
196
  .badge {
197
- @apply inline-flex items-center px-2.5 py-0.5 rounded-sm text-xs font-medium;
198
  }
199
 
200
  .badge-primary {
@@ -206,9 +206,9 @@
206
  @apply badge bg-border text-text-muted;
207
  }
208
 
209
- /* Inputs - Clean, Sharp */
210
  .input {
211
- @apply w-full px-4 py-2 border border-border rounded-sm;
212
  @apply bg-surface text-text;
213
  @apply focus:outline-none focus:ring-2 focus:ring-accent focus:border-transparent;
214
  @apply transition-all duration-200;
@@ -234,9 +234,9 @@
234
  text-wrap: balance;
235
  }
236
 
237
- /* Spacing Utilities */
238
  .section-padding {
239
- @apply py-12 md:py-16 lg:py-20;
240
  }
241
 
242
  /* Modal utilities */
@@ -260,7 +260,7 @@
260
 
261
  .overflow-x-auto::-webkit-scrollbar-thumb {
262
  background-color: var(--color-border);
263
- border-radius: 3px;
264
  }
265
 
266
  .overflow-x-auto::-webkit-scrollbar-thumb:hover {
@@ -271,7 +271,7 @@
271
  .timeline-container {
272
  max-width: 1200px;
273
  margin: 0 auto;
274
- padding: 50px;
275
  }
276
 
277
  .timeline-header {
@@ -377,11 +377,11 @@
377
 
378
  /* Progress bar - container and fill using CSS custom properties */
379
  .progress-bar-container {
380
- @apply w-full bg-border-subtle rounded-sm h-1.5 relative overflow-hidden;
381
  }
382
 
383
  .progress-bar-fill {
384
- @apply bg-accent h-1.5 rounded-sm absolute top-0 left-0 transition-all;
385
  width: calc(var(--progress-percentage, 0) * 1%);
386
  }
387
 
 
120
  }
121
 
122
  @layer components {
123
+ /* Container - Max Width with Reduced Padding */
124
  .container-content {
125
+ @apply px-2;
126
  max-width: var(--max-width-content);
127
  margin: 0 auto;
128
  }
129
 
130
  .container-narrow {
131
+ @apply mx-auto px-2;
132
  max-width: var(--max-width-narrow);
133
  }
134
 
135
+ /* Cards - Minimalist, Sharp, No Rounded Edges */
136
  .card {
137
+ @apply bg-surface border border-border rounded-none p-4;
138
  box-shadow: 0 1px 3px 0 rgba(0, 0, 0, 0.05);
139
  }
140
 
 
147
  transform: translateY(-1px);
148
  }
149
 
150
+ /* Buttons - Minimalist, Sharp, No Rounded Edges */
151
  .btn {
152
+ @apply px-4 py-2 rounded-none font-medium transition-all duration-200;
153
  @apply focus:outline-none focus:ring-2 focus:ring-offset-2;
154
  }
155
 
 
177
 
178
  .table th {
179
  @apply text-left font-semibold text-xs text-text-muted uppercase tracking-wide;
180
+ @apply select-none px-3 py-2;
181
  }
182
 
183
  .table td {
184
+ @apply border-b border-border-subtle px-3 py-2;
185
  }
186
 
187
  .table tbody tr:hover {
 
192
  @apply border-b-0;
193
  }
194
 
195
+ /* Badges/Tags - Subtle, Sharp, No Rounded Edges */
196
  .badge {
197
+ @apply inline-flex items-center px-2.5 py-0.5 rounded-none text-xs font-medium;
198
  }
199
 
200
  .badge-primary {
 
206
  @apply badge bg-border text-text-muted;
207
  }
208
 
209
+ /* Inputs - Clean, Sharp, No Rounded Edges */
210
  .input {
211
+ @apply w-full px-4 py-2 border border-border rounded-none;
212
  @apply bg-surface text-text;
213
  @apply focus:outline-none focus:ring-2 focus:ring-accent focus:border-transparent;
214
  @apply transition-all duration-200;
 
234
  text-wrap: balance;
235
  }
236
 
237
+ /* Spacing Utilities - Reduced Padding */
238
  .section-padding {
239
+ @apply py-4 md:py-6;
240
  }
241
 
242
  /* Modal utilities */
 
260
 
261
  .overflow-x-auto::-webkit-scrollbar-thumb {
262
  background-color: var(--color-border);
263
+ border-radius: 0;
264
  }
265
 
266
  .overflow-x-auto::-webkit-scrollbar-thumb:hover {
 
271
  .timeline-container {
272
  max-width: 1200px;
273
  margin: 0 auto;
274
+ padding: 16px 8px;
275
  }
276
 
277
  .timeline-header {
 
377
 
378
  /* Progress bar - container and fill using CSS custom properties */
379
  .progress-bar-container {
380
+ @apply w-full bg-border-subtle rounded-none h-1.5 relative overflow-hidden;
381
  }
382
 
383
  .progress-bar-fill {
384
+ @apply bg-accent h-1.5 rounded-none absolute top-0 left-0 transition-all;
385
  width: calc(var(--progress-percentage, 0) * 1%);
386
  }
387
 
app/linkages/LinkagesClient.tsx ADDED
@@ -0,0 +1,622 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 'use client'
2
+
3
+ import { useState, useEffect } from 'react'
4
+ import React from 'react'
5
+ import Link from 'next/link'
6
+ import { formatDate } from '@/lib/utils'
7
+ import Tooltip from '@/app/components/Tooltip'
8
+
9
/**
 * Deal-side fields carried on a linkage.
 *
 * `buyer` may hold several names separated by commas; the client component
 * splits on commas when it builds its buyer filter options. `date` is kept as
 * a string and is compared as a string when rows are sorted by date.
 */
interface LinkageDeal {
  id: string
  provider: string
  buyer: string
  modality: string
  priceUsd: number | null
  date: string | null
}

/** Model-side fields carried on a linkage. */
interface LinkageModel {
  id: string
  modelId: string
  provider: string
  family: string | null
  tokensEstMid: number | null
}

/**
 * One deal-to-model linkage as consumed by the linkages table.
 *
 * The table gives `linkageType` values `temporal_overlap` and `inferred`
 * friendly labels, and ranks `linkageStrength` values `high`, `medium` and
 * `low`; any other value is shown or ranked with a fallback.
 */
interface Linkage {
  id: string
  linkageType: string
  linkageStrength: string
  impactInference: string | null
  deal: LinkageDeal
  model: LinkageModel
}
30
+
31
/** Props for the linkages table: the rows the component starts with. */
interface LinkagesClientProps {
  initialLinkages: Array<Linkage>
}
34
+
35
/**
 * Formats a token count as a short string with a magnitude suffix.
 *
 * Suffixes: `P` (1e15), `T` (1e12) and `B` (1e9) with one decimal place,
 * `M` (1e6) and `K` (1e3) with no decimals, and a plain rounded integer below
 * one thousand.
 *
 * @param value Token count; may be missing.
 * @returns The formatted count, or an em dash when the value is missing,
 *          zero, negative or not a finite number.
 */
function formatTokens(value: number | null | undefined): string {
  // Anything that is not a positive finite count has no meaningful display.
  if (value == null || !Number.isFinite(value) || value <= 0) return '—'
  if (value >= 1e15) return `${(value / 1e15).toFixed(1)}P`
  if (value >= 1e12) return `${(value / 1e12).toFixed(1)}T`
  if (value >= 1e9) return `${(value / 1e9).toFixed(1)}B`
  if (value >= 1e6) return `${(value / 1e6).toFixed(0)}M`
  // Counts under a million used to collapse to "0M"/"1M"; give them their own scale.
  if (value >= 1e3) return `${(value / 1e3).toFixed(0)}K`
  return String(Math.round(value))
}
42
+
43
/** Dropdown filter state; an empty string means "no filter" for that field. */
type LinkageFilters = {
  linkageType: string
  linkageStrength: string
  provider: string
  buyer: string
  modelProvider: string
  modality: string
}

const EMPTY_FILTERS: LinkageFilters = {
  linkageType: '',
  linkageStrength: '',
  provider: '',
  buyer: '',
  modelProvider: '',
  modality: '',
}

/** Ordering of confidence levels; anything not listed ranks as 0. */
const STRENGTH_RANK = new Map<string, number>([
  ['high', 3],
  ['medium', 2],
  ['low', 1],
])

/** Sortable table columns, in display order. */
const SORTABLE_COLUMNS = [
  {
    key: 'deal',
    label: 'Deal',
    sortTitle: 'Click to sort by deal',
    tooltip:
      'The training data deal, showing the data provider (who owns the data) and the buyer (the AI company licensing it).',
    hint: 'Data provider → Buyer',
  },
  {
    key: 'model',
    label: 'Model',
    sortTitle: 'Click to sort by model',
    tooltip:
      'The AI model that may have been trained using data from this deal. Linkages are inferred based on company matches and timing.',
    hint: 'AI model that may have used this data',
  },
  {
    key: 'linkageType',
    label: 'Connection Type',
    sortTitle: 'Click to sort by connection type',
    tooltip:
      "The type of connection: 'Same Time Period' (deal and model within 1 year), 'Same Company' (buyer matches model provider), or 'Explicit' (directly stated).",
    hint: 'How the link was determined',
  },
  {
    key: 'linkageStrength',
    label: 'Confidence',
    sortTitle: 'Click to sort by confidence',
    tooltip:
      'The confidence level in the linkage: High (strong evidence like temporal overlap), Medium (moderate evidence), or Low (weak evidence).',
    hint: 'How certain we are',
  },
]

/** Total number of table columns (sortable ones plus "What This Means"). */
const COLUMN_COUNT = SORTABLE_COLUMNS.length + 1

/** Choices offered by the "Group by" selector. */
const GROUP_BY_OPTIONS = [
  { value: '', label: 'None' },
  { value: 'linkageType', label: 'Connection Type' },
  { value: 'linkageStrength', label: 'Confidence' },
  { value: 'provider', label: 'Data Provider' },
  { value: 'buyer', label: 'Buyer' },
  { value: 'modelProvider', label: 'Model Provider' },
  { value: 'modality', label: 'Modality' },
]

/** Numeric rank of a confidence level, case-insensitive; 0 when unrecognised. */
function strengthRank(strength: string): number {
  return strength ? STRENGTH_RANK.get(strength.toLowerCase()) ?? 0 : 0
}

/** Friendly label for a linkage type; unknown types are shown as-is, empty ones as `fallback`. */
function linkageTypeLabel(type: string, fallback = ''): string {
  if (type === 'temporal_overlap') return 'Same Time Period'
  if (type === 'inferred') return 'Same Company'
  return type || fallback
}

/** One-line explanation shown under the linkage type badge. */
function linkageTypeHint(type: string): string {
  if (type === 'temporal_overlap') return 'Deal & model within 1 year'
  if (type === 'inferred') return 'Buyer matches model provider'
  return ''
}

/** Display text for a confidence level. */
function strengthLabel(strength: string): string {
  if (strength === 'high') return 'High'
  if (strength === 'medium') return 'Medium'
  if (strength === 'low') return 'Low'
  return '—'
}

/** Distinct non-empty values in default sort order, for filter dropdowns. */
function uniqueSorted(values: string[]): string[] {
  return Array.from(new Set(values)).filter(Boolean).sort()
}

/** True when a linkage passes every active dropdown filter and the (lower-cased) search query. */
function matchesFilters(linkage: Linkage, filters: LinkageFilters, query: string): boolean {
  const { deal, model } = linkage
  const contains = (haystack: string, needle: string) =>
    haystack.toLowerCase().includes(needle.toLowerCase())

  if (filters.linkageType && linkage.linkageType !== filters.linkageType) return false
  if (filters.linkageStrength && linkage.linkageStrength !== filters.linkageStrength) return false
  if (filters.provider && !contains(deal.provider, filters.provider)) return false
  if (filters.buyer && !contains(deal.buyer, filters.buyer)) return false
  if (filters.modelProvider && !contains(model.provider, filters.modelProvider)) return false
  if (filters.modality && deal.modality !== filters.modality) return false

  if (!query) return true
  return [deal.provider, deal.buyer, model.modelId, model.provider, deal.modality].some((field) =>
    contains(field, query)
  )
}

/** Ascending comparison of two linkages on the given sort column. */
function compareLinkages(a: Linkage, b: Linkage, column: string): number {
  switch (column) {
    case 'deal':
      return a.deal.provider.localeCompare(b.deal.provider)
    case 'model':
      return a.model.modelId.localeCompare(b.model.modelId)
    case 'linkageType':
      return a.linkageType.localeCompare(b.linkageType)
    case 'linkageStrength':
      return strengthRank(a.linkageStrength) - strengthRank(b.linkageStrength)
    case 'date':
      return (a.deal.date || '').localeCompare(b.deal.date || '')
    default:
      return 0
  }
}

/** Group heading a linkage belongs under for the chosen grouping field. */
function groupKeyFor(linkage: Linkage, groupByField: string): string {
  switch (groupByField) {
    case 'linkageType':
      return linkageTypeLabel(linkage.linkageType, 'Unknown')
    case 'linkageStrength':
      return linkage.linkageStrength || 'Unknown'
    case 'provider':
      return linkage.deal.provider || 'Unknown'
    case 'buyer':
      return linkage.deal.buyer || 'Unknown'
    case 'modelProvider':
      return linkage.model.provider || 'Unknown'
    case 'modality':
      return linkage.deal.modality || 'Unknown'
    default:
      return 'All'
  }
}

/** Buckets linkages by group heading, keeping the incoming row order inside each bucket. */
function groupLinkagesBy(rows: Linkage[], groupByField: string): Map<string, Linkage[]> {
  const groups = new Map<string, Linkage[]>()
  if (!groupByField) {
    groups.set('All', rows)
    return groups
  }
  for (const row of rows) {
    const key = groupKeyFor(row, groupByField)
    const bucket = groups.get(key)
    if (bucket) {
      bucket.push(row)
    } else {
      groups.set(key, [row])
    }
  }
  return groups
}

/** A single table row; `indented` pads the first cell when shown under a group heading. */
function LinkageRow({ linkage, indented = false }: { linkage: Linkage; indented?: boolean }) {
  const { deal, model } = linkage
  // Explicit numeric test: `0 && <div/>` would make React print a stray "0".
  const hasTokenEstimate = (model.tokensEstMid ?? 0) > 0

  return (
    <tr className="transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]">
      <td className={indented ? 'pl-6' : undefined}>
        <Link href={`/deals/${deal.id}`} className="font-medium text-accent hover:text-accent-hover">
          {deal.provider} → {deal.buyer}
        </Link>
        <div className="text-xs text-text-muted mt-0.5">
          {deal.modality} • {deal.date ? formatDate(deal.date) : '—'}
        </div>
      </td>
      <td>
        <Link href={`/models/${model.id}`} className="font-medium text-accent hover:text-accent-hover">
          {model.modelId}
        </Link>
        {model.family && (
          <div className="text-xs text-text-muted mt-0.5">
            {model.family} • {model.provider}
          </div>
        )}
        {hasTokenEstimate && (
          <div className="text-xs text-text-muted mt-0.5">
            {formatTokens(model.tokensEstMid)} tokens
          </div>
        )}
      </td>
      <td>
        <div className="flex flex-col gap-1">
          <span className="badge badge-secondary text-xs">
            {linkageTypeLabel(linkage.linkageType, '—')}
          </span>
          <div className="text-xs text-text-muted/70">{linkageTypeHint(linkage.linkageType)}</div>
        </div>
      </td>
      <td>
        <span
          className={`badge ${
            linkage.linkageStrength === 'high' ? 'badge-primary' : 'badge-secondary'
          } text-xs`}
        >
          {strengthLabel(linkage.linkageStrength)}
        </span>
      </td>
      <td>
        <div className="text-sm text-text leading-relaxed">{linkage.impactInference || '—'}</div>
      </td>
    </tr>
  )
}

/**
 * Client-side table of deal-to-model linkages with free-text search, dropdown
 * filters, column sorting and optional collapsible grouping.
 *
 * Groups are expanded by default. The component tracks the set of *collapsed*
 * group headings, so newly appearing groups start open and "Collapse All"
 * stays collapsed (no effect re-expands them).
 *
 * @param initialLinkages Rows to display. Entries without a deal or a model
 *        are ignored for filtering and display but still count towards the
 *        "of N linkages" total.
 */
export default function LinkagesClient({ initialLinkages }: LinkagesClientProps) {
  const [linkages] = useState<Linkage[]>(initialLinkages)
  const [filters, setFilters] = useState<LinkageFilters>(EMPTY_FILTERS)
  const [searchQuery, setSearchQuery] = useState('')
  const [sortBy, setSortBy] = useState<{ column: string; direction: 'asc' | 'desc' }>({
    column: 'linkageStrength',
    direction: 'desc',
  })
  const [groupBy, setGroupBy] = useState<string>('')
  const [collapsedGroups, setCollapsedGroups] = useState<Set<string>>(new Set())

  // Drop malformed rows once so filtering and option extraction agree.
  const validLinkages = linkages.filter((l) => Boolean(l && l.deal && l.model))

  const query = searchQuery.trim().toLowerCase()
  const filteredLinkages = validLinkages.filter((l) => matchesFilters(l, filters, query))

  const sortedLinkages = [...filteredLinkages].sort((a, b) => {
    const comparison = compareLinkages(a, b, sortBy.column)
    return sortBy.direction === 'asc' ? comparison : -comparison
  })

  const filterFields: Array<{
    key: keyof LinkageFilters
    label: string
    options: string[]
    optionLabel?: (value: string) => string
  }> = [
    {
      key: 'linkageType',
      label: 'Connection Type',
      options: uniqueSorted(validLinkages.map((l) => l.linkageType)),
      optionLabel: (value) => linkageTypeLabel(value),
    },
    {
      key: 'linkageStrength',
      label: 'Confidence',
      options: uniqueSorted(validLinkages.map((l) => l.linkageStrength)),
      optionLabel: (value) => value.charAt(0).toUpperCase() + value.slice(1),
    },
    {
      key: 'provider',
      label: 'Data Provider',
      options: uniqueSorted(validLinkages.map((l) => l.deal.provider)),
    },
    {
      key: 'buyer',
      label: 'Buyer',
      // A deal may list several buyers separated by commas; offer each one.
      options: uniqueSorted(
        validLinkages.flatMap((l) => l.deal.buyer.split(',').map((b) => b.trim()))
      ),
    },
    {
      key: 'modelProvider',
      label: 'Model Provider',
      options: uniqueSorted(validLinkages.map((l) => l.model.provider)),
    },
    {
      key: 'modality',
      label: 'Modality',
      options: uniqueSorted(validLinkages.map((l) => l.deal.modality)),
    },
  ]

  const groupedLinkages = groupLinkagesBy(sortedLinkages, groupBy)
  const groupKeys = Array.from(groupedLinkages.keys()).sort((a, b) =>
    // Confidence groups read best strongest-first; everything else alphabetically.
    groupBy === 'linkageStrength' ? strengthRank(b) - strengthRank(a) : a.localeCompare(b)
  )

  const toggleGroup = (groupKey: string) => {
    setCollapsedGroups((prev) => {
      const next = new Set(prev)
      if (next.has(groupKey)) {
        next.delete(groupKey)
      } else {
        next.add(groupKey)
      }
      return next
    })
  }

  const handleSort = (column: string) => {
    setSortBy((prev) => ({
      column,
      direction: prev.column === column && prev.direction === 'asc' ? 'desc' : 'asc',
    }))
  }

  const getSortIndicator = (column: string) => {
    if (sortBy.column !== column) return null
    return sortBy.direction === 'asc' ? '↑' : '↓'
  }

  return (
    <>
      {/* Search and Filters */}
      <div className="card mb-6 p-4">
        <div className="mb-4">
          <input
            type="text"
            placeholder="Search linkages..."
            value={searchQuery}
            onChange={(e) => setSearchQuery(e.target.value)}
            className="input w-full text-sm"
          />
        </div>

        <div className="grid grid-cols-2 md:grid-cols-6 gap-3">
          {filterFields.map(({ key, label, options, optionLabel }) => (
            <div key={key}>
              <label
                htmlFor={`linkage-filter-${key}`}
                className="block text-xs font-medium text-text-muted mb-1.5 uppercase tracking-wide"
              >
                {label}
              </label>
              <select
                id={`linkage-filter-${key}`}
                value={filters[key]}
                onChange={(e) => {
                  const value = e.target.value
                  // Functional update so rapid changes never overwrite each other.
                  setFilters((prev) => ({ ...prev, [key]: value }))
                }}
                className="input text-sm py-1.5"
              >
                <option value="">All</option>
                {options.map((option) => (
                  <option key={option} value={option}>
                    {optionLabel ? optionLabel(option) : option}
                  </option>
                ))}
              </select>
            </div>
          ))}
        </div>
      </div>

      {/* Grouping and Results Count */}
      <div className="mb-3 flex items-center justify-between flex-wrap gap-3">
        <div className="flex items-center gap-3">
          <div className="flex items-center gap-2">
            <label
              htmlFor="linkage-group-by"
              className="text-xs font-medium text-text-muted uppercase tracking-wide"
            >
              Group by:
            </label>
            <select
              id="linkage-group-by"
              value={groupBy}
              onChange={(e) => {
                setGroupBy(e.target.value)
                // A new grouping starts with every group expanded.
                setCollapsedGroups(new Set())
              }}
              className="input text-sm py-1.5"
            >
              {GROUP_BY_OPTIONS.map(({ value, label }) => (
                <option key={value} value={value}>
                  {label}
                </option>
              ))}
            </select>
          </div>
          {groupBy && (
            <>
              <button
                type="button"
                onClick={() => setCollapsedGroups(new Set())}
                className="text-xs text-accent hover:text-accent-hover"
              >
                Expand All
              </button>
              <button
                type="button"
                onClick={() => setCollapsedGroups(new Set(groupKeys))}
                className="text-xs text-accent hover:text-accent-hover"
              >
                Collapse All
              </button>
            </>
          )}
        </div>
        <div className="text-sm text-text-muted">
          Showing <span className="font-medium text-text">{sortedLinkages.length}</span> of <span className="font-medium text-text">{linkages.length}</span> linkages
        </div>
      </div>

      {/* Results */}
      <div className="card overflow-hidden p-0">
        <div className="overflow-x-auto">
          <table className="table text-sm">
            <thead>
              <tr className="bg-border-subtle">
                {SORTABLE_COLUMNS.map(({ key, label, sortTitle, tooltip, hint }) => {
                  const indicator = getSortIndicator(key)
                  return (
                    <th
                      key={key}
                      className="cursor-pointer hover:bg-border select-none"
                      onClick={() => handleSort(key)}
                      title={sortTitle}
                      aria-sort={
                        sortBy.column !== key
                          ? 'none'
                          : sortBy.direction === 'asc'
                            ? 'ascending'
                            : 'descending'
                      }
                    >
                      <div className="flex items-center gap-2">
                        <Tooltip content={tooltip}>
                          <span className="underline decoration-dotted cursor-help">{label}</span>
                        </Tooltip>
                        {indicator && <span className="text-text-muted text-xs">{indicator}</span>}
                      </div>
                      <div className="text-xs font-normal text-text-muted mt-0.5">{hint}</div>
                    </th>
                  )
                })}
                <th className="text-left font-semibold">
                  <Tooltip content="An interpretation of what this linkage means - how the deal's data may have impacted the model's training.">
                    <span className="underline decoration-dotted cursor-help">What This Means</span>
                  </Tooltip>
                  <div className="text-xs font-normal text-text-muted mt-0.5">Interpretation of the connection</div>
                </th>
              </tr>
            </thead>
            <tbody>
              {sortedLinkages.length === 0 ? (
                <tr>
                  <td colSpan={COLUMN_COUNT} className="text-center py-12 text-text-muted">
                    No linkages found matching your filters
                  </td>
                </tr>
              ) : groupBy ? (
                // Grouped view
                groupKeys.map((groupKey) => {
                  const rows = groupedLinkages.get(groupKey) ?? []
                  const isExpanded = !collapsedGroups.has(groupKey)

                  return (
                    <React.Fragment key={groupKey}>
                      <tr
                        onClick={() => toggleGroup(groupKey)}
                        className="cursor-pointer bg-border-subtle hover:bg-border transition-colors"
                      >
                        <td colSpan={COLUMN_COUNT}>
                          <div className="flex items-center justify-between">
                            <div className="flex items-center gap-3">
                              <span className="text-xs text-text-muted">{isExpanded ? '▼' : '▶'}</span>
                              <span className="font-semibold text-sm">{groupKey}</span>
                              <span className="text-xs text-text-muted">
                                ({rows.length} {rows.length === 1 ? 'linkage' : 'linkages'})
                              </span>
                            </div>
                          </div>
                        </td>
                      </tr>
                      {isExpanded &&
                        rows.map((linkage) => (
                          <LinkageRow key={linkage.id} linkage={linkage} indented />
                        ))}
                    </React.Fragment>
                  )
                })
              ) : (
                // Ungrouped view
                sortedLinkages.map((linkage) => <LinkageRow key={linkage.id} linkage={linkage} />)
              )}
            </tbody>
          </table>
        </div>
      </div>
    </>
  )
}
622
+
app/linkages/page.tsx CHANGED
@@ -1,69 +1,45 @@
1
  import { prisma } from '@/lib/prisma'
2
- import Link from 'next/link'
3
- import { formatDate } from '@/lib/utils'
4
  import AutoCreate from '@/app/components/linkages/AutoCreate'
5
- import Tooltip from '@/app/components/Tooltip'
6
 
7
  async function getLinkages() {
8
- try {
9
- // Fetch linkages with relations
10
- const linkages = await prisma.dealModelLinkage.findMany({
11
- include: {
12
- deal: {
13
- select: {
14
- id: true,
15
- provider: true,
16
- buyer: true,
17
- modality: true,
18
- priceUsd: true,
19
- date: true,
20
- },
21
  },
22
- model: {
23
- select: {
24
- id: true,
25
- modelId: true,
26
- provider: true,
27
- family: true,
28
- tokensEstMid: true,
29
- },
30
  },
31
  },
32
- orderBy: [
33
- { linkageStrength: 'desc' },
34
- { analysisTimestamp: 'desc' },
35
- ],
36
- })
37
- return linkages
38
- } catch (error: any) {
39
- console.error('Error fetching linkages:', error)
40
- // Return empty array on error to prevent page crash
41
- return []
42
- }
43
  }
44
 
45
- function formatTokens(value: number | null | undefined): string {
46
- if (!value) return 'β€”'
47
- if (value >= 1e15) return `${(value / 1e15).toFixed(1)}P`
48
- if (value >= 1e12) return `${(value / 1e12).toFixed(1)}T`
49
- if (value >= 1e9) return `${(value / 1e9).toFixed(1)}B`
50
- return `${(value / 1e6).toFixed(0)}M`
51
- }
52
 
53
  async function getDealCount() {
54
- try {
55
- return await prisma.deal.count()
56
- } catch {
57
- return 0
58
- }
59
  }
60
 
61
  async function getModelCount() {
62
- try {
63
- return await prisma.modelRegistry.count()
64
- } catch {
65
- return 0
66
- }
67
  }
68
 
69
  export default async function LinkagesPage() {
@@ -82,196 +58,68 @@ export default async function LinkagesPage() {
82
  />
83
 
84
  {/* Header */}
85
- <div className="mb-6">
86
- <div className="mb-6">
87
- <h1 className="text-4xl font-semibold mb-2">Deal-Model Linkages</h1>
88
- <p className="text-text-muted text-lg mb-4">
89
- Connections between training data deals and AI models
90
- </p>
91
-
92
- {/* Simple Explanation */}
93
- <div className="card bg-[rgba(139,111,71,0.05)] border border-accent/20">
94
- <div className="p-4">
95
- <h3 className="font-semibold text-text mb-2">What are linkages?</h3>
96
- <p className="text-sm text-text-muted leading-relaxed">
97
- Linkages connect training data deals to AI models. For example: if OpenAI signed a deal with News Corp in 2023,
98
- and GPT-4 was released in 2023, there's a linkage suggesting the News Corp data may have been used to train GPT-4.
99
- </p>
100
- <p className="text-sm text-text-muted leading-relaxed mt-2">
101
- The system automatically creates linkages when: (1) the deal buyer matches the model provider (e.g., OpenAI deal → OpenAI model),
102
- and (2) optionally, when the deal date and model release date are close in time (within 1 year).
103
- </p>
104
- </div>
105
- </div>
106
- </div>
107
  </div>
108
 
109
-
110
- {/* Linkages Table */}
111
- <div className="card overflow-hidden p-0">
112
- <div className="overflow-x-auto">
113
- <table className="table text-sm">
114
- <thead>
115
- <tr className="border-b border-border-subtle">
116
- <th className="px-4 py-3 text-left font-semibold">
117
- <Tooltip content="The training data deal, showing the data provider (who owns the data) and the buyer (the AI company licensing it).">
118
- <div className="underline decoration-dotted cursor-help">Deal</div>
119
- </Tooltip>
120
- <div className="text-xs font-normal text-text-muted mt-0.5">Data provider → Buyer</div>
121
- </th>
122
- <th className="px-4 py-3 text-left font-semibold">
123
- <Tooltip content="The AI model that may have been trained using data from this deal. Linkages are inferred based on company matches and timing.">
124
- <div className="underline decoration-dotted cursor-help">Model</div>
125
- </Tooltip>
126
- <div className="text-xs font-normal text-text-muted mt-0.5">AI model that may have used this data</div>
127
- </th>
128
- <th className="px-4 py-3 text-left font-semibold">
129
- <Tooltip content="The type of connection: 'Same Time Period' (deal and model within 1 year), 'Same Company' (buyer matches model provider), or 'Explicit' (directly stated).">
130
- <div className="underline decoration-dotted cursor-help">Connection Type</div>
131
- </Tooltip>
132
- <div className="text-xs font-normal text-text-muted mt-0.5">How the link was determined</div>
133
- </th>
134
- <th className="px-4 py-3 text-left font-semibold">
135
- <Tooltip content="The confidence level in the linkage: High (strong evidence like temporal overlap), Medium (moderate evidence), or Low (weak evidence).">
136
- <div className="underline decoration-dotted cursor-help">Confidence</div>
137
- </Tooltip>
138
- <div className="text-xs font-normal text-text-muted mt-0.5">How certain we are</div>
139
- </th>
140
- <th className="px-4 py-3 text-left font-semibold">
141
- <Tooltip content="An interpretation of what this linkage means - how the deal's data may have impacted the model's training.">
142
- <div className="underline decoration-dotted cursor-help">What This Means</div>
143
- </Tooltip>
144
- <div className="text-xs font-normal text-text-muted mt-0.5">Interpretation of the connection</div>
145
- </th>
146
- </tr>
147
- </thead>
148
- <tbody>
149
- {linkages.length === 0 ? (
150
- <tr>
151
- <td colSpan={5} className="text-center py-12 text-text-muted">
152
- {dealCount === 0 || modelCount === 0
153
- ? `No ${dealCount === 0 ? 'deals' : 'models'} found. Please seed the database first.`
154
- : 'No linkages found. Linkage creation will start automatically.'}
155
- </td>
156
- </tr>
157
- ) : (
158
- linkages
159
- .filter(linkage => linkage && linkage.deal && linkage.model)
160
- .map((linkage) => (
161
- <tr
162
- key={linkage.id}
163
- className="transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]"
164
- >
165
- <td className="px-4 py-3">
166
- <Link
167
- href={`/deals/${linkage.deal.id}`}
168
- className="font-medium text-accent hover:text-accent-hover"
169
- >
170
- {linkage.deal.provider} → {linkage.deal.buyer}
171
- </Link>
172
- <div className="text-xs text-text-muted mt-0.5">
173
- {linkage.deal.modality} • {linkage.deal.date ? formatDate(linkage.deal.date) : '—'}
174
- </div>
175
- </td>
176
- <td className="px-4 py-3">
177
- <Link
178
- href={`/models/${linkage.model.id}`}
179
- className="font-medium text-accent hover:text-accent-hover"
180
- >
181
- {linkage.model.modelId}
182
- </Link>
183
- {linkage.model.family && (
184
- <div className="text-xs text-text-muted mt-0.5">
185
- {linkage.model.family} • {linkage.model.provider}
186
- </div>
187
- )}
188
- {linkage.model.tokensEstMid && (
189
- <div className="text-xs text-text-muted mt-0.5">
190
- {formatTokens(linkage.model.tokensEstMid)} tokens
191
- </div>
192
- )}
193
- </td>
194
- <td className="px-4 py-3">
195
- <div className="flex flex-col gap-1">
196
- <span className="badge badge-secondary text-xs">
197
- {linkage.linkageType === 'temporal_overlap' ? 'Same Time Period' :
198
- linkage.linkageType === 'inferred' ? 'Same Company' :
199
- linkage.linkageType || '—'}
200
- </span>
201
- <div className="text-xs text-text-muted/70">
202
- {linkage.linkageType === 'temporal_overlap'
203
- ? 'Deal & model within 1 year'
204
- : linkage.linkageType === 'inferred'
205
- ? 'Buyer matches model provider'
206
- : ''}
207
- </div>
208
- </div>
209
- </td>
210
- <td className="px-4 py-3">
211
- <span className={`badge ${
212
- linkage.linkageStrength === 'high'
213
- ? 'badge-primary'
214
- : 'badge-secondary'
215
- } text-xs`}>
216
- {linkage.linkageStrength === 'high' ? 'High' :
217
- linkage.linkageStrength === 'medium' ? 'Medium' :
218
- linkage.linkageStrength === 'low' ? 'Low' :
219
- '—'}
220
- </span>
221
- </td>
222
- <td className="px-4 py-3">
223
- <div className="text-sm text-text leading-relaxed">
224
- {linkage.impactInference || '—'}
225
- </div>
226
- </td>
227
- </tr>
228
- ))
229
- )}
230
- </tbody>
231
- </table>
232
- </div>
233
- </div>
234
-
235
- {/* Detailed Explanation */}
236
  {linkages.length > 0 && (
237
- <div className="card mt-8">
238
- <h2 className="text-xl font-semibold mb-4">How Linkages Work</h2>
239
- <div className="space-y-4 text-sm text-text-muted">
240
- <div>
241
- <h3 className="font-semibold text-text mb-2">Example:</h3>
242
- <p className="leading-relaxed">
243
- If you see a linkage: <strong className="text-text">News Corp → OpenAI</strong> connected to <strong className="text-text">GPT-4</strong>,
244
- it means OpenAI signed a deal with News Corp, and because GPT-4 is an OpenAI model, there's a potential connection.
245
- If the deal happened in 2023 and GPT-4 was released in 2023, that's a stronger connection (temporal overlap).
246
- </p>
247
- </div>
248
-
249
- <div className="pt-3 border-t border-border-subtle">
250
- <h3 className="font-semibold text-text mb-2">Connection Types:</h3>
251
- <ul className="space-y-2">
252
- <li>
253
- <strong className="text-text">Same Time Period:</strong> Deal and model release are within 1 year.
254
- Suggests the deal's data may have been used in training.
255
- </li>
256
- <li>
257
- <strong className="text-text">Same Company:</strong> Deal buyer matches model provider, but different time periods.
258
- Shows organizational relationship but less direct connection.
259
- </li>
260
- </ul>
261
- </div>
262
-
263
- <div className="pt-3 border-t border-border-subtle">
264
- <h3 className="font-semibold text-text mb-2">Confidence Levels:</h3>
265
- <p className="leading-relaxed">
266
- Currently all linkages are marked as <strong className="text-text">High</strong> confidence because they require
267
- a clear match between the deal buyer and model provider. The system automatically creates these connections
268
- when it finds matching company names (e.g., "OpenAI" in both the deal and model).
269
- </p>
 
 
 
 
 
 
270
  </div>
 
 
 
 
 
 
 
 
 
 
 
271
  </div>
272
  </div>
 
 
273
  )}
274
- </div>
275
- </main>
276
- )
277
- }
 
1
  import { prisma } from '@/lib/prisma'
 
 
2
  import AutoCreate from '@/app/components/linkages/AutoCreate'
3
+ import LinkagesClient from './LinkagesClient'
4
 
5
  async function getLinkages() {
6
+ const linkages = await prisma.dealModelLinkage.findMany({
7
+ include: {
8
+ deal: {
9
+ select: {
10
+ id: true,
11
+ provider: true,
12
+ buyer: true,
13
+ modality: true,
14
+ priceUsd: true,
15
+ date: true,
 
 
 
16
  },
17
+ },
18
+ model: {
19
+ select: {
20
+ id: true,
21
+ modelId: true,
22
+ provider: true,
23
+ family: true,
24
+ tokensEstMid: true,
25
  },
26
  },
27
+ },
28
+ orderBy: [
29
+ { linkageStrength: 'desc' },
30
+ { analysisTimestamp: 'desc' },
31
+ ],
32
+ })
33
+ return linkages
 
 
 
 
34
  }
35
 
 
 
 
 
 
 
 
36
 
37
  async function getDealCount() {
38
+ return await prisma.deal.count()
 
 
 
 
39
  }
40
 
41
  async function getModelCount() {
42
+ return await prisma.modelRegistry.count()
 
 
 
 
43
  }
44
 
45
  export default async function LinkagesPage() {
 
58
  />
59
 
60
  {/* Header */}
61
+ <div className="mb-4">
62
+ <h1 className="text-3xl font-semibold mb-1">Deal-Model Linkages</h1>
63
+ <p className="text-text-muted text-sm">
64
+ Connections between training data deals and AI models
65
+ </p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  </div>
67
 
68
+ {/* Detailed Explanation - Above table, collapsible */}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  {linkages.length > 0 && (
70
+ <div className="card mb-6">
71
+ <details className="group">
72
+ <summary className="cursor-pointer list-none flex items-center justify-between">
73
+ <h2 className="text-lg font-semibold">How Linkages Work</h2>
74
+ <span className="text-text-muted text-sm group-open:hidden">Click to expand</span>
75
+ <span className="text-text-muted text-sm hidden group-open:inline">Click to collapse</span>
76
+ </summary>
77
+ <div className="mt-3 pt-3 border-t border-border-subtle space-y-3 text-sm text-text-muted">
78
+ <div>
79
+ <h3 className="font-semibold text-text mb-2">Example:</h3>
80
+ <p className="leading-relaxed">
81
+ If you see a linkage: <strong className="text-text">News Corp → OpenAI</strong> connected to <strong className="text-text">GPT-4</strong>,
82
+ it means OpenAI signed a deal with News Corp, and because GPT-4 is an OpenAI model, there's a potential connection.
83
+ If the deal happened in 2023 and GPT-4 was released in 2023, that's a stronger connection (temporal overlap).
84
+ </p>
85
+ </div>
86
+
87
+ <div className="pt-2 border-t border-border-subtle">
88
+ <h3 className="font-semibold text-text mb-2">Connection Types:</h3>
89
+ <ul className="space-y-2">
90
+ <li>
91
+ <strong className="text-text">Same Time Period:</strong> Deal and model release are within 1 year.
92
+ Suggests the deal's data may have been used in training.
93
+ </li>
94
+ <li>
95
+ <strong className="text-text">Same Company:</strong> Deal buyer matches model provider, but different time periods.
96
+ Shows organizational relationship but less direct connection.
97
+ </li>
98
+ </ul>
99
+ </div>
100
+
101
+ <div className="pt-2 border-t border-border-subtle">
102
+ <h3 className="font-semibold text-text mb-2">Confidence Levels:</h3>
103
+ <p className="leading-relaxed">
104
+ Currently all linkages are marked as <strong className="text-text">High</strong> confidence because they require
105
+ a clear match between the deal buyer and model provider. The system automatically creates these connections
106
+ when it finds matching company names (e.g., "OpenAI" in both the deal and model).
107
+ </p>
108
+ </div>
109
  </div>
110
+ </details>
111
+ </div>
112
+ )}
113
+
114
+ {/* Linkages Table with Filtering, Sorting, and Grouping */}
115
+ {linkages.length === 0 ? (
116
+ <div className="card">
117
+ <div className="text-center py-12 text-text-muted">
118
+ {dealCount === 0 || modelCount === 0
119
+ ? `No ${dealCount === 0 ? 'deals' : 'models'} found. Please seed the database first.`
120
+ : 'No linkages found. Linkage creation will start automatically.'}
121
  </div>
122
  </div>
123
+ ) : (
124
+ <LinkagesClient initialLinkages={linkages.filter(l => l && l.deal && l.model)} />
125
  )}
 
 
 
 
app/models/[id]/page.tsx CHANGED
@@ -91,15 +91,15 @@ export default async function ModelDetailPage({
91
 
92
  <div className="max-w-4xl">
93
  {/* Header Card */}
94
- <div className="card mb-8">
95
- <div className="mb-4">
96
- <h1 className="text-3xl font-semibold mb-2">{model.modelId}</h1>
97
  {model.family && (
98
- <p className="text-text-muted text-lg">{model.family}</p>
99
  )}
100
  </div>
101
 
102
- <div className="grid grid-cols-2 md:grid-cols-4 gap-4 pt-6 border-t border-border">
103
  <div>
104
  <div className="text-sm text-text-muted mb-1">Provider</div>
105
  <div className="font-medium">{model.provider}</div>
@@ -137,9 +137,9 @@ export default async function ModelDetailPage({
137
 
138
  {/* Token Estimates */}
139
  {(model.tokensEstMin || model.tokensEstMax || model.tokensEstMid) && (
140
- <div className="mb-8">
141
- <div className="card mb-6">
142
- <h2 className="text-xl font-semibold mb-4">Training Token Estimates</h2>
143
  <div className="grid grid-cols-1 md:grid-cols-3 gap-4">
144
  <div>
145
  <div className="text-sm text-text-muted mb-1">Minimum</div>
@@ -180,8 +180,8 @@ export default async function ModelDetailPage({
180
 
181
  {/* Evidence Profile */}
182
  {(evidenceTypes.length > 0 || model.evidenceStrength || uncertaintySources.length > 0) && (
183
- <div className="card mb-8">
184
- <h2 className="text-xl font-semibold mb-4">Evidence Profile</h2>
185
  <div className="space-y-4">
186
  {model.evidenceStrength && (
187
  <div>
@@ -221,8 +221,8 @@ export default async function ModelDetailPage({
221
 
222
  {/* Training Compute */}
223
  {(model.flopsReported || model.flopsEstimated) && (
224
- <div className="card mb-8">
225
- <h2 className="text-xl font-semibold mb-4">Training Compute</h2>
226
  <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
227
  {model.flopsReported && (
228
  <div>
@@ -242,13 +242,13 @@ export default async function ModelDetailPage({
242
 
243
  {/* Linked Deals */}
244
  {model.modelLinkages && model.modelLinkages.length > 0 && (
245
- <div className="card mb-8">
246
- <h2 className="text-xl font-semibold mb-4">Linked Training Data Deals</h2>
247
  <div className="space-y-3">
248
  {model.modelLinkages.map((linkage) => (
249
  <div
250
  key={linkage.id}
251
- className="p-4 border border-border-subtle rounded-sm hover:bg-border-subtle/30 transition-colors"
252
  >
253
  <div className="flex items-start justify-between mb-2">
254
  <div>
@@ -285,7 +285,7 @@ export default async function ModelDetailPage({
285
  {/* Sources */}
286
  {sources.length > 0 && (
287
  <div className="card">
288
- <h2 className="text-xl font-semibold mb-4">Sources</h2>
289
  <div className="space-y-2">
290
  {sources.map((source, idx) => (
291
  <div key={idx}>
 
91
 
92
  <div className="max-w-4xl">
93
  {/* Header Card */}
94
+ <div className="card mb-6">
95
+ <div className="mb-3">
96
+ <h1 className="text-2xl font-semibold mb-1">{model.modelId}</h1>
97
  {model.family && (
98
+ <p className="text-text-muted text-sm">{model.family}</p>
99
  )}
100
  </div>
101
 
102
+ <div className="grid grid-cols-2 md:grid-cols-4 gap-4 pt-4 border-t border-border">
103
  <div>
104
  <div className="text-sm text-text-muted mb-1">Provider</div>
105
  <div className="font-medium">{model.provider}</div>
 
137
 
138
  {/* Token Estimates */}
139
  {(model.tokensEstMin || model.tokensEstMax || model.tokensEstMid) && (
140
+ <div className="mb-6">
141
+ <div className="card mb-4">
142
+ <h2 className="text-lg font-semibold mb-3">Training Token Estimates</h2>
143
  <div className="grid grid-cols-1 md:grid-cols-3 gap-4">
144
  <div>
145
  <div className="text-sm text-text-muted mb-1">Minimum</div>
 
180
 
181
  {/* Evidence Profile */}
182
  {(evidenceTypes.length > 0 || model.evidenceStrength || uncertaintySources.length > 0) && (
183
+ <div className="card mb-6">
184
+ <h2 className="text-lg font-semibold mb-3">Evidence Profile</h2>
185
  <div className="space-y-4">
186
  {model.evidenceStrength && (
187
  <div>
 
221
 
222
  {/* Training Compute */}
223
  {(model.flopsReported || model.flopsEstimated) && (
224
+ <div className="card mb-6">
225
+ <h2 className="text-lg font-semibold mb-3">Training Compute</h2>
226
  <div className="grid grid-cols-1 md:grid-cols-2 gap-4">
227
  {model.flopsReported && (
228
  <div>
 
242
 
243
  {/* Linked Deals */}
244
  {model.modelLinkages && model.modelLinkages.length > 0 && (
245
+ <div className="card mb-6">
246
+ <h2 className="text-lg font-semibold mb-3">Linked Training Data Deals</h2>
247
  <div className="space-y-3">
248
  {model.modelLinkages.map((linkage) => (
249
  <div
250
  key={linkage.id}
251
+ className="p-4 border border-border-subtle rounded-none hover:bg-border-subtle/30 transition-colors"
252
  >
253
  <div className="flex items-start justify-between mb-2">
254
  <div>
 
285
  {/* Sources */}
286
  {sources.length > 0 && (
287
  <div className="card">
288
+ <h2 className="text-lg font-semibold mb-3">Sources</h2>
289
  <div className="space-y-2">
290
  {sources.map((source, idx) => (
291
  <div key={idx}>
app/models/page.tsx CHANGED
@@ -59,14 +59,12 @@ export default async function ModelsPage() {
59
  />
60
 
61
  {/* Header */}
62
- <div className="mb-6">
63
- <div className="mb-4">
64
- <h1 className="text-4xl font-semibold mb-2">Model Registry</h1>
65
- <p className="text-text-muted text-lg">
66
- Training data scale estimates for major AI models
67
- </p>
68
- </div>
69
- </div>
70
 
71
  {/* Stats */}
72
  <div className="grid grid-cols-2 md:grid-cols-3 gap-3 mb-8">
@@ -94,9 +92,9 @@ export default async function ModelsPage() {
94
  <table className="table text-sm">
95
  <thead>
96
  <tr className="border-b border-border-subtle">
97
- <th className="px-4 py-3 text-left font-semibold">Model</th>
98
- <th className="px-4 py-3 text-left font-semibold">Provider</th>
99
- <th className="px-4 py-3 text-left font-semibold">
100
  <Tooltip content="The number of trainable parameters in the model, typically measured in billions (B) or trillions (T). More parameters generally mean more capacity to learn complex patterns.">
101
  <span className="underline decoration-dotted cursor-help">Params</span>
102
  </Tooltip>
@@ -132,7 +130,7 @@ export default async function ModelsPage() {
132
  key={model.id}
133
  className="cursor-pointer transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]"
134
  >
135
- <td className="px-4 py-3">
136
  <Link
137
  href={`/models/${model.id}`}
138
  className="font-medium text-accent hover:text-accent-hover"
@@ -143,10 +141,10 @@ export default async function ModelsPage() {
143
  <div className="text-xs text-text-muted mt-0.5">{model.family}</div>
144
  )}
145
  </td>
146
- <td className="px-4 py-3">
147
  <div className="text-sm">{model.provider}</div>
148
  </td>
149
- <td className="px-4 py-3">
150
  <div className="text-sm">{formatParams(model.params)}</div>
151
  {model.isMoe && (
152
  <Tooltip content="Mixture of Experts (MoE): A model architecture that uses multiple specialized sub-networks (experts) but only activates a subset for each input. This allows for larger models with lower computational costs.">
@@ -174,7 +172,7 @@ export default async function ModelsPage() {
174
  <span className="text-text-muted/60">β€”</span>
175
  )}
176
  </td>
177
- <td className="px-4 py-3">
178
  <div className="flex items-center gap-2">
179
  {model.architectureType && (
180
  <span className="badge badge-secondary text-xs">
@@ -188,14 +186,14 @@ export default async function ModelsPage() {
188
  )}
189
  </div>
190
  </td>
191
- <td className="px-4 py-3">
192
  {model.evidenceStrength && (
193
  <span className="badge badge-secondary text-xs">
194
  {model.evidenceStrength.replace('S-', '')}
195
  </span>
196
  )}
197
  </td>
198
- <td className="px-4 py-3">
199
  <div className="text-sm text-text-muted/80">
200
  {model.releaseDate ? formatDate(model.releaseDate instanceof Date ? model.releaseDate.toISOString() : String(model.releaseDate)) : 'β€”'}
201
  </div>
 
59
  />
60
 
61
  {/* Header */}
62
+ <div className="mb-4">
63
+ <h1 className="text-3xl font-semibold mb-1">Model Registry</h1>
64
+ <p className="text-text-muted text-sm">
65
+ Training data scale estimates for major AI models
66
+ </p>
67
+ </div>
 
 
68
 
69
  {/* Stats */}
70
  <div className="grid grid-cols-2 md:grid-cols-3 gap-3 mb-8">
 
92
  <table className="table text-sm">
93
  <thead>
94
  <tr className="border-b border-border-subtle">
95
+ <th className="text-left font-semibold">Model</th>
96
+ <th className="text-left font-semibold">Provider</th>
97
+ <th className="text-left font-semibold">
98
  <Tooltip content="The number of trainable parameters in the model, typically measured in billions (B) or trillions (T). More parameters generally mean more capacity to learn complex patterns.">
99
  <span className="underline decoration-dotted cursor-help">Params</span>
100
  </Tooltip>
 
130
  key={model.id}
131
  className="cursor-pointer transition-colors border-b border-border-subtle last:border-0 hover:bg-[rgba(232,225,217,0.3)]"
132
  >
133
+ <td>
134
  <Link
135
  href={`/models/${model.id}`}
136
  className="font-medium text-accent hover:text-accent-hover"
 
141
  <div className="text-xs text-text-muted mt-0.5">{model.family}</div>
142
  )}
143
  </td>
144
+ <td>
145
  <div className="text-sm">{model.provider}</div>
146
  </td>
147
+ <td>
148
  <div className="text-sm">{formatParams(model.params)}</div>
149
  {model.isMoe && (
150
  <Tooltip content="Mixture of Experts (MoE): A model architecture that uses multiple specialized sub-networks (experts) but only activates a subset for each input. This allows for larger models with lower computational costs.">
 
172
  <span className="text-text-muted/60">β€”</span>
173
  )}
174
  </td>
175
+ <td>
176
  <div className="flex items-center gap-2">
177
  {model.architectureType && (
178
  <span className="badge badge-secondary text-xs">
 
186
  )}
187
  </div>
188
  </td>
189
+ <td>
190
  {model.evidenceStrength && (
191
  <span className="badge badge-secondary text-xs">
192
  {model.evidenceStrength.replace('S-', '')}
193
  </span>
194
  )}
195
  </td>
196
+ <td>
197
  <div className="text-sm text-text-muted/80">
198
  {model.releaseDate ? formatDate(model.releaseDate instanceof Date ? model.releaseDate.toISOString() : String(model.releaseDate)) : 'β€”'}
199
  </div>
app/normalization/page.tsx CHANGED
@@ -141,16 +141,16 @@ export default async function NormalizationPage() {
141
  return (
142
  <main className="min-h-screen bg-background">
143
  <div className="container-content section-padding">
144
- <div className="mb-8">
145
- <h1 className="text-4xl font-semibold mb-4">Pricing Normalization Tool</h1>
146
- <p className="text-text-muted text-lg">
147
  Compare deals on an apples-to-apples basis by normalizing to per-unit pricing
148
  </p>
149
  </div>
150
 
151
- <div className="card mb-8">
152
- <h2 className="text-2xl font-semibold mb-4">How It Works</h2>
153
- <p className="text-text-muted leading-relaxed mb-4">
154
  Different deals use different pricing models (per-book, per-track, aggregate licensing, etc.).
155
  This tool normalizes prices to common units (tokens, records, images, minutes) to enable
156
  direct comparison.
 
141
  return (
142
  <main className="min-h-screen bg-background">
143
  <div className="container-content section-padding">
144
+ <div className="mb-4">
145
+ <h1 className="text-3xl font-semibold mb-1">Pricing Normalization Tool</h1>
146
+ <p className="text-text-muted text-sm">
147
  Compare deals on an apples-to-apples basis by normalizing to per-unit pricing
148
  </p>
149
  </div>
150
 
151
+ <div className="card mb-6">
152
+ <h2 className="text-xl font-semibold mb-3">How It Works</h2>
153
+ <p className="text-text-muted text-sm leading-relaxed mb-3">
154
  Different deals use different pricing models (per-book, per-track, aggregate licensing, etc.).
155
  This tool normalizes prices to common units (tokens, records, images, minutes) to enable
156
  direct comparison.
app/page.tsx CHANGED
@@ -176,11 +176,11 @@ export default async function Home() {
176
  {/* Auto-enrich notification */}
177
  <AutoEnrich dealCount={deals.length} dealsWithAllFields={dealsWithAllFields} />
178
  {/* Header */}
179
- <div className="mb-6">
180
- <div className="flex items-center justify-between mb-4">
181
  <div>
182
- <h1 className="text-4xl font-semibold mb-2">Deals Explorer</h1>
183
- <p className="text-text-muted text-lg">
184
  Global licensing, acquisition, and commissioning deals (2020–2025)
185
  </p>
186
  </div>
@@ -224,7 +224,7 @@ export default async function Home() {
224
  </div>
225
 
226
  {/* Scrollable Analytics Cards */}
227
- <div className="overflow-x-auto pb-2 -mx-6 px-6">
228
  <div className="flex gap-4 min-w-max">
229
  {/* Modality Breakdown */}
230
  <div className="card min-w-[280px] flex-shrink-0">
 
176
  {/* Auto-enrich notification */}
177
  <AutoEnrich dealCount={deals.length} dealsWithAllFields={dealsWithAllFields} />
178
  {/* Header */}
179
+ <div className="mb-4">
180
+ <div className="flex items-center justify-between">
181
  <div>
182
+ <h1 className="text-3xl font-semibold mb-1">Deals Explorer</h1>
183
+ <p className="text-text-muted text-sm">
184
  Global licensing, acquisition, and commissioning deals (2020–2025)
185
  </p>
186
  </div>
 
224
  </div>
225
 
226
  {/* Scrollable Analytics Cards */}
227
+ <div className="overflow-x-auto pb-2 -mx-2 px-2">
228
  <div className="flex gap-4 min-w-max">
229
  {/* Modality Breakdown */}
230
  <div className="card min-w-[280px] flex-shrink-0">
app/timeline/page.tsx CHANGED
@@ -208,8 +208,8 @@ export default async function TimelinePage() {
208
  <div className="min-h-screen bg-background">
209
  <div className="container-content section-padding">
210
  {/* Header */}
211
- <div className="mb-9">
212
- <h1 className="text-4xl font-semibold mb-2 tracking-tight">Major AI Training Data Deals (2020–2025)</h1>
213
  <p className="text-text-muted text-sm mb-1">Tracking the emergence of data markets</p>
214
  <p className="text-text-muted text-xs font-medium">Source: Open Data Labs (opendatalabs.xyz)</p>
215
  </div>
 
208
  <div className="min-h-screen bg-background">
209
  <div className="container-content section-padding">
210
  {/* Header */}
211
+ <div className="mb-4">
212
+ <h1 className="text-3xl font-semibold mb-1 tracking-tight">Major AI Training Data Deals (2020–2025)</h1>
213
  <p className="text-text-muted text-sm mb-1">Tracking the emergence of data markets</p>
214
  <p className="text-text-muted text-xs font-medium">Source: Open Data Labs (opendatalabs.xyz)</p>
215
  </div>
registry/enrich_all_models.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch enrichment script for all existing models
3
+ Enriches all models in the database with comprehensive metadata
4
+ """
5
+
6
+ import asyncio
7
+ import os
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import List, Optional
11
+ from datetime import datetime
12
+ import json
13
+
14
+ # Add project root to path
15
+ project_root = Path(__file__).parent.parent
16
+ sys.path.insert(0, str(project_root))
17
+
18
+ from registry.enrichment.comprehensive_enrichment import ComprehensiveModelEnricher
19
+ from dotenv import load_dotenv
20
+
21
+ # Prisma imports
22
+ try:
23
+ from prisma import Prisma
24
+ PRISMA_AVAILABLE = True
25
+ except ImportError:
26
+ PRISMA_AVAILABLE = False
27
+ print("Warning: Prisma not available. Install with: npm run db:generate")
28
+
29
+ load_dotenv()
30
+
31
+
32
+ async def enrich_all_models(
33
+ limit: Optional[int] = None,
34
+ use_web_search: bool = True,
35
+ use_llm_extraction: bool = True
36
+ ):
37
+ """
38
+ Enrich all models in the database
39
+
40
+ Args:
41
+ limit: Limit number of models to enrich (for testing)
42
+ use_web_search: Enable web search enrichment
43
+ use_llm_extraction: Enable LLM extraction
44
+ """
45
+ if not PRISMA_AVAILABLE:
46
+ raise RuntimeError("Prisma not available. Run: npm run db:generate")
47
+
48
+ # Connect to database
49
+ prisma = Prisma()
50
+ await prisma.connect()
51
+
52
+ try:
53
+ # Get all models
54
+ models = await prisma.modelregistry.find_many(
55
+ take=limit
56
+ )
57
+
58
+ print(f"🚀 Starting enrichment of {len(models)} models")
59
+ print(f" Started at: {datetime.now().isoformat()}\n")
60
+
61
+ # Initialize enricher
62
+ enricher = ComprehensiveModelEnricher(
63
+ use_web_search=use_web_search,
64
+ use_llm_extraction=use_llm_extraction
65
+ )
66
+
67
+ success_count = 0
68
+ error_count = 0
69
+
70
+ for i, model in enumerate(models, 1):
71
+ print(f"\n[{i}/{len(models)}] Enriching: {model.modelId} ({model.provider})")
72
+
73
+ try:
74
+ # Get existing data
75
+ existing_data = {
76
+ "params": model.params,
77
+ "releaseDate": model.releaseDate.isoformat() if model.releaseDate else None,
78
+ "architectureType": model.architectureType,
79
+ "isMoe": model.isMoe,
80
+ "multimodal": model.multimodal,
81
+ "tokensEstMid": model.tokensEstMid,
82
+ }
83
+
84
+ # Enrich model
85
+ enriched = await enricher.enrich_model(
86
+ model_id=model.modelId,
87
+ provider=model.provider,
88
+ family=model.family,
89
+ existing_data=existing_data
90
+ )
91
+
92
+ # Prepare update data
93
+ update_data = {}
94
+
95
+ # Release date
96
+ if enriched.get("releaseDate"):
97
+ if isinstance(enriched["releaseDate"], str):
98
+ update_data["releaseDate"] = datetime.fromisoformat(enriched["releaseDate"])
99
+ else:
100
+ update_data["releaseDate"] = enriched["releaseDate"]
101
+
102
+ # Architecture
103
+ if enriched.get("architectureType"):
104
+ update_data["architectureType"] = enriched["architectureType"]
105
+ if enriched.get("isMoe") is not None:
106
+ update_data["isMoe"] = enriched["isMoe"]
107
+ if enriched.get("numExperts"):
108
+ update_data["numExperts"] = enriched["numExperts"]
109
+ if enriched.get("multimodal") is not None:
110
+ update_data["multimodal"] = enriched["multimodal"]
111
+
112
+ # Parameters (only if not already set)
113
+ if not model.params and enriched.get("params"):
114
+ update_data["params"] = enriched["params"]
115
+
116
+ # Token estimates
117
+ if enriched.get("tokensEstMin"):
118
+ update_data["tokensEstMin"] = enriched["tokensEstMin"]
119
+ if enriched.get("tokensEstMax"):
120
+ update_data["tokensEstMax"] = enriched["tokensEstMax"]
121
+ if enriched.get("tokensEstMid"):
122
+ update_data["tokensEstMid"] = enriched["tokensEstMid"]
123
+ if enriched.get("tokensRangeGeneratedAt"):
124
+ update_data["tokensRangeGeneratedAt"] = enriched["tokensRangeGeneratedAt"]
125
+
126
+ # Evidence profile
127
+ if enriched.get("evidenceTypes"):
128
+ update_data["evidenceTypes"] = enriched["evidenceTypes"]
129
+ if enriched.get("evidenceStrength"):
130
+ update_data["evidenceStrength"] = enriched["evidenceStrength"]
131
+ if enriched.get("uncertaintySources"):
132
+ update_data["uncertaintySources"] = enriched["uncertaintySources"]
133
+ if enriched.get("evidenceProfileGeneratedAt"):
134
+ update_data["evidenceProfileGeneratedAt"] = enriched["evidenceProfileGeneratedAt"]
135
+
136
+ # Sources
137
+ if enriched.get("sources"):
138
+ update_data["sources"] = enriched["sources"]
139
+
140
+ # Raw evidence snippets
141
+ if enriched.get("rawEvidenceSnippets"):
142
+ update_data["rawEvidenceSnippets"] = enriched["rawEvidenceSnippets"]
143
+
144
+ # Composition estimates
145
+ if enriched.get("compositionEstimates"):
146
+ update_data["compositionEstimates"] = enriched["compositionEstimates"]
147
+
148
+ # Training period
149
+ if enriched.get("trainingPeriodStart"):
150
+ if isinstance(enriched["trainingPeriodStart"], str):
151
+ update_data["trainingPeriodStart"] = datetime.fromisoformat(enriched["trainingPeriodStart"])
152
+ else:
153
+ update_data["trainingPeriodStart"] = enriched["trainingPeriodStart"]
154
+ if enriched.get("trainingPeriodEnd"):
155
+ if isinstance(enriched["trainingPeriodEnd"], str):
156
+ update_data["trainingPeriodEnd"] = datetime.fromisoformat(enriched["trainingPeriodEnd"])
157
+ else:
158
+ update_data["trainingPeriodEnd"] = enriched["trainingPeriodEnd"]
159
+
160
+ # Update model
161
+ if update_data:
162
+ update_data["updatedAt"] = datetime.now()
163
+ await prisma.modelregistry.update(
164
+ where={"id": model.id},
165
+ data=update_data
166
+ )
167
+ print(f" ✅ Updated {len(update_data)} fields")
168
+ success_count += 1
169
+ else:
170
+ print(f" ⚠️ No new data to update")
171
+ success_count += 1
172
+
173
+ except Exception as e:
174
+ print(f" ❌ Error: {e}")
175
+ error_count += 1
176
+ continue
177
+
178
+ print(f"\n✅ Enrichment complete!")
179
+ print(f" Successfully enriched: {success_count}/{len(models)}")
180
+ print(f" Errors: {error_count}")
181
+ print(f" Completed at: {datetime.now().isoformat()}")
182
+
183
+ finally:
184
+ await prisma.disconnect()
185
+
186
+
187
+ async def main():
188
+ """Main entry point"""
189
+ import argparse
190
+
191
+ parser = argparse.ArgumentParser(description="Enrich all models in database")
192
+ parser.add_argument("--limit", type=int, help="Limit number of models to enrich")
193
+ parser.add_argument("--no-web", action="store_true", help="Disable web search enrichment")
194
+ parser.add_argument("--no-llm", action="store_true", help="Disable LLM extraction")
195
+
196
+ args = parser.parse_args()
197
+
198
+ await enrich_all_models(
199
+ limit=args.limit,
200
+ use_web_search=not args.no_web,
201
+ use_llm_extraction=not args.no_llm
202
+ )
203
+
204
+
205
+ if __name__ == "__main__":
206
+ asyncio.run(main())
207
+
registry/enrichment/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ Model enrichment module for web-based data collection
3
+ """
4
+
registry/enrichment/comprehensive_enrichment.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Comprehensive Model Enrichment Pipeline
3
+ Orchestrates all enrichment sources and merges data
4
+ """
5
+
6
+ import sys
7
+ import asyncio
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Any
10
+ from datetime import datetime
11
+ import json
12
+
13
+ # Add project root to path
14
+ project_root = Path(__file__).parent.parent.parent
15
+ sys.path.insert(0, str(project_root))
16
+
17
+ from registry.collectors.epoch_collector import EpochCollector
18
+ from registry.collectors.hf_collector import HuggingFaceCollector
19
+ from registry.enrichment.web_enrichment import WebModelEnricher
20
+ from registry.evidence_profile import EvidenceProfileManager
21
+ from registry.inference.reconciliation import TokenInferenceReconciler
22
+
23
+
24
class ComprehensiveModelEnricher:
    """Orchestrates all enrichment sources for comprehensive model metadata.

    Data is gathered from the Epoch collector, the HuggingFace collector and
    (optionally) web search, merged into a single record, then annotated with
    token estimates and an evidence profile.
    """

    # Model-family keywords used for fuzzy name matching when a candidate's
    # display name only mentions the provider.
    _FAMILY_KEYWORDS = ("gpt", "claude", "gemini", "llama", "mistral")

    # URL fragments treated as first-party ("direct disclosure") sources.
    _OFFICIAL_DOMAINS = ("openai.com", "anthropic.com", "google.com", "meta.com")

    def __init__(
        self,
        use_web_search: bool = True,
        use_llm_extraction: bool = True,
        exa_api_key: Optional[str] = None,
        llm_provider: str = "openai",
        llm_api_key: Optional[str] = None
    ):
        """
        Initialize comprehensive enricher

        Args:
            use_web_search: Enable web search enrichment
            use_llm_extraction: Enable LLM extraction of the fetched web content
            exa_api_key: Exa API key
            llm_provider: LLM provider ("openai" or "anthropic")
            llm_api_key: LLM API key
        """
        self.epoch_collector = EpochCollector()
        self.hf_collector = HuggingFaceCollector()
        self.inference_reconciler = TokenInferenceReconciler()
        self.evidence_manager = EvidenceProfileManager()

        self.web_enricher = None
        if use_web_search:
            try:
                self.web_enricher = WebModelEnricher(
                    exa_api_key=exa_api_key,
                    llm_provider=llm_provider,
                    llm_api_key=llm_api_key
                )
            except Exception as e:
                # Web search is optional: keep going with the other sources.
                print(f"Warning: Web enricher initialization failed: {e}")

        # Honour the opt-out: the web enricher skips extraction whenever its
        # llm_extractor attribute is falsy, so clearing it disables LLM calls
        # while still collecting search results as sources.
        if self.web_enricher is not None and not use_llm_extraction:
            self.web_enricher.llm_extractor = None

    async def enrich_model(
        self,
        model_id: str,
        provider: str,
        family: Optional[str] = None,
        existing_data: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Enrich model with data from all sources

        Each source is best-effort: a failure is printed and the source is
        skipped, so one broken collector never aborts the enrichment.

        Args:
            model_id: Model identifier
            provider: Model provider
            family: Model family (defaults to the provider when omitted)
            existing_data: Existing model data; copied, never mutated

        Returns:
            Comprehensive enriched metadata. List-like fields (evidenceTypes,
            uncertaintySources, sources, rawEvidenceSnippets,
            compositionEstimates) are stored as JSON strings.
        """
        # Start with existing data or empty dict
        enriched = existing_data.copy() if existing_data else {}
        enriched["modelId"] = model_id
        enriched["provider"] = provider
        enriched["family"] = family or provider

        # Source 1: Epoch AI
        epoch_data = None
        try:
            # Epoch collector may be async or sync, handle both
            if hasattr(self.epoch_collector, 'fetch_notable_models'):
                epoch_models = self.epoch_collector.fetch_notable_models()
                if asyncio.iscoroutine(epoch_models):
                    epoch_models = await epoch_models
                for model in epoch_models or []:
                    if self._matches_model(model.get("model_name", ""), model_id, provider):
                        epoch_data = model
                        break
        except Exception as e:
            print(f"Epoch collection error: {e}")

        # Source 2: HuggingFace
        hf_data = None
        try:
            # HF collector may be async or sync, handle both
            if hasattr(self.hf_collector, 'search_models'):
                hf_result = self.hf_collector.search_models(model_id)
                if asyncio.iscoroutine(hf_result):
                    hf_result = await hf_result
                if hf_result:
                    # Take the first hit when a list is returned.
                    hf_data = hf_result[0] if isinstance(hf_result, list) else hf_result
        except Exception as e:
            print(f"HF collection error: {e}")

        # Source 3: Web search (if enabled)
        # NOTE: this call is synchronous and blocks the event loop while it runs.
        web_data = None
        if self.web_enricher:
            try:
                web_data = self.web_enricher.enrich_model(
                    model_id=model_id,
                    provider=provider,
                    existing_data=enriched
                )
            except Exception as e:
                print(f"Web enrichment error: {e}")

        # Merge all sources with priority
        merged = self._merge_sources(enriched, epoch_data, hf_data, web_data)

        # Run token inference if we have params
        if merged.get("params"):
            try:
                inference_input = {
                    "params": merged.get("params"),
                    "flops": merged.get("flopsReported"),
                    "architecture": {
                        "is_moe": merged.get("isMoe", False),
                        "num_experts": merged.get("numExperts"),
                    },
                    "provider": provider,
                    "model_id": model_id,
                }
                inference_result = self.inference_reconciler.reconcile(inference_input)

                # Add token estimates
                merged["tokensEstMin"] = inference_result.get("min")
                merged["tokensEstMax"] = inference_result.get("max")
                merged["tokensEstMid"] = inference_result.get("mid")
                merged["tokensRangeGeneratedAt"] = datetime.now()
            except Exception as e:
                print(f"Token inference error: {e}")

        # Generate evidence profile
        evidence_profile = self._generate_evidence_profile(
            epoch_data,
            hf_data,
            web_data,
            merged
        )

        # Add evidence profile fields
        merged["evidenceTypes"] = json.dumps(evidence_profile.get("evidence_types", []))
        merged["evidenceStrength"] = evidence_profile.get("strength")
        merged["uncertaintySources"] = json.dumps(evidence_profile.get("uncertainty", []))
        merged["evidenceProfileGeneratedAt"] = datetime.now()

        # Combine sources
        sources = []
        if epoch_data and epoch_data.get("source_url"):
            sources.append({
                "type": "epoch",
                "url": epoch_data.get("source_url"),
                "retrieved_at": datetime.now().isoformat(),
            })
        if hf_data and hf_data.get("url"):
            sources.append({
                "type": "huggingface",
                "url": hf_data.get("url"),
                "retrieved_at": datetime.now().isoformat(),
            })
        if web_data:
            sources.extend(web_data.get("sources") or [])

        if sources:
            # default=str keeps serialization from failing on values such as
            # datetime objects that a search client may return.
            merged["sources"] = json.dumps(sources, default=str)

        # Add raw evidence snippets
        raw_snippets = []
        if web_data:
            raw_snippets.extend(web_data.get("raw_evidence_snippets") or [])

        if raw_snippets:
            merged["rawEvidenceSnippets"] = json.dumps(raw_snippets, default=str)

        # Add composition estimates if training data info found
        if web_data and web_data.get("training_data_composition"):
            composition = {
                "description": web_data.get("training_data_composition"),
                "sources": web_data.get("training_data_sources") or [],
            }
            merged["compositionEstimates"] = json.dumps(composition, default=str)

        return merged

    def _matches_model(self, name: str, model_id: str, provider: str) -> bool:
        """Check whether a candidate display name refers to the requested model.

        A match is either a substring relation between the name and the model
        id (case-insensitive), or the provider appearing in the name together
        with a family keyword shared by both the name and the model id.
        Blank names or ids never match; without that guard an unnamed
        candidate would match every model because "" is a substring of
        everything.
        """
        name_lower = (name or "").strip().lower()
        model_lower = (model_id or "").strip().lower()
        provider_lower = (provider or "").strip().lower()

        if not name_lower or not model_lower:
            return False

        # Substring match in either direction
        if model_lower in name_lower or name_lower in model_lower:
            return True

        # Provider match + the same family keyword on both sides
        if provider_lower and provider_lower in name_lower:
            return any(
                keyword in model_lower and keyword in name_lower
                for keyword in self._FAMILY_KEYWORDS
            )

        return False

    def _merge_sources(
        self,
        base: Dict[str, Any],
        epoch_data: Optional[Dict[str, Any]],
        hf_data: Optional[Dict[str, Any]],
        web_data: Optional[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Merge data from all sources into a copy of ``base``.

        Epoch and HuggingFace only fill fields that are still empty. Web data
        fills empty releaseDate/architectureType, but overrides isMoe,
        numExperts, multimodal and the training period whenever it provides
        them.
        """
        merged = base.copy()

        # From Epoch (high priority - curated dataset)
        if epoch_data:
            if not merged.get("params") and epoch_data.get("parameter_count"):
                merged["params"] = epoch_data.get("parameter_count") / 1e9
            if not merged.get("releaseDate") and epoch_data.get("release_date"):
                merged["releaseDate"] = self._parse_date(epoch_data.get("release_date"))
            # "or ''" guards against keys that are present but set to None.
            epoch_arch = epoch_data.get("architecture_type") or ""
            if not merged.get("architectureType") and epoch_arch:
                merged["architectureType"] = epoch_arch
            if str(epoch_arch).lower() == "moe":
                merged["isMoe"] = True
            if not merged.get("multimodal"):
                modality = str(epoch_data.get("modality") or "").lower()
                merged["multimodal"] = modality in ["multimodal", "vision", "image"]
            if not merged.get("flopsReported") and epoch_data.get("compute_PF_days"):
                merged["flopsReported"] = epoch_data.get("compute_PF_days")

        # From HuggingFace (medium priority)
        if hf_data:
            if not merged.get("params") and hf_data.get("params"):
                merged["params"] = hf_data.get("params") / 1e9
            if not merged.get("releaseDate") and hf_data.get("created_at"):
                merged["releaseDate"] = self._parse_date(hf_data.get("created_at"))

        # From Web search (lower priority but comprehensive)
        if web_data:
            if not merged.get("releaseDate") and web_data.get("release_date"):
                merged["releaseDate"] = self._parse_date(web_data.get("release_date"))
            if not merged.get("architectureType") and web_data.get("architecture_type"):
                merged["architectureType"] = web_data.get("architecture_type")
            if web_data.get("is_moe") is not None:
                merged["isMoe"] = web_data.get("is_moe")
            if web_data.get("num_experts"):
                merged["numExperts"] = web_data.get("num_experts")
            if web_data.get("multimodal") is not None:
                merged["multimodal"] = web_data.get("multimodal")
            if web_data.get("training_period_start"):
                merged["trainingPeriodStart"] = self._parse_date(web_data.get("training_period_start"))
            if web_data.get("training_period_end"):
                merged["trainingPeriodEnd"] = self._parse_date(web_data.get("training_period_end"))

        return merged

    def _generate_evidence_profile(
        self,
        epoch_data: Optional[Dict[str, Any]],
        hf_data: Optional[Dict[str, Any]],
        web_data: Optional[Dict[str, Any]],
        merged: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Generate evidence profile from all sources.

        Returns:
            Dict with "evidence_types" (sorted, so the stored JSON is stable
            between runs), "strength" ("S-High" / "S-Medium" / "S-Low") and
            "uncertainty" (codes for fields that are still missing).
        """
        evidence_types = set()
        sources_count = 0

        # Epoch data - usually E4 (third-party analysis)
        if epoch_data:
            sources_count += 1
            evidence_types.add("E4")
            if epoch_data.get("parameter_count"):
                evidence_types.add("E3")  # Architecture evidence

        # HuggingFace - E4 (third-party)
        if hf_data:
            sources_count += 1
            evidence_types.add("E4")

        # Web data - can be E1-E5 depending on source
        if web_data:
            web_sources = web_data.get("sources") or []
            sources_count += len(web_sources)
            evidence_types.update(web_data.get("evidence_types") or [])

            # Direct disclosure if we have official sources
            for source in web_sources:
                url = str(source.get("url") or "").lower()
                if any(domain in url for domain in self._OFFICIAL_DOMAINS):
                    evidence_types.add("E1")  # Direct disclosure

        # Compute evidence
        if merged.get("flopsReported"):
            evidence_types.add("E2")

        # Architecture evidence
        if merged.get("params") or merged.get("architectureType"):
            evidence_types.add("E3")

        # Calculate strength
        if sources_count >= 3 and "E1" in evidence_types:
            strength = "S-High"
        elif sources_count >= 2 or "E1" in evidence_types:
            strength = "S-Medium"
        else:
            strength = "S-Low"

        # Identify uncertainties
        uncertainty = []
        if not merged.get("releaseDate"):
            uncertainty.append("U5")
        if not merged.get("architectureType"):
            uncertainty.append("U3")
        if not merged.get("trainingPeriodStart"):
            uncertainty.append("U2")

        return {
            "evidence_types": sorted(evidence_types, key=str),
            "strength": strength,
            "uncertainty": uncertainty,
        }

    def _parse_date(self, date_value: Any) -> Optional[datetime]:
        """Parse a date from various formats.

        Accepts datetime objects (returned unchanged), ISO-8601 strings
        (a trailing "Z" is read as UTC), four-digit year strings and integer
        years. Anything else, including years ``datetime`` cannot represent,
        yields None instead of raising.
        """
        if not date_value:
            return None

        if isinstance(date_value, datetime):
            return date_value

        year = None
        if isinstance(date_value, str):
            text = date_value.strip()
            try:
                return datetime.fromisoformat(text.replace("Z", "+00:00"))
            except ValueError:
                if len(text) == 4 and text.isdigit():
                    year = int(text)
        elif isinstance(date_value, int) and not isinstance(date_value, bool):
            year = date_value

        if year is None:
            return None

        try:
            return datetime(year, 1, 1)
        except (ValueError, OverflowError):
            # e.g. year 0, or an integer such as 20230101 that is not a year
            return None
370
+
registry/enrichment/llm_extractor.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM Extraction Service
3
+ Uses OpenAI or Anthropic API to extract structured data from web content
4
+ """
5
+
6
+ import os
7
+ import json
8
+ from typing import Dict, List, Optional, Any
9
+ from datetime import datetime
10
+ import re
11
+
12
+
13
class LLMExtractor:
    """Extracts structured model metadata from web content using LLM"""

    # Environment variable that holds the API key for each supported provider.
    _ENV_KEYS = {"openai": "OPENAI_API_KEY", "anthropic": "ANTHROPIC_API_KEY"}

    # Maximum number of characters of web content embedded in the prompt.
    _MAX_CONTENT_CHARS = 8000

    _SYSTEM_PROMPT = (
        "You are a data extraction assistant. Extract structured information "
        "about AI models from text. Always return valid JSON."
    )

    def __init__(self, provider: str = "openai", api_key: Optional[str] = None):
        """
        Initialize LLM extractor

        Args:
            provider: "openai" or "anthropic"
            api_key: API key (if None, reads from env)

        Raises:
            ValueError: unsupported provider, or no API key available
            ImportError: the provider's client package is not installed
        """
        self.provider = provider.lower()
        # Validate the provider before looking up a key so an unknown
        # provider is reported as such rather than as a missing key.
        if self.provider not in self._ENV_KEYS:
            raise ValueError(f"Unsupported provider: {provider}")

        env_var = self._ENV_KEYS[self.provider]
        self.api_key = api_key or os.getenv(env_var)
        if not self.api_key:
            raise ValueError(f"{env_var} environment variable required")

        if self.provider == "openai":
            try:
                import openai
            except ImportError as exc:
                raise ImportError("openai package required. Install with: pip install openai") from exc
            self.client = openai.OpenAI(api_key=self.api_key)
        else:  # anthropic
            try:
                import anthropic
            except ImportError as exc:
                raise ImportError("anthropic package required. Install with: pip install anthropic") from exc
            self.client = anthropic.Anthropic(api_key=self.api_key)

    def extract_model_metadata(
        self,
        model_id: str,
        provider: str,
        web_content: str,
        context: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Extract structured metadata from web content

        Args:
            model_id: Model identifier (e.g., "GPT-4")
            provider: Model provider (e.g., "OpenAI")
            web_content: Text content from web sources
            context: Additional context (existing model data)

        Returns:
            Dict with extracted metadata. Any failure (API error, unparsable
            reply) is printed and an empty extraction is returned instead.
        """
        try:
            prompt = self._build_extraction_prompt(model_id, provider, web_content, context)

            if self.provider == "openai":
                response = self.client.chat.completions.create(
                    model="gpt-4o-mini",  # Use cheaper model for extraction
                    messages=[
                        {"role": "system", "content": self._SYSTEM_PROMPT},
                        {"role": "user", "content": prompt}
                    ],
                    response_format={"type": "json_object"},
                    temperature=0.1,  # Low temperature for consistent extraction
                )
                result_text = response.choices[0].message.content
            else:  # anthropic
                response = self.client.messages.create(
                    model="claude-3-5-sonnet-20241022",
                    max_tokens=2000,
                    system=self._SYSTEM_PROMPT,
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                )
                result_text = response.content[0].text

            # Parse JSON response; replies wrapped in a markdown code fence
            # are unwrapped first.
            extracted = json.loads(self._strip_code_fence(result_text))

            # Validate and normalize
            return self._validate_extraction(extracted, model_id, provider)

        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return self._empty_extraction()
        except Exception as e:
            print(f"LLM extraction error: {e}")
            return self._empty_extraction()

    @staticmethod
    def _strip_code_fence(text: Optional[str]) -> str:
        """Return ``text`` without a surrounding ``` / ```json fence, if any."""
        cleaned = (text or "").strip()
        fenced = re.match(r"^```[a-zA-Z]*\s*(.*?)\s*```$", cleaned, re.DOTALL)
        return fenced.group(1) if fenced else cleaned

    def _build_extraction_prompt(
        self,
        model_id: str,
        provider: str,
        web_content: str,
        context: Optional[Dict[str, Any]] = None
    ) -> str:
        """Build the extraction prompt.

        The web content is truncated to ``_MAX_CONTENT_CHARS`` characters.
        Context is embedded as JSON; values JSON cannot represent natively
        (e.g. datetime) are rendered with ``str``.
        """
        context_str = ""
        if context:
            context_str = (
                "\n\nExisting known information:\n"
                f"{json.dumps(context, indent=2, default=str)}"
            )

        content = (web_content or "")[:self._MAX_CONTENT_CHARS]

        prompt = f"""Extract structured information about the AI model "{model_id}" by {provider} from the following text.

Text content:
{content}
{context_str}

Extract the following information and return as JSON:
{{
  "release_date": "YYYY-MM-DD or null if not found",
  "architecture_type": "Transformer, MoE, or null",
  "is_moe": true/false/null,
  "num_experts": number or null,
  "multimodal": true/false/null,
  "training_data_sources": ["source1", "source2"] or [],
  "training_data_composition": "description or null",
  "training_period_start": "YYYY-MM-DD or null",
  "training_period_end": "YYYY-MM-DD or null",
  "evidence_types": ["E1", "E2", "E3", "E4", "E5"] based on disclosure level,
  "confidence": "high/medium/low",
  "raw_snippets": ["relevant quote 1", "relevant quote 2"]
}}

Evidence type mapping:
- E1: Direct disclosure (company blog, paper, official announcement)
- E2: Compute evidence (FLOPs, hardware mentioned)
- E3: Architecture evidence (parameters, MoE details)
- E4: Third-party analysis (research paper, news article)
- E5: Qualitative hints (vague mentions, speculation)

Return only valid JSON, no additional text."""

        return prompt

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Coerce a JSON value to a list (None -> [], scalar string -> [string])."""
        if isinstance(value, list):
            return value
        if isinstance(value, str) and value:
            return [value]
        return []

    def _validate_extraction(
        self,
        extracted: Dict[str, Any],
        model_id: str,
        provider: str
    ) -> Dict[str, Any]:
        """Validate and normalize extracted data.

        Dates are normalized to "YYYY-MM-DD" (or None), list fields are
        always lists, and the architecture type is collapsed to "MoE" or
        "Transformer" (any other non-empty description defaults to
        "Transformer"). A reply that is not a JSON object yields an empty
        extraction.
        """
        if not isinstance(extracted, dict):
            return self._empty_extraction()

        validated = {
            "release_date": self._parse_date(extracted.get("release_date")),
            "architecture_type": extracted.get("architecture_type"),
            "is_moe": extracted.get("is_moe"),
            "num_experts": extracted.get("num_experts"),
            "multimodal": extracted.get("multimodal"),
            "training_data_sources": self._as_list(extracted.get("training_data_sources")),
            "training_data_composition": extracted.get("training_data_composition"),
            "training_period_start": self._parse_date(extracted.get("training_period_start")),
            "training_period_end": self._parse_date(extracted.get("training_period_end")),
            "evidence_types": self._as_list(extracted.get("evidence_types")),
            "confidence": extracted.get("confidence") or "medium",
            "raw_snippets": self._as_list(extracted.get("raw_snippets")),
        }

        # Normalize architecture type
        arch_raw = validated["architecture_type"]
        if isinstance(arch_raw, str) and arch_raw.strip():
            arch = arch_raw.lower()
            if "moe" in arch or "mixture" in arch:
                validated["architecture_type"] = "MoE"
                validated["is_moe"] = True
            else:
                # "transformer" and any unrecognised description
                validated["architecture_type"] = "Transformer"
        else:
            validated["architecture_type"] = None

        return validated

    def _parse_date(self, date_str: Optional[str]) -> Optional[str]:
        """Parse a date value to "YYYY-MM-DD".

        ISO dates/datetimes are validated (impossible dates are rejected);
        otherwise the first 20xx year found is returned as January 1st of
        that year. Returns None when nothing usable is found.
        """
        if date_str is None or isinstance(date_str, bool):
            return None

        # The LLM may return a bare number such as 2023 instead of a string.
        text = str(date_str).strip()
        if not text or text.lower() in ["null", "none"]:
            return None

        try:
            dt = datetime.fromisoformat(text.replace('Z', '+00:00'))
            return dt.strftime("%Y-%m-%d")
        except ValueError:
            pass

        # Try year only
        year_match = re.search(r'\b(20\d{2})\b', text)
        if year_match:
            return f"{year_match.group(1)}-01-01"

        return None

    def _empty_extraction(self) -> Dict[str, Any]:
        """Return empty extraction result"""
        return {
            "release_date": None,
            "architecture_type": None,
            "is_moe": None,
            "num_experts": None,
            "multimodal": None,
            "training_data_sources": [],
            "training_data_composition": None,
            "training_period_start": None,
            "training_period_end": None,
            "evidence_types": [],
            "confidence": "low",
            "raw_snippets": [],
        }
227
+
registry/enrichment/web_enrichment.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Search Enrichment Service
3
+ Uses Exa API to search for model information and extract metadata
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Any
10
+ from datetime import datetime, timedelta
11
+ import json
12
+
13
+ # Add project root to path
14
+ project_root = Path(__file__).parent.parent.parent
15
+ sys.path.insert(0, str(project_root))
16
+
17
+ try:
18
+ from ingestion.exa_client import ExaClient
19
+ except ImportError:
20
+ # Fallback if exa_client not available
21
+ ExaClient = None
22
+
23
+ from .llm_extractor import LLMExtractor
24
+
25
+
26
class WebModelEnricher:
    """Enriches model metadata using web search via Exa API"""

    def __init__(
        self,
        exa_api_key: Optional[str] = None,
        llm_provider: str = "openai",
        llm_api_key: Optional[str] = None
    ):
        """
        Initialize web enricher

        Both collaborators are optional: if either fails to initialize a
        warning is printed and the attribute is left as None.

        Args:
            exa_api_key: Exa API key (if None, reads from env)
            llm_provider: LLM provider ("openai" or "anthropic")
            llm_api_key: LLM API key (if None, reads from env)
        """
        # Initialize Exa client (ExaClient is None when its import failed)
        self.exa_client = None
        if ExaClient:
            try:
                self.exa_client = ExaClient(api_key=exa_api_key)
            except Exception as e:
                print(f"Warning: Exa client initialization failed: {e}")

        # Initialize LLM extractor
        try:
            self.llm_extractor = LLMExtractor(provider=llm_provider, api_key=llm_api_key)
        except Exception as e:
            print(f"Warning: LLM extractor initialization failed: {e}")
            self.llm_extractor = None

    def enrich_model(
        self,
        model_id: str,
        provider: str,
        existing_data: Optional[Dict[str, Any]] = None,
        num_search_results: int = 5
    ) -> Dict[str, Any]:
        """
        Enrich model metadata using web search

        Runs several search queries, de-duplicates the hits by URL, fetches
        the content of the best-scored ones and, when an LLM extractor is
        available, extracts structured fields from the combined text.

        Args:
            model_id: Model identifier
            provider: Model provider
            existing_data: Existing model data (for context)
            num_search_results: Number of results requested per query, and
                number of top results kept as sources

        Returns:
            Dict with enriched metadata; an empty enrichment when there is
            no search client or no search result.
        """
        if not self.exa_client:
            return self._empty_enrichment()

        # Search for information; a failing query is reported and skipped.
        all_results = []
        for query in self._build_search_queries(model_id, provider):
            try:
                results = self.exa_client.search(
                    query=query,
                    num_results=num_search_results
                )
                all_results.extend(results or [])
            except Exception as e:
                print(f"Exa search error for query '{query}': {e}")
                continue

        if not all_results:
            return self._empty_enrichment()

        # Deduplicate by URL, keeping the first occurrence
        seen_urls = set()
        unique_results = []
        for result in all_results:
            if result.url not in seen_urls:
                seen_urls.add(result.url)
                unique_results.append(result)

        # Sort by score, best first; results without a score sort last
        # instead of breaking the comparison.
        unique_results.sort(
            key=lambda r: r.score if r.score is not None else 0.0,
            reverse=True
        )
        top_results = unique_results[:num_search_results]

        # Fetch content from top results
        contents = {}
        try:
            contents = self.exa_client.get_contents([r.url for r in top_results]) or {}
        except Exception as e:
            print(f"Error fetching contents: {e}")

        # Combine all content
        combined_content = self._combine_content(unique_results, contents)

        # Extract structured data using LLM
        extracted = {}
        if self.llm_extractor and combined_content:
            try:
                extracted = self.llm_extractor.extract_model_metadata(
                    model_id=model_id,
                    provider=provider,
                    web_content=combined_content,
                    context=existing_data
                ) or {}
            except Exception as e:
                print(f"LLM extraction error: {e}")

        # NOTE(review): every snippet is attributed to the top-ranked URL;
        # the extractor does not report which page a quote came from.
        snippet_url = unique_results[0].url if unique_results else None

        # Build enrichment result ("or" keeps list fields as lists even when
        # the extractor returned null for them)
        enrichment = {
            "release_date": extracted.get("release_date"),
            "architecture_type": extracted.get("architecture_type"),
            "is_moe": extracted.get("is_moe"),
            "num_experts": extracted.get("num_experts"),
            "multimodal": extracted.get("multimodal"),
            "training_data_sources": extracted.get("training_data_sources") or [],
            "training_data_composition": extracted.get("training_data_composition"),
            "training_period_start": extracted.get("training_period_start"),
            "training_period_end": extracted.get("training_period_end"),
            "evidence_types": extracted.get("evidence_types") or [],
            "confidence": extracted.get("confidence", "medium"),
            "sources": [
                {
                    "type": "web_search",
                    "url": result.url,
                    "title": result.title,
                    "score": result.score,
                    "published_date": result.published_date,
                }
                for result in top_results
            ],
            "raw_evidence_snippets": [
                {
                    "text": snippet,
                    "source_url": snippet_url,
                }
                for snippet in (extracted.get("raw_snippets") or [])
            ],
        }

        return enrichment

    def _build_search_queries(self, model_id: str, provider: str) -> List[str]:
        """Build search queries for model information"""
        queries = [
            f"{model_id} {provider} release date architecture training data",
            f"{model_id} {provider} system card technical details",
            f"{model_id} {provider} training dataset sources",
            f"{model_id} {provider} model card paper",
        ]

        # Add provider-specific queries
        if provider.lower() in ["openai", "anthropic", "google", "meta"]:
            queries.append(f"{provider} {model_id} official announcement blog")

        return queries

    def _combine_content(
        self,
        results: List[Any],
        contents: Dict[str, str]
    ) -> str:
        """Combine content from search results into one text block.

        For each result the fetched page content is preferred; results
        without fetched content fall back to their title and summary, and
        results with neither are skipped.
        """
        contents = contents or {}
        combined = []

        for result in results:
            if result.url in contents:
                combined.append(f"--- Content from {result.url} ---\n{contents[result.url]}")
            elif result.summary:
                combined.append(f"--- Summary from {result.url} ---\n{result.title}\n{result.summary}")

        return "\n\n".join(combined)

    def _empty_enrichment(self) -> Dict[str, Any]:
        """Return empty enrichment result"""
        return {
            "release_date": None,
            "architecture_type": None,
            "is_moe": None,
            "num_experts": None,
            "multimodal": None,
            "training_data_sources": [],
            "training_data_composition": None,
            "training_period_start": None,
            "training_period_end": None,
            "evidence_types": [],
            "confidence": "low",
            "sources": [],
            "raw_evidence_snippets": [],
        }
220
+
registry/evidence_profile.py CHANGED
@@ -143,4 +143,68 @@ class EvidenceProfileManager:
143
  explanation += f". Uncertainties: {unc_desc}"
144
 
145
  return explanation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
 
143
  explanation += f". Uncertainties: {unc_desc}"
144
 
145
  return explanation
146
+
147
@staticmethod
def generate_from_web_data(
    web_data: Dict[str, Any],
    existing_evidence: Optional[Dict[str, Any]] = None
) -> Dict[str, Any]:
    """
    Generate evidence profile from web-extracted data

    Strength is derived from the number of sources and the extraction
    confidence; an uncertainty code is added for each piece of information
    the web data is missing. When an existing profile is given, evidence
    types and uncertainties are unioned and the stronger strength wins.

    Args:
        web_data: Extracted data from web search/LLM
        existing_evidence: Existing evidence profile to merge with

    Returns:
        Evidence profile dict. "evidence_types" is sorted and "uncertainty"
        keeps first-seen order, so the result is deterministic.
    """
    # "or []" tolerates keys that are present but set to None.
    evidence_types = set(web_data.get("evidence_types") or [])
    confidence = web_data.get("confidence", "medium")
    sources_count = len(web_data.get("sources") or [])

    # Determine evidence strength based on sources and confidence
    if sources_count >= 3 and confidence == "high":
        strength = "S-High"
    elif sources_count >= 2 or confidence == "high":
        strength = "S-Medium"
    else:
        strength = "S-Low"

    # Identify uncertainty sources for missing information
    uncertainty_sources = []
    if not web_data.get("release_date"):
        uncertainty_sources.append("U5")  # Intentional opacity or missing
    if not web_data.get("architecture_type"):
        uncertainty_sources.append("U3")  # Architecture unclear
    if not web_data.get("training_data_sources"):
        uncertainty_sources.append("U2")  # Data composition unknown

    # If we have existing evidence, merge it
    if existing_evidence:
        evidence_types.update(existing_evidence.get("evidence_types") or [])

        # Use higher strength if available
        existing_strength = existing_evidence.get("strength")
        if existing_strength:
            strength_order = {"S-High": 3, "S-Medium": 2, "S-Low": 1}
            if strength_order.get(existing_strength, 0) > strength_order.get(strength, 0):
                strength = existing_strength

        # Merge uncertainties without duplicates, preserving order
        for code in existing_evidence.get("uncertainty") or []:
            if code not in uncertainty_sources:
                uncertainty_sources.append(code)

    # Create evidence profile
    profile = {
        "evidence_types": sorted(evidence_types, key=str),
        "strength": strength,
        "uncertainty": uncertainty_sources,
        "generated_at": datetime.now().isoformat(),
        "evidence_version": "1.0",
    }

    return profile
210
 
registry/ingest_priority_models.py CHANGED
@@ -25,6 +25,7 @@ from registry.collectors.epoch_collector import EpochCollector
25
  from registry.collectors.hf_collector import HuggingFaceCollector
26
  from registry.inference.reconciliation import TokenInferenceReconciler
27
  from registry.linkage import create_deal_model_linkages
 
28
  from dotenv import load_dotenv
29
 
30
  # Prisma imports
@@ -41,12 +42,25 @@ load_dotenv()
41
  class PriorityModelIngester:
42
  """Programmatic ingester for priority models"""
43
 
44
- def __init__(self):
45
  self.epoch_collector = EpochCollector()
46
  self.hf_collector = HuggingFaceCollector()
47
  self.inference_reconciler = TokenInferenceReconciler()
48
  self.prisma = None
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  async def connect_db(self):
51
  """Connect to Prisma database"""
52
  if not PRISMA_AVAILABLE:
@@ -226,20 +240,63 @@ class PriorityModelIngester:
226
  where={"modelId": model_id}
227
  )
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  if existing:
230
  # Update
 
231
  updated = await self.prisma.modelregistry.update(
232
  where={"id": existing.id},
233
- data={
234
- **{k: v for k, v in model_data.items() if k != "modelId"},
235
- "updatedAt": datetime.now(),
236
- }
237
  )
238
  return updated.id
239
  else:
240
  # Create
 
241
  created = await self.prisma.modelregistry.create(
242
- data=model_data
243
  )
244
  return created.id
245
 
@@ -317,24 +374,40 @@ class PriorityModelIngester:
317
 
318
  print(f"\n📦 Processing: {model_id} ({provider})")
319
 
320
- # Step 1: Fetch metadata
321
- print(f" 🔍 Fetching metadata...")
322
  epoch_data = await self.fetch_epoch_data(model_id, provider)
323
  hf_data = await self.fetch_hf_data(model_id, provider)
324
 
325
- # Step 2: Merge metadata
326
  model_data = self.merge_metadata(priority_model, epoch_data, hf_data)
327
 
328
- # Step 3: Run token inference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  print(f" 🧠 Running token inference...")
330
  inference_results = await self.run_token_inference(model_data)
331
  model_data.update(inference_results)
332
 
333
- # Step 4: Store in database
334
  print(f" 💾 Storing in database...")
335
  await self.upsert_model(model_data)
336
 
337
- # Step 5: Create linkages
338
  print(f" 🔗 Creating deal linkages...")
339
  await self.create_linkages(model_id)
340
 
 
25
  from registry.collectors.hf_collector import HuggingFaceCollector
26
  from registry.inference.reconciliation import TokenInferenceReconciler
27
  from registry.linkage import create_deal_model_linkages
28
+ from registry.enrichment.comprehensive_enrichment import ComprehensiveModelEnricher
29
  from dotenv import load_dotenv
30
 
31
  # Prisma imports
 
42
  class PriorityModelIngester:
43
  """Programmatic ingester for priority models"""
44
 
45
+ def __init__(self, use_web_enrichment: bool = True):
46
  self.epoch_collector = EpochCollector()
47
  self.hf_collector = HuggingFaceCollector()
48
  self.inference_reconciler = TokenInferenceReconciler()
49
  self.prisma = None
50
 
51
+ # Initialize comprehensive enricher if web enrichment enabled
52
+ if use_web_enrichment:
53
+ try:
54
+ self.comprehensive_enricher = ComprehensiveModelEnricher(
55
+ use_web_search=True,
56
+ use_llm_extraction=True
57
+ )
58
+ except Exception as e:
59
+ print(f"Warning: Comprehensive enricher initialization failed: {e}")
60
+ self.comprehensive_enricher = None
61
+ else:
62
+ self.comprehensive_enricher = None
63
+
64
  async def connect_db(self):
65
  """Connect to Prisma database"""
66
  if not PRISMA_AVAILABLE:
 
240
  where={"modelId": model_id}
241
  )
242
 
243
+ # Prepare data for Prisma (convert field names)
244
+ prisma_data = {}
245
+
246
+ # Map field names from model_data to Prisma schema
247
+ field_mapping = {
248
+ "modelId": "modelId",
249
+ "provider": "provider",
250
+ "family": "family",
251
+ "params": "params",
252
+ "releaseDate": "releaseDate",
253
+ "architectureType": "architectureType",
254
+ "isMoe": "isMoe",
255
+ "numExperts": "numExperts",
256
+ "multimodal": "multimodal",
257
+ "tokensEstMin": "tokensEstMin",
258
+ "tokensEstMax": "tokensEstMax",
259
+ "tokensEstMid": "tokensEstMid",
260
+ "tokensRangeGeneratedAt": "tokensRangeGeneratedAt",
261
+ "evidenceTypes": "evidenceTypes",
262
+ "evidenceStrength": "evidenceStrength",
263
+ "uncertaintySources": "uncertaintySources",
264
+ "evidenceProfileGeneratedAt": "evidenceProfileGeneratedAt",
265
+ "sources": "sources",
266
+ "rawEvidenceSnippets": "rawEvidenceSnippets",
267
+ "compositionEstimates": "compositionEstimates",
268
+ "trainingPeriodStart": "trainingPeriodStart",
269
+ "trainingPeriodEnd": "trainingPeriodEnd",
270
+ }
271
+
272
+ for key, value in model_data.items():
273
+ if key in field_mapping and value is not None:
274
+ prisma_key = field_mapping[key]
275
+ # Convert date strings to datetime if needed
276
+ if prisma_key in ["releaseDate", "tokensRangeGeneratedAt", "evidenceProfileGeneratedAt", "trainingPeriodStart", "trainingPeriodEnd"]:
277
+ if isinstance(value, str):
278
+ try:
279
+ prisma_data[prisma_key] = datetime.fromisoformat(value.replace("Z", "+00:00"))
280
+ except:
281
+ pass
282
+ elif isinstance(value, datetime):
283
+ prisma_data[prisma_key] = value
284
+ else:
285
+ prisma_data[prisma_key] = value
286
+
287
  if existing:
288
  # Update
289
+ prisma_data["updatedAt"] = datetime.now()
290
  updated = await self.prisma.modelregistry.update(
291
  where={"id": existing.id},
292
+ data=prisma_data
 
 
 
293
  )
294
  return updated.id
295
  else:
296
  # Create
297
+ prisma_data["modelId"] = model_id
298
  created = await self.prisma.modelregistry.create(
299
+ data=prisma_data
300
  )
301
  return created.id
302
 
 
374
 
375
  print(f"\n📦 Processing: {model_id} ({provider})")
376
 
377
+ # Step 1: Fetch metadata from Epoch and HF
378
+ print(f" 🔍 Fetching metadata from Epoch/HF...")
379
  epoch_data = await self.fetch_epoch_data(model_id, provider)
380
  hf_data = await self.fetch_hf_data(model_id, provider)
381
 
382
+ # Step 2: Merge metadata from Epoch/HF
383
  model_data = self.merge_metadata(priority_model, epoch_data, hf_data)
384
 
385
+ # Step 3: Web enrichment (if enabled)
386
+ if self.comprehensive_enricher:
387
+ print(f" 🌐 Running web enrichment...")
388
+ try:
389
+ # Use comprehensive enricher which includes web search
390
+ enriched = await self.comprehensive_enricher.enrich_model(
391
+ model_id=model_id,
392
+ provider=provider,
393
+ family=priority_model.get("family"),
394
+ existing_data=model_data
395
+ )
396
+ # Merge web enrichment results
397
+ model_data.update(enriched)
398
+ except Exception as e:
399
+ print(f" Warning: Web enrichment error: {e}")
400
+
401
+ # Step 4: Run token inference
402
  print(f" 🧠 Running token inference...")
403
  inference_results = await self.run_token_inference(model_data)
404
  model_data.update(inference_results)
405
 
406
+ # Step 5: Store in database
407
  print(f" 💾 Storing in database...")
408
  await self.upsert_model(model_data)
409
 
410
+ # Step 6: Create linkages
411
  print(f" 🔗 Creating deal linkages...")
412
  await self.create_linkages(model_id)
413
 
registry/requirements.txt CHANGED
@@ -4,3 +4,6 @@ huggingface-hub>=0.16.0
4
  pandas>=2.0.0
5
  python-dotenv>=1.0.0
6
  prisma>=0.11.0
 
 
 
 
4
  pandas>=2.0.0
5
  python-dotenv>=1.0.0
6
  prisma>=0.11.0
7
+ openai>=1.0.0
8
+ anthropic>=0.18.0
9
+ requests>=2.31.0