Spaces:
Running
Running
/**
 * Open Dataset Collection Engine
 *
 * Opt-in data collection for building an open source research dataset.
 * Researchers who enable `contribute_to_dataset: true` in their requests
 * have their (anonymized) interaction data stored for the community.
 *
 * Stored data:
 * - Messages sent and received (no API keys, no IPs)
 * - AutoTune parameters and context detection results
 * - Model used and response metadata
 * - User feedback/ratings
 * - Parseltongue and STM pipeline metadata
 *
 * Privacy guarantees:
 * - Strictly opt-in per request
 * - No PII: API keys, IPs, and auth tokens are NEVER stored
 * - Dataset is exportable via GET /v1/dataset/export
 * - Caller can request deletion via DELETE /v1/dataset/:id
 */
| import { randomUUID } from 'crypto' | |
// ── Types ────────────────────────────────────────────────────────────
/**
 * One opted-in interaction as kept in the dataset store.
 *
 * `id` and `timestamp` are assigned by the store when the entry is added;
 * everything else is supplied by the caller.
 */
export interface DatasetEntry {
  id: string
  timestamp: number

  // ── Request metadata ──
  /** Which API endpoint was called. */
  endpoint: string
  model: string
  mode: 'standard' | 'ultraplinian'

  // ── Conversation ──
  /** Messages, stripped of system prompts to avoid leaking custom prompts. */
  messages: { role: string; content: string }[]
  response: string

  /** AutoTune data. */
  autotune?: {
    strategy: string
    detected_context: string
    confidence: number
    params: Record<string, number>
    reasoning: string
  }

  /** Parseltongue data. */
  parseltongue?: {
    triggers_found: string[]
    technique_used: string
    transformations_count: number
  }

  /** STM data. */
  stm?: {
    modules_applied: string[]
  }

  /** ULTRAPLINIAN race data. */
  ultraplinian?: {
    tier: string
    models_queried: string[]
    winner_model: string
    all_scores: { model: string; score: number; duration_ms: number; success: boolean }[]
    total_duration_ms: number
  }

  /** Feedback, added later via POST /v1/feedback if the user rates. */
  feedback?: {
    rating: 1 | -1
    heuristics?: {
      response_length: number
      repetition_score: number
      vocabulary_diversity: number
    }
  }
}
// ── In-Memory Store ──────────────────────────────────────────────────
// Entries are held in process memory only, which is fine for a research
// preview. For production, swap in a persistent store (SQLite, PostgreSQL,
// or HF Dataset repo push).
let dataset: Array<DatasetEntry> = []

// Upper bound on retained entries, to prevent unbounded memory growth.
const MAX_ENTRIES = 10_000
// ── Public API ───────────────────────────────────────────────────────
/**
 * Append an entry to the in-memory dataset.
 *
 * The store generates the entry's `id` (a random UUID) and `timestamp`
 * (current epoch milliseconds). When the store grows past `MAX_ENTRIES`,
 * the oldest entries are dropped so that only the newest `MAX_ENTRIES` remain.
 *
 * @param entry - Entry payload without `id` and `timestamp`.
 * @returns The generated id of the stored entry.
 */
export function addEntry(entry: Omit<DatasetEntry, 'id' | 'timestamp'>): string {
  const entryId = randomUUID()
  const createdAt = Date.now()

  dataset.push({ ...entry, id: entryId, timestamp: createdAt })

  // Keep only the newest MAX_ENTRIES records (the array is rebound, not
  // trimmed in place).
  const overCap = dataset.length > MAX_ENTRIES
  if (overCap) {
    dataset = dataset.slice(-MAX_ENTRIES)
  }

  return entryId
}
/**
 * Attach feedback to a previously stored entry, replacing any feedback
 * already present on it.
 *
 * @param entryId - Id returned when the entry was added.
 * @param feedback - Feedback value to store on the entry.
 * @returns `true` if an entry with that id was found and updated,
 *   `false` otherwise.
 */
export function addFeedbackToEntry(
  entryId: string,
  feedback: DatasetEntry['feedback'],
): boolean {
  for (const candidate of dataset) {
    if (candidate.id === entryId) {
      candidate.feedback = feedback
      return true
    }
  }
  return false
}
/**
 * Remove the entry with the given id from the dataset.
 *
 * @param id - Id of the entry to remove.
 * @returns `true` if an entry was removed, `false` if no entry has that id.
 */
export function deleteEntry(id: string): boolean {
  for (let position = 0; position < dataset.length; position++) {
    if (dataset[position].id === id) {
      // Only the first match is removed.
      dataset.splice(position, 1)
      return true
    }
  }
  return false
}
/**
 * Return a snapshot of all stored entries in insertion order.
 *
 * A shallow copy is returned rather than the internal array, so that callers
 * cannot add, remove, or reorder stored entries by mutating the result. The
 * entry objects themselves are shared with the store, not cloned.
 *
 * @returns A new array containing the current entries.
 */
export function getDataset(): DatasetEntry[] {
  return [...dataset]
}
/**
 * Summarize the current contents of the dataset.
 *
 * Counts are accumulated in `Map`s and converted to plain objects at the end.
 * `model` and `detected_context` are free-form strings; indexing an object
 * literal with them would collide with inherited members (`constructor`,
 * `toString`, `__proto__`, ...) and produce wrong or missing counts.
 *
 * @returns Total entry count, number of entries carrying feedback, per-mode /
 *   per-model / per-detected-context counts, and the timestamps of the first
 *   and last entries in the store (`null` when the store is empty).
 */
export function getDatasetStats(): {
  total_entries: number
  entries_with_feedback: number
  mode_breakdown: Record<string, number>
  model_breakdown: Record<string, number>
  context_breakdown: Record<string, number>
  oldest_entry: number | null
  newest_entry: number | null
} {
  const modeCounts = new Map<string, number>()
  const modelCounts = new Map<string, number>()
  const contextCounts = new Map<string, number>()

  const bump = (counts: Map<string, number>, key: string): void => {
    counts.set(key, (counts.get(key) ?? 0) + 1)
  }

  let withFeedback = 0
  for (const entry of dataset) {
    bump(modeCounts, entry.mode)
    bump(modelCounts, entry.model)

    // Entries without AutoTune data, or with an empty context, are not counted.
    const ctx = entry.autotune?.detected_context
    if (ctx) bump(contextCounts, ctx)

    if (entry.feedback) withFeedback++
  }

  return {
    total_entries: dataset.length,
    entries_with_feedback: withFeedback,
    // Object.fromEntries defines own properties, so any key is represented.
    mode_breakdown: Object.fromEntries(modeCounts),
    model_breakdown: Object.fromEntries(modelCounts),
    context_breakdown: Object.fromEntries(contextCounts),
    // Entries are appended in arrival order, so the ends of the array are the
    // oldest and newest records.
    oldest_entry: dataset[0]?.timestamp ?? null,
    newest_entry: dataset[dataset.length - 1]?.timestamp ?? null,
  }
}