"use client" import { useState, useEffect, useMemo } from "react" import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card" import { Button } from "@/components/ui/button" import { RadioGroup, RadioGroupItem } from "@/components/ui/radio-group" import { Label } from "@/components/ui/label" import { Textarea } from "@/components/ui/textarea" import { Input } from "@/components/ui/input" import { Badge } from "@/components/ui/badge" import { Separator } from "@/components/ui/separator" import type { CategoryScore } from "@/components/ai-evaluation-dashboard" import { HelpCircle, CheckCircle, Plus, Trash2 } from "lucide-react" import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "@/components/ui/tooltip" import { BENCHMARK_QUESTIONS, PROCESS_QUESTIONS, SOURCE_TYPES, ADDITIONAL_ASPECTS_SECTION, getFieldPlaceholder, getHint } from "@/lib/category-data" // The detailed per-category and per-question hints, plus recommended placeholders, // are centralized in `lib/category-data.ts`. This component uses the exported // helpers `getHint` and `getFieldPlaceholder` and the question lists. 
// CustomFieldComponent — renders one labeled, controlled input for a question-specific
// structured field (e.g. A2/"thresholds"). It is a controlled component: the caller owns
// the string value and receives edits through `onChange`.
//
// The nested `getFieldConfig` helper maps (questionId, fieldType) to a { label, placeholder }
// pair via a hard-coded lookup table covering questions A2–A7, B1, B2, B5, B6 and B8;
// unknown pairs fall back to using the raw fieldType as the label with an empty placeholder.
//
// NOTE(review): the annotation `Record>` below is malformed — the generic parameters were
// evidently stripped in transit (likely originally
// `Record<string, Record<string, { label: string; placeholder: string }>>`); confirm against
// version control before fixing.
// NOTE(review): this local table appears to duplicate the role of the centralized helpers
// imported from `lib/category-data.ts` (`getHint`, `getFieldPlaceholder`) — consider
// consolidating; verify with the module owner.
const CustomFieldComponent = ({ questionId, fieldType, value, onChange, }: { questionId: string fieldType: string value: string onChange: (value: string) => void }) => { const getFieldConfig = (questionId: string, fieldType: string) => { const configs: Record> = { A2: { thresholds: { label: "Quantitative Thresholds", placeholder: "e.g., >85% accuracy, <0.1 error rate" }, thresholdSource: { label: "Threshold Source", placeholder: "e.g., industry standard, research paper, policy requirement", }, passFail: { label: "Pass/Fail Determination", placeholder: "e.g., Pass - exceeded 85% threshold" }, }, A3: { comparativeScores: { label: "Comparative Scores", placeholder: "e.g., Our model: 87.2%, GPT-4: 85.1%, Previous version: 82.3%", }, baselineType: { label: "Baseline Type", placeholder: "e.g., SOTA, previous version, industry standard" }, significance: { label: "Statistical Significance", placeholder: "e.g., p<0.05, 95% CI: [1.2, 3.8]" }, }, A4: { testTypes: { label: "Test Types", placeholder: "e.g., adversarial attacks, load testing, distribution shift" }, failureRates: { label: "Failure/Degradation Rates", placeholder: "e.g., 15% failure under adversarial inputs" }, robustnessMetrics: { label: "Robustness Metrics", placeholder: "e.g., attack success rate, performance drop %", }, }, A5: { liveMetrics: { label: "Live Metrics Tracked", placeholder: "e.g., error rates, latency, drift detection" }, samplingCadence: { label: "Sampling Cadence", placeholder: "e.g., every 1000 requests, hourly, daily" }, alertThresholds: { label: "Alert Thresholds", placeholder: "e.g., >5% error rate, >500ms latency" }, }, A6: { procedure: { label: "Contamination Check Procedure", placeholder: "e.g., n-gram overlap analysis, URL deduplication", }, contaminationRate: { label: "Contamination Rate", placeholder: "e.g., <1% overlap detected, 0.3% exact matches", }, mitigations: { label: "Mitigations Taken", placeholder: "e.g., removed overlapping samples, used holdout set" }, }, A7: {
// A7: cross-system comparison fields.
// NOTE(review): the B8 `triggers` placeholder string two entries below is broken by a raw
// newline ("...security / incidents") — almost certainly a line-wrap artifact from extraction,
// not intentional; the literal should be rejoined onto one line. Verify against the original file.
comparisonSystems: { label: "Comparison Systems", placeholder: "e.g., GPT-4, Claude-3, Gemini Pro" }, evaluationConditions: { label: "Evaluation Conditions", placeholder: "e.g., same prompts, temperature=0, identical hardware", }, relativeMetrics: { label: "Relative Performance Metrics", placeholder: "e.g., 15% better accuracy, 2x faster inference", }, }, B1: { scope: { label: "Evaluation Scope", placeholder: "e.g., measures reasoning capability in mathematical contexts", }, successFailureDefinitions: { label: "Success/Failure Definitions", placeholder: "e.g., success = >80% on grade-level problems", }, hypotheses: { label: "Hypotheses Being Tested", placeholder: "e.g., model can solve multi-step word problems" }, }, B2: { replicationPackage: { label: "Replication Package", placeholder: "e.g., GitHub repo with code, configs, prompts", }, accessLevel: { label: "Access Level", placeholder: "e.g., public, access-controlled, internal only" }, proxies: { label: "Proxies (if not shareable)", placeholder: "e.g., synthetic examples, anonymized data" }, }, B5: { reviewers: { label: "Reviewers", placeholder: "e.g., domain experts, affected user groups, ethics board" }, feedbackChanges: { label: "Changes from Feedback", placeholder: "e.g., added bias metrics, revised interpretation", }, disagreements: { label: "Unresolved Disagreements", placeholder: "e.g., threshold levels, risk severity ratings", }, }, B6: { uncertaintyDisclosure: { label: "Uncertainty Disclosure", placeholder: "e.g., error bars, confidence intervals, variance across runs", }, axesConsistency: { label: "Axes Consistency", placeholder: "e.g., consistent 0-100 scale, no truncated axes" }, sampleSizes: { label: "Sample Sizes", placeholder: "e.g., n=1000 test samples, 5 random seeds" }, selectionCriteria: { label: "Selection Criteria", placeholder: "e.g., all results shown, no cherry-picking" }, }, B8: { triggers: { label: "Re-evaluation Triggers", placeholder: "e.g., model updates, data drift >5%, security
incidents", }, versionedSpecs: { label: "Versioned Eval Specs", placeholder: "e.g., eval spec v2.1, change log maintained" }, auditTrail: { label: "Audit Trail", placeholder: "e.g., all changes logged with timestamps and rationale" }, mitigationProtocols: { label: "Mitigation Protocols", placeholder: "e.g., automated rollback, manual review process", }, retestProcedures: { label: "Retest Procedures", placeholder: "e.g., full eval suite after fixes, regression testing", }, }, } return configs[questionId]?.[fieldType] || { label: fieldType, placeholder: "" } } const config = getFieldConfig(questionId, fieldType) return (
// NOTE(review): the component's JSX render body continues beyond this chunk — the parenthesis
// opened by `return (` above is closed outside the visible source.