/**
 *
 * Copyright 2023-2025 InspectorRAGet Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **/
'use client';

import { countBy, isEmpty } from 'lodash';
import cx from 'classnames';
import { useState, useMemo, useEffect, useRef } from 'react';
import { Tile, Button, Slider } from '@carbon/react';
import { WarningAlt } from '@carbon/icons-react';
import { ScatterChart } from '@carbon/charts-react';

import { useTheme } from '@/src/theme';
import { Model, Metric, TaskEvaluation } from '@/src/types';
import {
  castToNumber,
  AgreementLevels,
  extractMetricDisplayName,
} from '@/src/utilities/metrics';
import { calculateFisherRandomization } from '@/src/utilities/significance';
import { areObjectsIntersecting } from '@/src/utilities/objects';
import { hash } from '@/src/utilities/strings';
import Filters from '@/src/components/filters/Filters';
import TasksTable from '@/src/views/tasks-table/TasksTable';
import ModelSelector from '@/src/components/selectors/ModelSelector';
import MetricSelector from '@/src/components/selectors/MetricSelector';
import { getModelColorPalette } from '@/src/utilities/colors';

import '@carbon/charts-react/styles.css';
import classes from './ModelComparator.module.scss';
// ===================================================================================
// TYPES
// ===================================================================================
type StatisticalInformation = {
  p: number;
  distributionA: number[];
  meanA: number;
  distributionB: number[];
  meanB: number;
  taskIds?: string[];
};

interface Props {
  evaluationsPerMetric: { [key: string]: TaskEvaluation[] };
  models: Model[];
  metrics: Metric[];
  filters: { [key: string]: string[] };
  onTaskSelection: Function;
}
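
// Note: an illustrative sketch (an assumption, inferred from the usage below)
// of the TaskEvaluation fields this component relies on. Each evaluation
// carries a `taskId`, a `modelId`, and one `<metric>_agg` object per metric
// holding the aggregate `value` and an agreement `level` (checked against
// AgreementLevels.NO_AGREEMENT). The metric name 'faithfulness' is
// hypothetical.
//
//   const exampleEvaluation = {
//     taskId: 'task-1',
//     modelId: 'model-a',
//     faithfulness_agg: { value: 0.75, level: /* AgreementLevels member */ 1 },
//   };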
// ===================================================================================
// COMPUTE FUNCTIONS
// ===================================================================================
/**
 * Build an array containing evaluations only for the selected models for each task.
 *
 * Eligibility criteria:
 *
 * 1. The task must have evaluations for both selected models.
 *
 * 2. Each evaluation must have an agreement value for the selected metric.
 *
 * @param evaluations evaluations for all tasks
 * @param modelA selected model
 * @param modelB selected model
 * @param metric selected metric
 * @param selectedFilters currently applied filters
 * @param selectedMetricRange optional [lower, upper] bounds on the aggregate value
 * @returns
 */
function extractEvaluationsPerTask(
  evaluations: TaskEvaluation[],
  modelA: Model,
  modelB: Model,
  metric: string,
  selectedFilters: { [key: string]: string[] },
  selectedMetricRange?: number[],
) {
  // Step 1: Initialize necessary variables
  const modelEvaluationsPerTask: { [key: string]: TaskEvaluation[] } = {};

  // Step 2: Add to a task's model evaluations, if the evaluation meets the eligibility criteria
  evaluations.forEach((evaluation) => {
    if (
      (evaluation.modelId === modelA.modelId ||
        evaluation.modelId === modelB.modelId) &&
      evaluation[`${metric}_agg`].level !== AgreementLevels.NO_AGREEMENT &&
      (!isEmpty(selectedFilters)
        ? areObjectsIntersecting(selectedFilters, evaluation)
        : true)
    ) {
      const modelEvaluationsForTask =
        modelEvaluationsPerTask[evaluation.taskId];
      if (modelEvaluationsForTask) {
        modelEvaluationsForTask.push(evaluation);
      } else {
        modelEvaluationsPerTask[evaluation.taskId] = [evaluation];
      }
    }
  });

  // Step 3: Retain only those tasks which have evaluations for both models
  // and where one or more models have an aggregate value in the selected range
  return Object.values(modelEvaluationsPerTask).filter(
    (entry) =>
      entry.length === 2 &&
      (selectedMetricRange
        ? (entry[0][`${metric}_agg`].value >= selectedMetricRange[0] &&
            entry[0][`${metric}_agg`].value <= selectedMetricRange[1]) ||
          (entry[1][`${metric}_agg`].value >= selectedMetricRange[0] &&
            entry[1][`${metric}_agg`].value <= selectedMetricRange[1])
        : true),
  );
}
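
// Usage sketch (model and metric names are hypothetical): the return value is
// an array with one two-element entry per eligible task, pairing the two
// selected models' evaluations in encounter order. Callers inspect `modelId`
// on each element rather than relying on the position within a pair.
//
//   const pairs = extractEvaluationsPerTask(
//     evaluationsPerMetric['faithfulness'],
//     modelA,
//     modelB,
//     'faithfulness',
//     {},
//   );
//   // pairs[0] => the two evaluations (one per model) for a single task
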
/**
 * Run a statistical significance test based on the Fisher randomization method.
 * @param evaluationsPerMetric evaluations per metric
 * @param metrics metrics
 * @param modelA selected model
 * @param modelB selected model
 * @param selectedMetric if `undefined`, run for all metrics in the `evaluationsPerMetric` object
 * @param selectedFilters currently applied filters
 * @param selectedMetricRange optional [lower, upper] bounds on the aggregate value
 * @returns
 */
function runStatisticalSignificanceTest(
  evaluationsPerMetric: { [key: string]: TaskEvaluation[] },
  metrics: Metric[],
  modelA: Model,
  modelB: Model,
  selectedMetric: Metric | undefined,
  selectedFilters: { [key: string]: string[] },
  selectedMetricRange?: number[],
) {
  // Step 1: Initialize necessary variables
  const evaluationsPerMetricPerTask: { [key: string]: TaskEvaluation[][] } = {};

  // Step 2: Retain evaluations for tasks where both models have an agreement value
  if (selectedMetric) {
    const evaluationsPerTask = extractEvaluationsPerTask(
      evaluationsPerMetric[selectedMetric.name],
      modelA,
      modelB,
      selectedMetric.name,
      selectedFilters,
      selectedMetricRange,
    );
    if (evaluationsPerTask.length !== 0) {
      evaluationsPerMetricPerTask[selectedMetric.name] = evaluationsPerTask;
    }
  } else {
    Object.keys(evaluationsPerMetric).forEach((metric) => {
      const evaluationsPerTask = extractEvaluationsPerTask(
        evaluationsPerMetric[metric],
        modelA,
        modelB,
        metric,
        selectedFilters,
        selectedMetricRange,
      );
      if (evaluationsPerTask.length !== 0) {
        evaluationsPerMetricPerTask[metric] = evaluationsPerTask;
      }
    });
  }

  // Step 3: Compute each model's value distribution for every metric
  const distributionA: { [key: string]: number[] } = {};
  const distributionB: { [key: string]: number[] } = {};
  const taskIds: { [key: string]: string[] } = {};
  Object.keys(evaluationsPerMetricPerTask).forEach((metric) => {
    const metricValues = metrics.find((entry) => entry.name === metric)?.values;
    taskIds[metric] = evaluationsPerMetricPerTask[metric].map(
      (entry) => entry[0].taskId,
    );
    distributionA[metric] = evaluationsPerMetricPerTask[metric].map((entry) =>
      castToNumber(
        entry[0].modelId === modelA.modelId
          ? entry[0][`${metric}_agg`].value
          : entry[1][`${metric}_agg`].value,
        metricValues,
      ),
    );
    distributionB[metric] = evaluationsPerMetricPerTask[metric].map((entry) =>
      castToNumber(
        entry[1].modelId === modelB.modelId
          ? entry[1][`${metric}_agg`].value
          : entry[0][`${metric}_agg`].value,
        metricValues,
      ),
    );
  });

  // Step 4: Compute p-value and means for every metric by comparing distributions
  const information: { [key: string]: StatisticalInformation } = {};
  Object.keys(evaluationsPerMetricPerTask).forEach((metric) => {
    const [p, meanA, meanB] = calculateFisherRandomization(
      distributionA[metric],
      distributionB[metric],
    );
    information[metric] = {
      p: p,
      distributionA: distributionA[metric],
      meanA: meanA,
      distributionB: distributionB[metric],
      meanB: meanB,
      taskIds: taskIds[metric],
    };
  });

  return information;
}
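
/*
 * For reference, a minimal sketch (an assumption, not the actual
 * implementation in '@/src/utilities/significance') of what
 * `calculateFisherRandomization` presumably computes: a paired Fisher
 * randomization (permutation) test that randomly swaps each task's pair of
 * scores between the two models and counts how often the permuted mean
 * difference is at least as extreme as the observed one.
 *
 *   function fisherRandomizationSketch(
 *     a: number[],
 *     b: number[],
 *     iterations: number = 10000,
 *   ): [number, number, number] {
 *     const mean = (xs: number[]) => xs.reduce((s, x) => s + x, 0) / xs.length;
 *     const observed = Math.abs(mean(a) - mean(b));
 *     let extreme = 0;
 *     for (let i = 0; i < iterations; i++) {
 *       const pa: number[] = [];
 *       const pb: number[] = [];
 *       a.forEach((value, idx) => {
 *         // Swap the paired scores between groups with probability 0.5
 *         if (Math.random() < 0.5) {
 *           pa.push(value);
 *           pb.push(b[idx]);
 *         } else {
 *           pa.push(b[idx]);
 *           pb.push(value);
 *         }
 *       });
 *       if (Math.abs(mean(pa) - mean(pb)) >= observed) {
 *         extreme += 1;
 *       }
 *     }
 *     // [two-sided p-value, observed mean of A, observed mean of B]
 *     return [extreme / iterations, mean(a), mean(b)];
 *   }
 */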
// ===================================================================================
// RENDER FUNCTIONS
// ===================================================================================
function prepareScatterPlotData(
  modelA: string,
  distributionA: number[],
  modelB: string,
  distributionB: number[],
  taskIds?: string[],
) {
  // Step 1: Verify both distributions have the same length
  if (distributionA.length !== distributionB.length) {
    return [];
  }

  // Step 2: Collate model-wise predictions per task
  const distributions: { values: number[]; taskId: string }[] = [];
  distributionA.forEach((valueA, index) => {
    distributions.push({
      taskId: taskIds ? taskIds[index] : `${index}`,
      values: [valueA, distributionB[index]],
    });
  });

  // Step 3: Sort on model A's value (secondary key)
  distributions.sort((a, b) => a.values[0] - b.values[0]);

  // Step 4: Stable sort on model B's value (primary key)
  distributions.sort((a, b) => a.values[1] - b.values[1]);

  // Step 5: Prepare chart data
  const chartData: { [key: string]: string | number }[] = [];
  distributions.forEach((entry, idx) => {
    // Model A record
    chartData.push({
      group: modelA,
      key: idx,
      value: entry.values[0],
      ...(taskIds && { taskId: entry.taskId }),
    });

    // Model B record
    chartData.push({
      group: modelB,
      key: idx,
      value: entry.values[1],
      ...(taskIds && { taskId: entry.taskId }),
    });
  });

  return chartData;
}
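
// Example (hypothetical values): for distributionA = [0.2, 0.8] and
// distributionB = [0.4, 0.6], the function yields two records per task that
// share the same x-axis `key`, so the chart plots both models' scores in the
// same vertical slice:
//
//   { group: 'Model A', key: 0, value: 0.2 }
//   { group: 'Model B', key: 0, value: 0.4 }
//   { group: 'Model A', key: 1, value: 0.8 }
//   { group: 'Model B', key: 1, value: 0.6 }
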
function renderResult(
  statisticalInformationPerMetric: { [key: string]: StatisticalInformation },
  metric: Metric,
  modelA: Model,
  modelB: Model,
  numEvaluations: number,
  modelColors: { [key: string]: string },
  modelOrder: string[],
  theme?: string,
) {
  if (statisticalInformationPerMetric.hasOwnProperty(metric.name)) {
    return (
      <div
        key={'statisticalInformation-metric-' + metric.name}
        className={classes.performanceInformation}
      >
        <h5>
          <strong>{extractMetricDisplayName(metric)}</strong>
        </h5>
        <Tile className={classes.tile}>
          <div className={classes.tileContent}>
            <span className={classes.tileContentInformation}>p-value</span>
            <span
              className={classes.tileContentValue}
              suppressHydrationWarning={true}
            >
              {statisticalInformationPerMetric[metric.name]['p'].toFixed(4)}
            </span>
            <span
              className={classes.tileContentDecision}
              suppressHydrationWarning={true}
            >
              {statisticalInformationPerMetric[metric.name]['p'] <= 0.05
                ? 'Significant'
                : 'Not significant'}
            </span>
          </div>
        </Tile>
        <ScatterChart
          data={prepareScatterPlotData(
            modelA.name,
            statisticalInformationPerMetric[metric.name].distributionA,
            modelB.name,
            statisticalInformationPerMetric[metric.name].distributionB,
            statisticalInformationPerMetric[metric.name].taskIds,
          )}
          options={{
            axes: {
              left: {
                mapsTo: 'value',
                ...(metric.type === 'numerical' &&
                  typeof metric.minValue === 'number' &&
                  typeof metric.maxValue === 'number' && {
                    domain: [metric.minValue, metric.maxValue],
                  }),
                ...(metric.type === 'categorical' &&
                  typeof metric.minValue !== 'number' &&
                  typeof metric.maxValue !== 'number' && {
                    domain: [
                      castToNumber(metric.minValue?.value || 0, metric.values),
                      castToNumber(metric.maxValue?.value || 4, metric.values),
                    ],
                  }),
                title: extractMetricDisplayName(metric),
              },
              bottom: {
                mapsTo: 'key',
                ticks: {
                  values: [],
                },
                title: `Tasks (${
                  statisticalInformationPerMetric[metric.name].distributionA
                    .length
                }/${numEvaluations})`,
              },
            },
            width: '500px',
            height: '500px',
            toolbar: {
              enabled: false,
            },
            color: {
              scale: modelColors,
            },
            legend: {
              order: modelOrder,
            },
            theme: theme,
          }}
        ></ScatterChart>
      </div>
    );
  } else {
    return null;
  }
}
// ===================================================================================
// MAIN FUNCTION
// ===================================================================================
export default function ModelComparator({
  evaluationsPerMetric,
  models,
  metrics,
  filters,
  onTaskSelection,
}: Props) {
  // Step 1: Initialize state and necessary variables
  const [windowWidth, setWindowWidth] = useState<number>(
    global?.window && window.innerWidth,
  );
  const [modelA, setModelA] = useState<Model>(models[0]);
  const [modelB, setModelB] = useState<Model>(models[1]);
  const [selectedMetric, setSelectedMetric] = useState<Metric | undefined>(
    undefined,
  );
  const [selectedFilters, setSelectedFilters] = useState<{
    [key: string]: string[];
  }>({});
  const [statisticalInformationPerMetric, setStatisticalInformationPerMetric] =
    useState<{ [key: string]: StatisticalInformation } | undefined>(undefined);
  const [modelColors, modelOrder] = getModelColorPalette(models);
  const [selectedMetricRange, setSelectedMetricRange] = useState<number[]>();
  const chartRef = useRef(null);

  // Step 2: Run effects
  // Step 2.a: Window resizing
  useEffect(() => {
    const handleWindowResize = () => {
      setWindowWidth(window.innerWidth);
    };

    // Step: Add event listener
    window.addEventListener('resize', handleWindowResize);

    // Step: Cleanup to remove event listener
    return () => {
      window.removeEventListener('resize', handleWindowResize);
    };
  }, []);
  // Step 2.b: Fetch theme
  const { theme } = useTheme();

  // Step 2.c: Bucket human and algorithmic metrics
  const [humanMetrics, algorithmMetrics] = useMemo(() => {
    const hMetrics: Metric[] = [];
    const aMetrics: Metric[] = [];
    Object.values(metrics).forEach((metric) => {
      if (metric.author === 'human') {
        hMetrics.push(metric);
      } else if (metric.author === 'algorithm') {
        aMetrics.push(metric);
      }
    });
    return [hMetrics, aMetrics];
  }, [metrics]);
  // Step 2.d: Reset selected metric range, only applicable for numerical metrics
  useEffect(() => {
    if (
      selectedMetric &&
      selectedMetric.type === 'numerical' &&
      selectedMetric.range
    ) {
      setSelectedMetricRange([
        selectedMetric.range[0],
        selectedMetric.range[1],
      ]);
    } else {
      setSelectedMetricRange(undefined);
    }
  }, [selectedMetric]);
  // Step 2.e: Identify visible evaluations
  const filteredEvaluations = useMemo(() => {
    if (selectedMetric) {
      // Step 1: Identify evaluations for the selected models
      const evaluationsForSelectedModels = evaluationsPerMetric[
        selectedMetric.name
      ].filter(
        (evaluation) =>
          (evaluation.modelId === modelA.modelId ||
            evaluation.modelId === modelB.modelId) &&
          (!isEmpty(selectedFilters)
            ? areObjectsIntersecting(selectedFilters, evaluation)
            : true),
      );

      // Step 2: Collate evaluations per task id
      const evaluationsPerTask: { [key: string]: { [key: string]: number } } =
        {};
      evaluationsForSelectedModels.forEach((evaluation) => {
        const entry = evaluationsPerTask[evaluation.taskId];
        if (entry) {
          entry[evaluation.modelId] =
            evaluation[`${selectedMetric.name}_agg`].value;
        } else {
          evaluationsPerTask[evaluation.taskId] = {
            [evaluation.modelId]:
              evaluation[`${selectedMetric.name}_agg`].value,
          };
        }
      });

      // Step 3: Only select evaluation tasks where the models' aggregate values differ
      // and one or more models have an aggregate value in the selected range
      const visibleEvaluationTaskIds = Object.keys(evaluationsPerTask).filter(
        (taskId) =>
          Object.keys(countBy(Object.values(evaluationsPerTask[taskId])))
            .length > 1 &&
          (selectedMetricRange
            ? (Object.values(evaluationsPerTask[taskId])[0] >=
                selectedMetricRange[0] &&
                Object.values(evaluationsPerTask[taskId])[0] <=
                  selectedMetricRange[1]) ||
              (Object.values(evaluationsPerTask[taskId])[1] >=
                selectedMetricRange[0] &&
                Object.values(evaluationsPerTask[taskId])[1] <=
                  selectedMetricRange[1])
            : true),
      );

      // Step 4: Return evaluations for the selected evaluation tasks where the models' aggregate values differ
      return evaluationsForSelectedModels.filter((evaluation) =>
        visibleEvaluationTaskIds.includes(evaluation.taskId),
      );
    }
    return [];
  }, [
    evaluationsPerMetric,
    selectedMetric,
    modelA,
    modelB,
    selectedFilters,
    selectedMetricRange,
  ]);
  // Step 2.f: Reset statistical information if either model or the filters change
  useEffect(() => {
    setStatisticalInformationPerMetric(undefined);
  }, [modelA, modelB, selectedFilters]);

  // Step 2.g: Recalculate statistical information if the metric or range changes
  useEffect(() => {
    if (
      !selectedMetric &&
      statisticalInformationPerMetric &&
      Object.keys(statisticalInformationPerMetric).length === 1
    ) {
      setStatisticalInformationPerMetric(
        runStatisticalSignificanceTest(
          evaluationsPerMetric,
          metrics,
          modelA,
          modelB,
          selectedMetric,
          selectedFilters,
          selectedMetricRange,
        ),
      );
    } else if (
      selectedMetric &&
      selectedMetricRange &&
      statisticalInformationPerMetric &&
      statisticalInformationPerMetric.hasOwnProperty(selectedMetric.name)
    ) {
      setStatisticalInformationPerMetric(
        runStatisticalSignificanceTest(
          evaluationsPerMetric,
          metrics,
          modelA,
          modelB,
          selectedMetric,
          selectedFilters,
          selectedMetricRange,
        ),
      );
    }
  }, [selectedMetric, selectedMetricRange]);
  // Step 2.h: Estimate computation complexity
  const complexity = useMemo(() => {
    let size = 0;
    if (selectedMetric) {
      size = evaluationsPerMetric[selectedMetric.name].length / models.length;
    } else {
      size = Object.values(evaluationsPerMetric)
        .map((evaluations) => evaluations.length / models.length)
        .reduce((a, b) => a + b, 0);
    }

    if (size > 1000) {
      return 'high';
    }
    return 'low';
  }, [evaluationsPerMetric, selectedMetric, models]);
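
  // For the complexity estimate above (illustrative numbers): with 2 models
  // and a single selected metric backed by 2,400 evaluations,
  // size = 2,400 / 2 = 1,200 tasks, which exceeds 1,000, so the significance
  // test is flagged as 'high' complexity.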
  // Step 2.i: Add chart event
  useEffect(() => {
    // Step 2.i.*: Local copies of the chart reference and the click handler
    let ref = null;
    const handleScatterClick = ({
      detail,
    }: {
      detail: { datum: { taskId: string } };
    }) => {
      onTaskSelection(detail.datum.taskId);
    };

    // Step 2.i.**: Update reference and add event
    if (chartRef && chartRef.current) {
      ref = chartRef.current;
      //@ts-ignore
      ref.chart.services.events.addEventListener(
        'scatter-click',
        handleScatterClick,
      );
    }

    // Step 2.i.***: Cleanup to remove the listener; `removeEventListener`
    // only works when given the identical function reference
    return () => {
      if (ref) {
        //@ts-ignore
        ref.chart.services.events.removeEventListener(
          'scatter-click',
          handleScatterClick,
        );
      }
    };
  }, [chartRef, selectedMetric, statisticalInformationPerMetric]);
  // Step 3: Render
  return (
    <div className={classes.page}>
      <div className={classes.selectors}>
        <div className={classes.modelSelector}>
          <ModelSelector
            id={'modelA-selector-excluding-model-' + modelB.modelId}
            key={'modelA-selector-excluding-model-' + modelB.modelId}
            models={models}
            defaultValue={modelA}
            onSelect={(modelId: string) => {
              const selectedModel = models.find(
                (model) => model.modelId === modelId,
              );
              if (selectedModel) {
                setModelA(selectedModel);
              }
            }}
            disabledModels={[modelB]}
          />
        </div>
        <div className={classes.modelSelector}>
          <ModelSelector
            id={'modelB-selector-excluding-model-' + modelA.modelId}
            key={'modelB-selector-excluding-model-' + modelA.modelId}
            models={models}
            defaultValue={modelB}
            onSelect={(modelId: string) => {
              const selectedModel = models.find(
                (model) => model.modelId === modelId,
              );
              if (selectedModel) {
                setModelB(selectedModel);
              }
            }}
            disabledModels={[modelA]}
          />
        </div>
        <div className={classes.metricSelector}>
          <MetricSelector
            metrics={metrics}
            onSelect={(metric: Metric | undefined) => {
              setSelectedMetric(metric);
            }}
            warn={!selectedMetric}
            warnText={'You must select a single metric to view tasks.'}
          />
        </div>
        {selectedMetric &&
        selectedMetric.type === 'numerical' &&
        selectedMetric.range ? (
          <div>
            <Slider
              ariaLabelInput="Lower bound"
              unstable_ariaLabelInputUpper="Upper bound"
              labelText={`Choose range`}
              value={
                selectedMetricRange
                  ? selectedMetricRange[0]
                  : selectedMetric.range[0]
              }
              unstable_valueUpper={
                selectedMetricRange
                  ? selectedMetricRange[1]
                  : selectedMetric.range[1]
              }
              min={selectedMetric.range[0]}
              max={selectedMetric.range[1]}
              step={
                selectedMetric.range.length === 3 ? selectedMetric.range[2] : 1
              }
              onChange={({
                value,
                valueUpper,
              }: {
                value: number;
                valueUpper?: number;
              }) => {
                setSelectedMetricRange((prev) => [
                  value,
                  valueUpper
                    ? valueUpper
                    : prev
                      ? prev[1]
                      : selectedMetric.range
                        ? selectedMetric.range[1]
                        : 100,
                ]);
              }}
            />
          </div>
        ) : null}
        <div className={classes.calculateBtn}>
          <Button
            onClick={() => {
              // Run statistical significance calculations
              setStatisticalInformationPerMetric(
                runStatisticalSignificanceTest(
                  evaluationsPerMetric,
                  metrics,
                  modelA,
                  modelB,
                  selectedMetric,
                  selectedFilters,
                  selectedMetricRange,
                ),
              );
            }}
          >
            Calculate
          </Button>
        </div>
      </div>

      {!isEmpty(filters) ? (
        <Filters
          keyPrefix="ModelComparator"
          filters={filters}
          selectedFilters={selectedFilters}
          setSelectedFilters={setSelectedFilters}
        />
      ) : null}
      {statisticalInformationPerMetric ? (
        <div className={classes.row}>
          <div className={classes.hypothesisContainer}>
            <span className={classes.hypothesisStatement}>
              H<sub>0</sub>: {modelA.name} and {modelB.name} scores are derived
              from the same distribution.
            </span>
            <span className={classes.hypothesisValidityCondition}>
              <span>{'Reject the null hypothesis if p ≤ 0.05'}</span>
            </span>
          </div>
          {!selectedMetric && humanMetrics.length ? (
            <div className={classes.row}>
              <h4>Human Evaluations</h4>
              <div
                className={cx(
                  humanMetrics.length > 3
                    ? classes.graphsGrid
                    : classes.graphsFlex,
                )}
              >
                {humanMetrics.map((metric) =>
                  renderResult(
                    statisticalInformationPerMetric,
                    metric,
                    modelA,
                    modelB,
                    evaluationsPerMetric[metric.name].length / models.length,
                    modelColors,
                    modelOrder,
                    theme,
                  ),
                )}
              </div>
            </div>
          ) : null}
          {!selectedMetric && algorithmMetrics.length ? (
            <div className={classes.row}>
              <h4>Algorithmic Evaluations</h4>
              <div
                className={cx(
                  algorithmMetrics.length > 3
                    ? classes.graphsGrid
                    : classes.graphsFlex,
                )}
              >
                {algorithmMetrics.map((metric) =>
                  renderResult(
                    statisticalInformationPerMetric,
                    metric,
                    modelA,
                    modelB,
                    evaluationsPerMetric[metric.name].length / models.length,
                    modelColors,
                    modelOrder,
                    theme,
                  ),
                )}
              </div>
            </div>
          ) : null}
          {selectedMetric &&
          statisticalInformationPerMetric.hasOwnProperty(
            selectedMetric.name,
          ) ? (
            <div className={classes.row}>
              <div
                key={`statisticalInformation-metric-${selectedMetric.name}--${hash(JSON.stringify(statisticalInformationPerMetric[selectedMetric.name]))}`}
                className={classes.performanceInformation}
              >
                <h5>
                  <strong>{extractMetricDisplayName(selectedMetric)}</strong>
                </h5>
                <Tile className={classes.tile}>
                  <div className={classes.tileContent}>
                    <span className={classes.tileContentInformation}>
                      p-value
                    </span>
                    <span
                      className={classes.tileContentValue}
                      suppressHydrationWarning={true}
                    >
                      {statisticalInformationPerMetric[selectedMetric.name][
                        'p'
                      ].toFixed(4)}
                    </span>
                    <span
                      className={classes.tileContentDecision}
                      suppressHydrationWarning={true}
                    >
                      {statisticalInformationPerMetric[selectedMetric.name][
                        'p'
                      ] <= 0.05
                        ? 'Significant'
                        : 'Not significant'}
                    </span>
                  </div>
                </Tile>
                <ScatterChart
                  ref={chartRef}
                  data={prepareScatterPlotData(
                    modelA.name,
                    statisticalInformationPerMetric[selectedMetric.name]
                      .distributionA,
                    modelB.name,
                    statisticalInformationPerMetric[selectedMetric.name]
                      .distributionB,
                    statisticalInformationPerMetric[selectedMetric.name]
                      .taskIds,
                  )}
                  options={{
                    axes: {
                      left: {
                        mapsTo: 'value',
                        ...(selectedMetric.type === 'numerical' &&
                          typeof selectedMetric.minValue === 'number' &&
                          typeof selectedMetric.maxValue === 'number' && {
                            domain: [
                              selectedMetric.minValue,
                              selectedMetric.maxValue,
                            ],
                          }),
                        ...(selectedMetric.type === 'categorical' &&
                          typeof selectedMetric.minValue !== 'number' &&
                          typeof selectedMetric.maxValue !== 'number' && {
                            domain: [
                              castToNumber(
                                selectedMetric.minValue?.value || 0,
                                selectedMetric.values,
                              ),
                              castToNumber(
                                selectedMetric.maxValue?.value || 4,
                                selectedMetric.values,
                              ),
                            ],
                          }),
                        title: extractMetricDisplayName(selectedMetric),
                      },
                      bottom: {
                        mapsTo: 'key',
                        ticks: {
                          values: [],
                        },
                        title: `Tasks (${
                          statisticalInformationPerMetric[selectedMetric.name]
                            .distributionA.length
                        }/${
                          evaluationsPerMetric[selectedMetric.name].length /
                          models.length
                        })`,
                      },
                    },
                    width: `${Math.round(windowWidth * 0.8)}px`,
                    height: '500px',
                    toolbar: {
                      enabled: false,
                    },
                    color: {
                      scale: modelColors,
                    },
                    legend: {
                      order: modelOrder,
                    },
                    theme: theme,
                  }}
                ></ScatterChart>
              </div>
            </div>
          ) : (
            <>
              <div className={classes.tasksContainerNotification}>
                <span
                  className={classes.tasksContainerNotificationText}
                >{`Press calculate to measure statistical significance ${selectedMetric ? 'for' : 'across'} "${selectedMetric ? extractMetricDisplayName(selectedMetric) : 'all'}" metric${selectedMetric ? '' : 's'}`}</span>
                <span
                  className={classes.tasksContainerNotificationText}
                >{`for "${modelA.name}" and "${modelB.name}" models.`}</span>
                {complexity === 'high' ? (
                  <div className={classes.tasksContainerWarning}>
                    <WarningAlt
                      height={'24px'}
                      width={'24px'}
                      className={classes.tasksContainerWarningIcon}
                    />
                    <span className={classes.tasksContainerWarningText}>
                      It might take a few minutes to build this view.
                    </span>
                  </div>
                ) : null}
              </div>
            </>
          )}
        </div>
      ) : (
        <>
          <div className={classes.tasksContainerNotification}>
            <span
              className={classes.tasksContainerNotificationText}
            >{`Press calculate to measure statistical significance ${selectedMetric ? 'for' : 'across'} "${selectedMetric ? extractMetricDisplayName(selectedMetric) : 'all'}" metric${selectedMetric ? '' : 's'}`}</span>
            <span
              className={classes.tasksContainerNotificationText}
            >{`for "${modelA.name}" and "${modelB.name}" models.`}</span>
            {complexity === 'high' ? (
              <div className={classes.tasksContainerWarning}>
                <WarningAlt
                  height={'24px'}
                  width={'24px'}
                  className={classes.tasksContainerWarningIcon}
                />
                <span className={classes.tasksContainerWarningText}>
                  It might take a few minutes to build this view.
                </span>
              </div>
            ) : null}
          </div>
        </>
      )}
      {selectedMetric &&
        statisticalInformationPerMetric &&
        statisticalInformationPerMetric.hasOwnProperty(
          selectedMetric.name,
        ) && (
          <div className={classes.row}>
            <h4>
              Tasks{selectedMetric && filteredEvaluations && <sup>*</sup>}
            </h4>
            {filteredEvaluations ? (
              <>
                <TasksTable
                  metrics={[selectedMetric]}
                  evaluations={filteredEvaluations}
                  models={[modelA, modelB]}
                  filters={filters}
                  onClick={onTaskSelection}
                />
                <span className={classes.tasksTableWarning}>
                  <sup>*</sup> Only tasks with different model aggregate scores
                  are shown in the above table.
                </span>
              </>
            ) : null}
          </div>
        )}
    </div>
  );
}