Spaces:
Running
Running
/**
 *
 * Copyright 2023-present InspectorRAGet Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **/
import { describe, it, expect } from 'vitest';

import { processData, DataErrorKinds } from '@/src/processor';
import { RawData } from '@/src/types';
// --- Fixtures ---
/**
 * Builds a small raw dataset for the tests in this file.
 *
 * The defaults describe two models (`m1`, `m2`), one numerical metric
 * (`accuracy`, range `[0, 1]`), one generation task (`t1`) and one result per
 * model for that task, each scored by a single `system` annotator.
 *
 * Every call returns freshly created arrays and objects, so a test may mutate
 * the returned value without affecting other tests.
 *
 * @param overrides - Top-level fields that replace the defaults. The merge is
 *   a shallow spread: an overridden array (e.g. `metrics`) replaces the default
 *   array wholesale rather than being merged into it.
 * @returns The fixture, asserted (not type-checked) as `RawData`.
 */
function minimalData(overrides?: Partial<RawData>): RawData {
  return {
    name: 'Test',
    models: [
      { modelId: 'm1', name: 'Model 1', owner: 'owner1' },
      { modelId: 'm2', name: 'Model 2', owner: 'owner2' },
    ],
    metrics: [
      {
        name: 'accuracy',
        author: 'algorithm',
        type: 'numerical',
        range: [0, 1],
      },
    ],
    tasks: [
      {
        taskId: 't1',
        taskType: 'generation',
        input: 'Hello',
      },
    ],
    results: [
      {
        taskId: 't1',
        modelId: 'm1',
        output: { type: 'text', value: 'Hi there' },
        scores: { accuracy: { system: { value: 0.9 } } },
      },
      {
        taskId: 't1',
        modelId: 'm2',
        output: { type: 'text', value: 'Hey' },
        scores: { accuracy: { system: { value: 0.8 } } },
      },
    ],
    // Spread last so caller-supplied fields win over the defaults above.
    ...overrides,
  } as RawData;
}
// --- processData: basic qualification ---
/**
 * Test suite for `processData`.
 *
 * As exercised here, `processData(raw, migrated?)` returns a tuple
 * `[data, disqualified]`: `data` holds the qualified tasks/results together
 * with models, metrics and annotators, while `disqualified` is keyed by task
 * id and carries a `reasons` list whose entries have a `kind` drawn from
 * `DataErrorKinds`.
 */
describe('processData', () => {
  it('qualifies tasks that have results for all models and all metrics', () => {
    const [data, disqualified] = processData(minimalData());

    expect(data.tasks).toHaveLength(1);
    expect(data.results).toHaveLength(2);
    expect(data.numTasks).toBe(1);
    expect(Object.keys(disqualified)).toHaveLength(0);
  });

  it('uses the provided name or defaults to "Example"', () => {
    const [withName] = processData(minimalData({ name: 'My Experiment' }));
    expect(withName.name).toBe('My Experiment');

    // Remove the key entirely to exercise the default-name path.
    const noName = minimalData();
    delete (noName as any).name;
    const [withDefault] = processData(noName);
    expect(withDefault.name).toBe('Example');
  });

  it('preserves model and metric data', () => {
    const [data] = processData(minimalData());

    expect(data.models).toHaveLength(2);
    expect(data.metrics).toHaveLength(1);
    expect(data.metrics[0].name).toBe('accuracy');
  });

  it('extracts annotator IDs from results', () => {
    const [data] = processData(minimalData());

    expect(data.annotators).toContain('system');
  });

  // --- Disqualification: missing metrics ---

  it('disqualifies a task when an evaluation is missing a metric annotation', () => {
    const raw = minimalData();
    // No entry at all for the 'accuracy' metric.
    raw.results[0] = {
      taskId: 't1',
      modelId: 'm1',
      output: { type: 'text', value: 'Hi' },
      scores: {},
    } as any;

    const [data, disqualified] = processData(raw);

    expect(data.tasks).toHaveLength(0);
    expect(Object.keys(disqualified)).toContain('t1');
    expect(
      disqualified['t1'].reasons.some(
        (r) => r.kind === DataErrorKinds.MISSING_METRIC,
      ),
    ).toBe(true);
  });

  it('disqualifies a task when a metric annotation has empty evaluators', () => {
    const raw = minimalData();
    // The metric key exists but no annotator scored it.
    raw.results[0] = {
      taskId: 't1',
      modelId: 'm1',
      output: { type: 'text', value: 'Hi' },
      scores: { accuracy: {} },
    } as any;

    const [data, disqualified] = processData(raw);

    expect(data.tasks).toHaveLength(0);
    expect(
      disqualified['t1'].reasons.some(
        (r) => r.kind === DataErrorKinds.MISSING_VALUE,
      ),
    ).toBe(true);
  });

  it('disqualifies a task when an annotation is missing the value field', () => {
    const raw = minimalData();
    // The annotator entry exists but carries no `value`.
    raw.results[0] = {
      taskId: 't1',
      modelId: 'm1',
      output: { type: 'text', value: 'Hi' },
      scores: { accuracy: { system: { timestamp: 123 } } },
    } as any;

    const [data, disqualified] = processData(raw);

    expect(data.tasks).toHaveLength(0);
    expect(
      disqualified['t1'].reasons.some(
        (r) => r.kind === DataErrorKinds.MISSING_VALUE,
      ),
    ).toBe(true);
  });

  // --- Disqualification: missing models ---

  it('disqualifies a task when not all models have results', () => {
    const raw = minimalData();
    // Remove result for m2
    raw.results = [raw.results[0]];

    const [data, disqualified] = processData(raw);

    expect(data.tasks).toHaveLength(0);
    expect(
      disqualified['t1'].reasons.some(
        (r) => r.kind === DataErrorKinds.MISSING_MODEL,
      ),
    ).toBe(true);
  });

  it('ignores results for models not in the models list', () => {
    const raw = minimalData();
    // Add result for unlisted model
    raw.results.push({
      taskId: 't1',
      modelId: 'unknown_model',
      output: { type: 'text', value: 'Yo' },
      scores: { accuracy: { system: { value: 0.5 } } },
    } as any);

    const [data] = processData(raw);

    // Should still qualify with the two known models
    expect(data.results).toHaveLength(2);
    expect(data.results.every((e) => e.modelId !== 'unknown_model')).toBe(true);
  });

  // --- Text-only metrics are not used for qualification ---

  it('does not use text metrics for qualification', () => {
    const raw = minimalData({
      metrics: [
        {
          name: 'accuracy',
          author: 'algorithm',
          type: 'numerical',
          range: [0, 1],
        },
        { name: 'explanation', author: 'algorithm', type: 'text' },
      ],
    });

    // Evaluations only have 'accuracy', not 'explanation'
    const [data] = processData(raw);

    expect(data.tasks).toHaveLength(1);
  });

  // --- Categorical metric value sorting ---

  it('sorts categorical metric values by numericValue', () => {
    // Values are deliberately declared out of order (3, 1, 2).
    const raw = minimalData({
      metrics: [
        {
          name: 'quality',
          author: 'human',
          type: 'categorical',
          values: [
            { value: 'high', numericValue: 3 },
            { value: 'low', numericValue: 1 },
            { value: 'medium', numericValue: 2 },
          ],
        },
      ],
    });
    raw.results = [
      {
        taskId: 't1',
        modelId: 'm1',
        output: { type: 'text', value: 'Hi' },
        scores: { quality: { human1: { value: 'high' } } },
      },
      {
        taskId: 't1',
        modelId: 'm2',
        output: { type: 'text', value: 'Hey' },
        scores: { quality: { human1: { value: 'low' } } },
      },
    ] as any;

    const [data] = processData(raw);

    const qualityMetric = data.metrics.find((m) => m.name === 'quality');
    expect(qualityMetric?.values?.[0].value).toBe('low');
    // Check the middle element too, so the full ascending order is pinned.
    expect(qualityMetric?.values?.[1].value).toBe('medium');
    expect(qualityMetric?.values?.[2].value).toBe('high');
  });

  it('sets minValue and maxValue for categorical metrics', () => {
    const raw = minimalData({
      metrics: [
        {
          name: 'quality',
          author: 'human',
          type: 'categorical',
          values: [
            { value: 'good', numericValue: 1 },
            { value: 'bad', numericValue: 0 },
          ],
        },
      ],
    });
    raw.results = [
      {
        taskId: 't1',
        modelId: 'm1',
        output: { type: 'text', value: 'Hi' },
        scores: { quality: { h: { value: 'good' } } },
      },
      {
        taskId: 't1',
        modelId: 'm2',
        output: { type: 'text', value: 'Hey' },
        scores: { quality: { h: { value: 'bad' } } },
      },
    ] as any;

    const [data] = processData(raw);

    const metric = data.metrics.find((m) => m.name === 'quality');
    expect(metric?.minValue).toEqual({ value: 'bad', numericValue: 0 });
    expect(metric?.maxValue).toEqual({ value: 'good', numericValue: 1 });
  });

  it('sets minValue and maxValue for numerical metrics with range', () => {
    const [data] = processData(minimalData());

    const metric = data.metrics.find((m) => m.name === 'accuracy');
    expect(metric?.minValue).toBe(0);
    expect(metric?.maxValue).toBe(1);
  });

  // --- Multiple tasks ---

  it('handles multiple tasks independently', () => {
    const raw = minimalData();
    raw.tasks.push({
      taskId: 't2',
      taskType: 'generation',
      input: 'Bye',
    } as any);
    // t2 only has a result for m1, not m2 — should be disqualified
    raw.results.push({
      taskId: 't2',
      modelId: 'm1',
      output: { type: 'text', value: 'Goodbye' },
      scores: { accuracy: { system: { value: 0.7 } } },
    } as any);

    const [data, disqualified] = processData(raw);

    expect(data.tasks).toHaveLength(1);
    expect(data.tasks[0].taskId).toBe('t1');
    expect(Object.keys(disqualified)).toContain('t2');
  });

  // --- Filters ---

  it('preserves filters from raw data', () => {
    const [data] = processData(minimalData({ filters: ['category'] }));

    expect(data.filters).toEqual(['category']);
  });

  it('omits filters when not provided', () => {
    const [data] = processData(minimalData());

    expect(data.filters).toBeUndefined();
  });

  // --- Documents ---

  it('preserves documents from raw data', () => {
    const [data] = processData(
      minimalData({
        documents: [{ documentId: 'd1', text: 'doc text' }],
      }),
    );

    expect(data.documents).toHaveLength(1);
  });

  // --- labels pass-through ---

  it('preserves labels on qualified results with snake_case keys intact', () => {
    const raw = minimalData();
    (raw.results[0] as any).labels = {
      error_type: 'force_terminated',
      response_language: null,
    };

    const [data] = processData(raw);

    const result = data.results.find(
      (r) => r.taskId === 't1' && r.modelId === 'm1',
    );
    expect(result?.labels).toEqual({
      error_type: 'force_terminated',
      response_language: null,
    });
  });

  it('qualifies results that have no labels field', () => {
    const [data] = processData(minimalData());

    expect(data.results).toHaveLength(2);
    data.results.forEach((r) => expect(r.labels).toBeUndefined());
  });

  // --- migrated flag ---

  it('sets migrated=true on the returned Data when the flag is passed in', () => {
    const [data] = processData(minimalData(), true);

    expect(data.migrated).toBe(true);
  });

  it('omits migrated from Data when flag is false', () => {
    const [data] = processData(minimalData(), false);

    expect(data.migrated).toBeUndefined();
  });
});