// Source: InspectorRAGet / src/__tests__/processor.test.ts
// Last commit 2542139 (kpfadnis): "feat: add ModelCharacteristics view and fix BFCL converter data quality"
// File size: 10.7 kB
/**
*
* Copyright 2023-present InspectorRAGet Team
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/
import { describe, it, expect } from 'vitest';
import { processData, DataErrorKinds } from '@/src/processor';
import { RawData } from '@/src/types';
// --- Fixtures ---
/**
 * Builds the smallest dataset used throughout these tests: two models, one
 * numerical metric ("accuracy", range 0..1), a single task and one scored
 * result per model for that task.
 *
 * A new object graph is allocated on every call, so a test may freely mutate
 * what it receives.
 *
 * @param overrides - Top-level fields that replace the defaults wholesale
 *   (shallow merge; nested values are not combined).
 * @returns The default dataset with `overrides` applied on top.
 */
function minimalData(overrides?: Partial<RawData>): RawData {
  const defaults = {
    name: 'Test',
    // Two competing models.
    models: [
      { modelId: 'm1', name: 'Model 1', owner: 'owner1' },
      { modelId: 'm2', name: 'Model 2', owner: 'owner2' },
    ],
    // A single algorithmic, numerical metric.
    metrics: [
      {
        name: 'accuracy',
        author: 'algorithm',
        type: 'numerical',
        range: [0, 1],
      },
    ],
    // One task shared by both models.
    tasks: [
      {
        taskId: 't1',
        taskType: 'generation',
        input: 'Hello',
      },
    ],
    // One result per model, each scored by the "system" annotator.
    results: [
      {
        taskId: 't1',
        modelId: 'm1',
        output: { type: 'text', value: 'Hi there' },
        scores: { accuracy: { system: { value: 0.9 } } },
      },
      {
        taskId: 't1',
        modelId: 'm2',
        output: { type: 'text', value: 'Hey' },
        scores: { accuracy: { system: { value: 0.8 } } },
      },
    ],
  } as RawData;

  // Caller-supplied fields win over the defaults.
  return { ...defaults, ...overrides };
}
// --- processData: basic qualification ---
describe('processData', () => {
// Happy path: the default fixture has a scored result from both models for its only task.
it('qualifies tasks that have results for all models and all metrics', () => {
  const [processed, rejected] = processData(minimalData());

  expect(processed.tasks).toHaveLength(1);
  expect(processed.results).toHaveLength(2);
  expect(processed.numTasks).toBe(1);
  // Nothing should have been filtered out.
  expect(Object.keys(rejected)).toHaveLength(0);
});
it('uses the provided name or defaults to "Example"', () => {
  // An explicit name must be carried through untouched.
  const [named] = processData(minimalData({ name: 'My Experiment' }));
  expect(named.name).toBe('My Experiment');

  // Remove the property altogether to exercise the fallback.
  const nameless = minimalData();
  Reflect.deleteProperty(nameless, 'name');
  const [fallback] = processData(nameless);
  expect(fallback.name).toBe('Example');
});
// The default fixture declares two models and a single "accuracy" metric.
it('preserves model and metric data', () => {
  const [{ models, metrics }] = processData(minimalData());

  expect(models).toHaveLength(2);
  expect(metrics).toHaveLength(1);
  expect(metrics[0].name).toBe('accuracy');
});
// Every score in the default fixture is keyed by the "system" annotator.
it('extracts annotator IDs from results', () => {
  const [{ annotators }] = processData(minimalData());

  expect(annotators).toContain('system');
});
// --- Disqualification: missing metrics ---
it('disqualifies a task when an evaluation is missing a metric annotation', () => {
  const raw = minimalData();
  // Replace m1's result with one whose scores object holds no metrics at all.
  const unscored: any = {
    taskId: 't1',
    modelId: 'm1',
    output: { type: 'text', value: 'Hi' },
    scores: {},
  };
  raw.results[0] = unscored;

  const [processed, rejected] = processData(raw);

  expect(processed.tasks).toHaveLength(0);
  expect(Object.keys(rejected)).toContain('t1');
  const flagged = rejected['t1'].reasons.some(
    ({ kind }) => kind === DataErrorKinds.MISSING_METRIC,
  );
  expect(flagged).toBe(true);
});
it('disqualifies a task when a metric annotation has empty evaluators', () => {
  const raw = minimalData();
  // The metric key is present for m1, but no evaluator entry sits beneath it.
  const noEvaluators: any = {
    taskId: 't1',
    modelId: 'm1',
    output: { type: 'text', value: 'Hi' },
    scores: { accuracy: {} },
  };
  raw.results[0] = noEvaluators;

  const [processed, rejected] = processData(raw);

  expect(processed.tasks).toHaveLength(0);
  const flagged = rejected['t1'].reasons.some(
    ({ kind }) => kind === DataErrorKinds.MISSING_VALUE,
  );
  expect(flagged).toBe(true);
});
it('disqualifies a task when an annotation is missing the value field', () => {
  const raw = minimalData();
  // The evaluator entry exists but only carries a timestamp, never a value.
  const valueless: any = {
    taskId: 't1',
    modelId: 'm1',
    output: { type: 'text', value: 'Hi' },
    scores: { accuracy: { system: { timestamp: 123 } } },
  };
  raw.results[0] = valueless;

  const [processed, rejected] = processData(raw);

  expect(processed.tasks).toHaveLength(0);
  const flagged = rejected['t1'].reasons.some(
    ({ kind }) => kind === DataErrorKinds.MISSING_VALUE,
  );
  expect(flagged).toBe(true);
});
// --- Disqualification: missing models ---
it('disqualifies a task when not all models have results', () => {
  const raw = minimalData();
  // Keep only the first result (m1), leaving m2 without any result for t1.
  raw.results = raw.results.slice(0, 1);

  const [processed, rejected] = processData(raw);

  expect(processed.tasks).toHaveLength(0);
  const flagged = rejected['t1'].reasons.some(
    ({ kind }) => kind === DataErrorKinds.MISSING_MODEL,
  );
  expect(flagged).toBe(true);
});
it('ignores results for models not in the models list', () => {
  const raw = minimalData();
  // A well-formed result whose modelId is not declared under `models`.
  const stray: any = {
    taskId: 't1',
    modelId: 'unknown_model',
    output: { type: 'text', value: 'Yo' },
    scores: { accuracy: { system: { value: 0.5 } } },
  };
  raw.results.push(stray);

  const [processed] = processData(raw);

  // Only the results of the two declared models may remain.
  expect(processed.results).toHaveLength(2);
  const leaked = processed.results.some(
    (entry) => entry.modelId === 'unknown_model',
  );
  expect(leaked).toBe(false);
});
// --- Text-only metrics are not used for qualification ---
it('does not use text metrics for qualification', () => {
  // The default results score only 'accuracy'; nothing is recorded for 'explanation'.
  const [{ tasks }] = processData(
    minimalData({
      metrics: [
        {
          name: 'accuracy',
          author: 'algorithm',
          type: 'numerical',
          range: [0, 1],
        },
        { name: 'explanation', author: 'algorithm', type: 'text' },
      ],
    }),
  );

  expect(tasks).toHaveLength(1);
});
// --- Categorical metric value sorting ---
it('sorts categorical metric values by numericValue', () => {
  // Values are declared out of order (3, 1, 2) on purpose.
  const raw = minimalData({
    metrics: [
      {
        name: 'quality',
        author: 'human',
        type: 'categorical',
        values: [
          { value: 'high', numericValue: 3 },
          { value: 'low', numericValue: 1 },
          { value: 'medium', numericValue: 2 },
        ],
      },
    ],
    // Both models are rated on 'quality' by the same annotator.
    results: [
      {
        taskId: 't1',
        modelId: 'm1',
        output: { type: 'text', value: 'Hi' },
        scores: { quality: { human1: { value: 'high' } } },
      },
      {
        taskId: 't1',
        modelId: 'm2',
        output: { type: 'text', value: 'Hey' },
        scores: { quality: { human1: { value: 'low' } } },
      },
    ] as any,
  });

  const [{ metrics }] = processData(raw);
  const ordered = metrics.find(({ name }) => name === 'quality')?.values;

  expect(ordered?.[0].value).toBe('low');
  expect(ordered?.[2].value).toBe('high');
});
it('sets minValue and maxValue for categorical metrics', () => {
  const raw = minimalData({
    metrics: [
      {
        name: 'quality',
        author: 'human',
        type: 'categorical',
        values: [
          { value: 'good', numericValue: 1 },
          { value: 'bad', numericValue: 0 },
        ],
      },
    ],
    // Both models are rated on 'quality' by annotator 'h'.
    results: [
      {
        taskId: 't1',
        modelId: 'm1',
        output: { type: 'text', value: 'Hi' },
        scores: { quality: { h: { value: 'good' } } },
      },
      {
        taskId: 't1',
        modelId: 'm2',
        output: { type: 'text', value: 'Hey' },
        scores: { quality: { h: { value: 'bad' } } },
      },
    ] as any,
  });

  const [{ metrics }] = processData(raw);
  const quality = metrics.find(({ name }) => name === 'quality');

  // The extremes are expected as full value entries, not bare numbers.
  expect(quality?.minValue).toEqual({ value: 'bad', numericValue: 0 });
  expect(quality?.maxValue).toEqual({ value: 'good', numericValue: 1 });
});
it('sets minValue and maxValue for numerical metrics with range', () => {
  const [{ metrics }] = processData(minimalData());
  const accuracy = metrics.find(({ name }) => name === 'accuracy');

  // The default fixture declares range [0, 1] for 'accuracy'.
  expect(accuracy?.minValue).toBe(0);
  expect(accuracy?.maxValue).toBe(1);
});
// --- Multiple tasks ---
it('handles multiple tasks independently', () => {
  const raw = minimalData();
  const secondTask: any = {
    taskId: 't2',
    taskType: 'generation',
    input: 'Bye',
  };
  raw.tasks.push(secondTask);
  // Only m1 answers t2; m2 has no result for it, so t2 should be disqualified.
  const loneResult: any = {
    taskId: 't2',
    modelId: 'm1',
    output: { type: 'text', value: 'Goodbye' },
    scores: { accuracy: { system: { value: 0.7 } } },
  };
  raw.results.push(loneResult);

  const [processed, rejected] = processData(raw);

  // t1 is kept while t2 is reported as disqualified.
  expect(processed.tasks).toHaveLength(1);
  expect(processed.tasks[0].taskId).toBe('t1');
  expect(Object.keys(rejected)).toContain('t2');
});
// --- Filters ---
// Filters supplied on the raw data are expected back unchanged.
it('preserves filters from raw data', () => {
  const raw = minimalData({ filters: ['category'] });
  const [{ filters }] = processData(raw);

  expect(filters).toEqual(['category']);
});
// The default fixture has no `filters` field.
it('omits filters when not provided', () => {
  const [{ filters }] = processData(minimalData());

  expect(filters).toBeUndefined();
});
// --- Documents ---
it('preserves documents from raw data', () => {
  // Supply a single document alongside the default fixture fields.
  const raw = minimalData({
    documents: [{ documentId: 'd1', text: 'doc text' }],
  });
  const [{ documents }] = processData(raw);

  expect(documents).toHaveLength(1);
});
// --- labels pass-through ---
it('preserves labels on qualified results with snake_case keys intact', () => {
  // Built fresh on each call so the expectation never shares a reference
  // with the object handed to processData.
  const makeLabels = () => ({
    error_type: 'force_terminated',
    response_language: null,
  });

  const raw = minimalData();
  (raw.results[0] as any).labels = makeLabels();

  const [processed] = processData(raw);
  const labelled = processed.results.find(
    ({ taskId, modelId }) => taskId === 't1' && modelId === 'm1',
  );

  expect(labelled?.labels).toEqual(makeLabels());
});
// No result in the default fixture carries a `labels` field.
it('qualifies results that have no labels field', () => {
  const [{ results }] = processData(minimalData());

  expect(results).toHaveLength(2);
  for (const entry of results) {
    expect(entry.labels).toBeUndefined();
  }
});
// --- migrated flag ---
// Passing `true` as the second argument should surface on the returned data.
it('sets migrated=true on the returned Data when the flag is passed in', () => {
  const [{ migrated }] = processData(minimalData(), true);

  expect(migrated).toBe(true);
});
// With an explicit `false`, the field is expected to be absent rather than false.
it('omits migrated from Data when flag is false', () => {
  const [{ migrated }] = processData(minimalData(), false);

  expect(migrated).toBeUndefined();
});
});