Spaces:

kpfadnis
/

InspectorRAGet

Running

App Files Files

InspectorRAGet / src /__tests__ /processor.test.ts

kpfadnis

feat: add ModelCharacteristics view and fix BFCL converter data quality

2542139 about 1 month ago

raw

history blame

10.7 kB

	/**
	*
	* Copyright 2023-present InspectorRAGet Team
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	**/

	import { describe, it, expect } from 'vitest';
	import { processData, DataErrorKinds } from '@/src/processor';
	import { RawData } from '@/src/types';

	// --- Fixtures ---

	/**
	 * Builds the smallest fixture these tests need: two models, one numerical
	 * metric, one task, and one scored text result per model for that task.
	 *
	 * Every call creates brand-new nested objects, so a test may mutate the
	 * returned value without affecting other tests.
	 *
	 * @param overrides - Top-level fields that replace the defaults wholesale
	 *   (shallow spread; nested values are not merged).
	 * @returns The fixture, asserted to the `RawData` shape.
	 */
	function minimalData(overrides?: Partial<RawData>): RawData {
	  // One scored text result for task 't1', attributed to the 'system' annotator.
	  const resultFor = (modelId: string, text: string, score: number) => ({
	    taskId: 't1',
	    modelId,
	    output: { type: 'text', value: text },
	    scores: { accuracy: { system: { value: score } } },
	  });

	  const models = [
	    { modelId: 'm1', name: 'Model 1', owner: 'owner1' },
	    { modelId: 'm2', name: 'Model 2', owner: 'owner2' },
	  ];

	  const metrics = [
	    {
	      name: 'accuracy',
	      author: 'algorithm',
	      type: 'numerical',
	      range: [0, 1],
	    },
	  ];

	  const tasks = [{ taskId: 't1', taskType: 'generation', input: 'Hello' }];

	  const results = [resultFor('m1', 'Hi there', 0.9), resultFor('m2', 'Hey', 0.8)];

	  // Overrides are spread last so that they win over the defaults above.
	  return {
	    name: 'Test',
	    models,
	    metrics,
	    tasks,
	    results,
	    ...overrides,
	  } as RawData;
	}

	// --- processData: basic qualification ---

	/**
	 * Behavioral suite for `processData`.
	 *
	 * Each test starts from `minimalData()` (optionally tweaked) and inspects the
	 * tuple `processData` returns: the processed data first, then a map of
	 * disqualified tasks keyed by task ID.
	 *
	 * Disqualification checks compare the list of recorded reason kinds with
	 * `toContain`, so a failure prints the kinds that were actually recorded, and
	 * a task that was never disqualified fails the assertion instead of throwing
	 * a `TypeError` on an undefined entry.
	 */
	describe('processData', () => {
	  // --- Basic qualification ---

	  it('qualifies tasks that have results for all models and all metrics', () => {
	    const [data, disqualified] = processData(minimalData());

	    expect(data.tasks).toHaveLength(1);
	    expect(data.results).toHaveLength(2);
	    expect(data.numTasks).toBe(1);
	    expect(Object.keys(disqualified)).toHaveLength(0);
	  });

	  it('uses the provided name or defaults to "Example"', () => {
	    const [withName] = processData(minimalData({ name: 'My Experiment' }));
	    expect(withName.name).toBe('My Experiment');

	    // Remove the key entirely so the input carries no name at all.
	    const noName = minimalData();
	    Reflect.deleteProperty(noName, 'name');
	    const [withDefault] = processData(noName);
	    expect(withDefault.name).toBe('Example');
	  });

	  it('preserves model and metric data', () => {
	    const [data] = processData(minimalData());

	    expect(data.models).toHaveLength(2);
	    expect(data.metrics).toHaveLength(1);
	    expect(data.metrics[0].name).toBe('accuracy');
	  });

	  it('extracts annotator IDs from results', () => {
	    const [data] = processData(minimalData());

	    expect(data.annotators).toContain('system');
	  });

	  // --- Disqualification: missing metrics ---

	  it('disqualifies a task when an evaluation is missing a metric annotation', () => {
	    const raw = minimalData();
	    // m1's result carries no scores at all.
	    raw.results[0] = {
	      taskId: 't1',
	      modelId: 'm1',
	      output: { type: 'text', value: 'Hi' },
	      scores: {},
	    } as any;

	    const [data, disqualified] = processData(raw);

	    expect(data.tasks).toHaveLength(0);
	    expect(Object.keys(disqualified)).toContain('t1');
	    expect(disqualified['t1']?.reasons.map((r) => r.kind)).toContain(
	      DataErrorKinds.MISSING_METRIC,
	    );
	  });

	  it('disqualifies a task when a metric annotation has empty evaluators', () => {
	    const raw = minimalData();
	    // The metric key is present but no annotator has scored it.
	    raw.results[0] = {
	      taskId: 't1',
	      modelId: 'm1',
	      output: { type: 'text', value: 'Hi' },
	      scores: { accuracy: {} },
	    } as any;

	    const [data, disqualified] = processData(raw);

	    expect(data.tasks).toHaveLength(0);
	    expect(disqualified['t1']?.reasons.map((r) => r.kind)).toContain(
	      DataErrorKinds.MISSING_VALUE,
	    );
	  });

	  it('disqualifies a task when an annotation is missing the value field', () => {
	    const raw = minimalData();
	    // The annotator entry exists but has no `value` property.
	    raw.results[0] = {
	      taskId: 't1',
	      modelId: 'm1',
	      output: { type: 'text', value: 'Hi' },
	      scores: { accuracy: { system: { timestamp: 123 } } },
	    } as any;

	    const [data, disqualified] = processData(raw);

	    expect(data.tasks).toHaveLength(0);
	    expect(disqualified['t1']?.reasons.map((r) => r.kind)).toContain(
	      DataErrorKinds.MISSING_VALUE,
	    );
	  });

	  // --- Disqualification: missing models ---

	  it('disqualifies a task when not all models have results', () => {
	    const raw = minimalData();
	    // Remove result for m2
	    raw.results = [raw.results[0]];

	    const [data, disqualified] = processData(raw);

	    expect(data.tasks).toHaveLength(0);
	    expect(disqualified['t1']?.reasons.map((r) => r.kind)).toContain(
	      DataErrorKinds.MISSING_MODEL,
	    );
	  });

	  it('ignores results for models not in the models list', () => {
	    const raw = minimalData();
	    // Add result for unlisted model
	    raw.results.push({
	      taskId: 't1',
	      modelId: 'unknown_model',
	      output: { type: 'text', value: 'Yo' },
	      scores: { accuracy: { system: { value: 0.5 } } },
	    } as any);

	    const [data] = processData(raw);

	    // Should still qualify with the two known models
	    expect(data.results).toHaveLength(2);
	    expect(data.results.every((e) => e.modelId !== 'unknown_model')).toBe(true);
	  });

	  // --- Text-only metrics are not used for qualification ---

	  it('does not use text metrics for qualification', () => {
	    const raw = minimalData({
	      metrics: [
	        {
	          name: 'accuracy',
	          author: 'algorithm',
	          type: 'numerical',
	          range: [0, 1],
	        },
	        { name: 'explanation', author: 'algorithm', type: 'text' },
	      ],
	    });

	    // Evaluations only have 'accuracy', not 'explanation'
	    const [data] = processData(raw);

	    expect(data.tasks).toHaveLength(1);
	  });

	  // --- Categorical metric value sorting ---

	  it('sorts categorical metric values by numericValue', () => {
	    // Values are declared out of order on purpose: 3, 1, 2.
	    const raw = minimalData({
	      metrics: [
	        {
	          name: 'quality',
	          author: 'human',
	          type: 'categorical',
	          values: [
	            { value: 'high', numericValue: 3 },
	            { value: 'low', numericValue: 1 },
	            { value: 'medium', numericValue: 2 },
	          ],
	        },
	      ],
	    });
	    raw.results = [
	      {
	        taskId: 't1',
	        modelId: 'm1',
	        output: { type: 'text', value: 'Hi' },
	        scores: { quality: { human1: { value: 'high' } } },
	      },
	      {
	        taskId: 't1',
	        modelId: 'm2',
	        output: { type: 'text', value: 'Hey' },
	        scores: { quality: { human1: { value: 'low' } } },
	      },
	    ] as any;

	    const [data] = processData(raw);
	    const qualityMetric = data.metrics.find((m) => m.name === 'quality');

	    // Check every position, not just the ends, so a partial sort is caught.
	    expect(qualityMetric?.values?.[0].value).toBe('low');
	    expect(qualityMetric?.values?.[1].value).toBe('medium');
	    expect(qualityMetric?.values?.[2].value).toBe('high');
	  });

	  it('sets minValue and maxValue for categorical metrics', () => {
	    const raw = minimalData({
	      metrics: [
	        {
	          name: 'quality',
	          author: 'human',
	          type: 'categorical',
	          values: [
	            { value: 'good', numericValue: 1 },
	            { value: 'bad', numericValue: 0 },
	          ],
	        },
	      ],
	    });
	    raw.results = [
	      {
	        taskId: 't1',
	        modelId: 'm1',
	        output: { type: 'text', value: 'Hi' },
	        scores: { quality: { h: { value: 'good' } } },
	      },
	      {
	        taskId: 't1',
	        modelId: 'm2',
	        output: { type: 'text', value: 'Hey' },
	        scores: { quality: { h: { value: 'bad' } } },
	      },
	    ] as any;

	    const [data] = processData(raw);
	    const metric = data.metrics.find((m) => m.name === 'quality');

	    expect(metric?.minValue).toEqual({ value: 'bad', numericValue: 0 });
	    expect(metric?.maxValue).toEqual({ value: 'good', numericValue: 1 });
	  });

	  it('sets minValue and maxValue for numerical metrics with range', () => {
	    const [data] = processData(minimalData());
	    const metric = data.metrics.find((m) => m.name === 'accuracy');

	    expect(metric?.minValue).toBe(0);
	    expect(metric?.maxValue).toBe(1);
	  });

	  // --- Multiple tasks ---

	  it('handles multiple tasks independently', () => {
	    const raw = minimalData();
	    raw.tasks.push({
	      taskId: 't2',
	      taskType: 'generation',
	      input: 'Bye',
	    } as any);
	    // t2 only has a result for m1, not m2 — should be disqualified
	    raw.results.push({
	      taskId: 't2',
	      modelId: 'm1',
	      output: { type: 'text', value: 'Goodbye' },
	      scores: { accuracy: { system: { value: 0.7 } } },
	    } as any);

	    const [data, disqualified] = processData(raw);

	    expect(data.tasks).toHaveLength(1);
	    expect(data.tasks[0].taskId).toBe('t1');
	    expect(Object.keys(disqualified)).toContain('t2');
	  });

	  // --- Filters ---

	  it('preserves filters from raw data', () => {
	    const [data] = processData(minimalData({ filters: ['category'] }));

	    expect(data.filters).toEqual(['category']);
	  });

	  it('omits filters when not provided', () => {
	    const [data] = processData(minimalData());

	    expect(data.filters).toBeUndefined();
	  });

	  // --- Documents ---

	  it('preserves documents from raw data', () => {
	    const [data] = processData(
	      minimalData({
	        documents: [{ documentId: 'd1', text: 'doc text' }],
	      }),
	    );

	    expect(data.documents).toHaveLength(1);
	  });

	  // --- labels pass-through ---

	  it('preserves labels on qualified results with snake_case keys intact', () => {
	    const raw = minimalData();
	    (raw.results[0] as any).labels = {
	      error_type: 'force_terminated',
	      response_language: null,
	    };

	    const [data] = processData(raw);
	    const result = data.results.find(
	      (r) => r.taskId === 't1' && r.modelId === 'm1',
	    );

	    expect(result?.labels).toEqual({
	      error_type: 'force_terminated',
	      response_language: null,
	    });
	  });

	  it('qualifies results that have no labels field', () => {
	    const [data] = processData(minimalData());

	    expect(data.results).toHaveLength(2);
	    data.results.forEach((r) => expect(r.labels).toBeUndefined());
	  });

	  // --- migrated flag ---

	  it('sets migrated=true on the returned Data when the flag is passed in', () => {
	    const [data] = processData(minimalData(), true);

	    expect(data.migrated).toBe(true);
	  });

	  it('omits migrated from Data when flag is false', () => {
	    const [data] = processData(minimalData(), false);

	    expect(data.migrated).toBeUndefined();
	  });
	});