// Spaces: kpfadnis / InspectorRAGet — Running — File size: 10,689 Bytes

/**
 *
 * Copyright 2023-present InspectorRAGet Team
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **/

import { describe, it, expect } from 'vitest';
import { processData, DataErrorKinds } from '@/src/processor';
import { RawData } from '@/src/types';

// --- Fixtures ---

/**
 * Builds a fresh baseline payload for the tests: two models, one numerical
 * `accuracy` metric, a single task `t1`, and one scored result per model.
 *
 * Every call creates new nested objects, so a test may mutate the returned
 * value without affecting other tests.
 *
 * @param overrides - Top-level fields that replace the baseline ones; applied
 *   last, so they always win over the defaults.
 * @returns The baseline payload merged with `overrides`.
 */
function minimalData(overrides?: Partial<RawData>): RawData {
  // Result record for task `t1` carrying a single `system` accuracy score.
  const scoredResult = (modelId: string, text: string, score: number) => ({
    taskId: 't1',
    modelId,
    output: { type: 'text', value: text },
    scores: { accuracy: { system: { value: score } } },
  });

  const models = [
    { modelId: 'm1', name: 'Model 1', owner: 'owner1' },
    { modelId: 'm2', name: 'Model 2', owner: 'owner2' },
  ];

  const accuracyMetric = {
    name: 'accuracy',
    author: 'algorithm',
    type: 'numerical',
    range: [0, 1],
  };

  const greetingTask = {
    taskId: 't1',
    taskType: 'generation',
    input: 'Hello',
  };

  const baseline = {
    name: 'Test',
    models,
    metrics: [accuracyMetric],
    tasks: [greetingTask],
    results: [scoredResult('m1', 'Hi there', 0.9), scoredResult('m2', 'Hey', 0.8)],
  };

  // Spreading `overrides` second keeps the original precedence and key order.
  return { ...baseline, ...overrides } as RawData;
}

// --- processData: basic qualification ---

/**
 * Suite for `processData`. Each case starts from `minimalData()`, adjusts the
 * raw payload as needed, and inspects the two values `processData` returns:
 * the processed data and a map of disqualified tasks keyed by task ID.
 */
describe('processData', () => {
  // Assembles a raw result record. Typed as `any` because several cases feed
  // deliberately malformed scores to the processor.
  const buildResult = (
    taskId: string,
    modelId: string,
    text: string,
    scores: unknown,
  ): any => ({
    taskId,
    modelId,
    output: { type: 'text', value: text },
    scores,
  });

  it('qualifies tasks that have results for all models and all metrics', () => {
    const [processed, rejected] = processData(minimalData());

    expect(processed.tasks).toHaveLength(1);
    expect(processed.results).toHaveLength(2);
    expect(processed.numTasks).toBe(1);
    expect(Object.keys(rejected)).toHaveLength(0);
  });

  it('uses the provided name or defaults to "Example"', () => {
    const named = minimalData({ name: 'My Experiment' });
    expect(processData(named)[0].name).toBe('My Experiment');

    // Remove the key entirely so the processor sees no name at all.
    const unnamed = minimalData();
    Reflect.deleteProperty(unnamed, 'name');
    expect(processData(unnamed)[0].name).toBe('Example');
  });

  it('preserves model and metric data', () => {
    const [{ models, metrics }] = processData(minimalData());

    expect(models).toHaveLength(2);
    expect(metrics).toHaveLength(1);
    expect(metrics[0].name).toBe('accuracy');
  });

  it('extracts annotator IDs from results', () => {
    const [{ annotators }] = processData(minimalData());

    expect(annotators).toContain('system');
  });

  // --- Disqualification: incomplete metric annotations ---

  it('disqualifies a task when an evaluation is missing a metric annotation', () => {
    const raw = minimalData();
    // m1's result carries no scores at all.
    raw.results[0] = buildResult('t1', 'm1', 'Hi', {});

    const [processed, rejected] = processData(raw);

    expect(processed.tasks).toHaveLength(0);
    expect(Object.keys(rejected)).toContain('t1');
    const { reasons } = rejected['t1'];
    const flagged = reasons.some(
      ({ kind }) => kind === DataErrorKinds.MISSING_METRIC,
    );
    expect(flagged).toBe(true);
  });

  it('disqualifies a task when a metric annotation has empty evaluators', () => {
    const raw = minimalData();
    // The metric key exists but no annotator has scored it.
    raw.results[0] = buildResult('t1', 'm1', 'Hi', { accuracy: {} });

    const [processed, rejected] = processData(raw);

    expect(processed.tasks).toHaveLength(0);
    const { reasons } = rejected['t1'];
    const flagged = reasons.some(
      ({ kind }) => kind === DataErrorKinds.MISSING_VALUE,
    );
    expect(flagged).toBe(true);
  });

  it('disqualifies a task when an annotation is missing the value field', () => {
    const raw = minimalData();
    // The annotator entry exists but has only a timestamp, no `value`.
    raw.results[0] = buildResult('t1', 'm1', 'Hi', {
      accuracy: { system: { timestamp: 123 } },
    });

    const [processed, rejected] = processData(raw);

    expect(processed.tasks).toHaveLength(0);
    const { reasons } = rejected['t1'];
    const flagged = reasons.some(
      ({ kind }) => kind === DataErrorKinds.MISSING_VALUE,
    );
    expect(flagged).toBe(true);
  });

  // --- Disqualification: incomplete model coverage ---

  it('disqualifies a task when not all models have results', () => {
    const raw = minimalData();
    // Keep only m1's result, dropping the one for m2.
    raw.results = raw.results.slice(0, 1);

    const [processed, rejected] = processData(raw);

    expect(processed.tasks).toHaveLength(0);
    const { reasons } = rejected['t1'];
    const flagged = reasons.some(
      ({ kind }) => kind === DataErrorKinds.MISSING_MODEL,
    );
    expect(flagged).toBe(true);
  });

  it('ignores results for models not in the models list', () => {
    const raw = minimalData();
    // Extra result whose model is absent from `raw.models`.
    raw.results.push(
      buildResult('t1', 'unknown_model', 'Yo', {
        accuracy: { system: { value: 0.5 } },
      }),
    );

    const [{ results }] = processData(raw);

    // Only the two results for the listed models should remain.
    expect(results).toHaveLength(2);
    const leaked = results.some(({ modelId }) => modelId === 'unknown_model');
    expect(leaked).toBe(false);
  });

  // --- Text metrics do not take part in qualification ---

  it('does not use text metrics for qualification', () => {
    const raw = minimalData({
      metrics: [
        {
          name: 'accuracy',
          author: 'algorithm',
          type: 'numerical',
          range: [0, 1],
        },
        { name: 'explanation', author: 'algorithm', type: 'text' },
      ],
    });

    // The fixture results carry 'accuracy' scores only; nothing is recorded
    // for 'explanation', yet the task is expected to qualify.
    const [{ tasks }] = processData(raw);

    expect(tasks).toHaveLength(1);
  });

  // --- Categorical metrics: ordering and bounds ---

  it('sorts categorical metric values by numericValue', () => {
    const raw = minimalData({
      metrics: [
        {
          name: 'quality',
          author: 'human',
          type: 'categorical',
          // Intentionally declared out of numeric order.
          values: [
            { value: 'high', numericValue: 3 },
            { value: 'low', numericValue: 1 },
            { value: 'medium', numericValue: 2 },
          ],
        },
      ],
    });
    raw.results = [
      buildResult('t1', 'm1', 'Hi', { quality: { human1: { value: 'high' } } }),
      buildResult('t1', 'm2', 'Hey', { quality: { human1: { value: 'low' } } }),
    ];

    const [{ metrics }] = processData(raw);
    const ordered = metrics.find(({ name }) => name === 'quality')?.values;

    expect(ordered?.[0].value).toBe('low');
    expect(ordered?.[2].value).toBe('high');
  });

  it('sets minValue and maxValue for categorical metrics', () => {
    const raw = minimalData({
      metrics: [
        {
          name: 'quality',
          author: 'human',
          type: 'categorical',
          values: [
            { value: 'good', numericValue: 1 },
            { value: 'bad', numericValue: 0 },
          ],
        },
      ],
    });
    raw.results = [
      buildResult('t1', 'm1', 'Hi', { quality: { h: { value: 'good' } } }),
      buildResult('t1', 'm2', 'Hey', { quality: { h: { value: 'bad' } } }),
    ];

    const [{ metrics }] = processData(raw);
    const quality = metrics.find(({ name }) => name === 'quality');

    expect(quality?.minValue).toEqual({ value: 'bad', numericValue: 0 });
    expect(quality?.maxValue).toEqual({ value: 'good', numericValue: 1 });
  });

  it('sets minValue and maxValue for numerical metrics with range', () => {
    const [{ metrics }] = processData(minimalData());
    const accuracy = metrics.find(({ name }) => name === 'accuracy');

    expect(accuracy?.minValue).toBe(0);
    expect(accuracy?.maxValue).toBe(1);
  });

  // --- Several tasks in one payload ---

  it('handles multiple tasks independently', () => {
    const raw = minimalData();
    raw.tasks.push({
      taskId: 't2',
      taskType: 'generation',
      input: 'Bye',
    } as any);
    // t2 is answered by m1 only, so it lacks a result for m2.
    raw.results.push(
      buildResult('t2', 'm1', 'Goodbye', {
        accuracy: { system: { value: 0.7 } },
      }),
    );

    const [processed, rejected] = processData(raw);

    expect(processed.tasks).toHaveLength(1);
    expect(processed.tasks[0].taskId).toBe('t1');
    expect(Object.keys(rejected)).toContain('t2');
  });

  // --- Filters ---

  it('preserves filters from raw data', () => {
    const [{ filters }] = processData(minimalData({ filters: ['category'] }));

    expect(filters).toEqual(['category']);
  });

  it('omits filters when not provided', () => {
    const [{ filters }] = processData(minimalData());

    expect(filters).toBeUndefined();
  });

  // --- Documents ---

  it('preserves documents from raw data', () => {
    const raw = minimalData({
      documents: [{ documentId: 'd1', text: 'doc text' }],
    });

    const [{ documents }] = processData(raw);

    expect(documents).toHaveLength(1);
  });

  // --- Labels pass-through ---

  it('preserves labels on qualified results with snake_case keys intact', () => {
    const raw = minimalData();
    const [firstResult] = raw.results as any[];
    firstResult.labels = {
      error_type: 'force_terminated',
      response_language: null,
    };

    const [{ results }] = processData(raw);
    const m1Result = results.find(
      ({ taskId, modelId }) => taskId === 't1' && modelId === 'm1',
    );

    // Compared against a separate literal so that any in-place rewrite of the
    // input labels by the processor would still be detected.
    expect(m1Result?.labels).toEqual({
      error_type: 'force_terminated',
      response_language: null,
    });
  });

  it('qualifies results that have no labels field', () => {
    const [{ results }] = processData(minimalData());

    expect(results).toHaveLength(2);
    for (const { labels } of results) {
      expect(labels).toBeUndefined();
    }
  });

  // --- Migrated flag ---

  it('sets migrated=true on the returned Data when the flag is passed in', () => {
    const [{ migrated }] = processData(minimalData(), true);

    expect(migrated).toBe(true);
  });

  it('omits migrated from Data when flag is false', () => {
    const [{ migrated }] = processData(minimalData(), false);

    expect(migrated).toBeUndefined();
  });
});