Spaces:

kpfadnis
/

InspectorRAGet

Running

App Files Files

kpfadnis committed on Sep 3, 2024

Commit

4594d15

1 Parent(s): d769391

fix (export): Minor fix to export with additional notebook to merge.

Browse files

Files changed (5) hide show

notebooks/merge_input_files.ipynb +208 -0
notebooks/validate_input_file.ipynb +487 -0
src/processor.ts +31 -7
src/types.ts +1 -0
src/utilities/objects.ts +32 -1

notebooks/merge_input_files.ipynb ADDED Viewed

	@@ -0,0 +1,208 @@

+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "# Merge input files\n",
+    "\n",
+    "### ✅ Prerequisites\n",
+    "\n",
+    "[Python 3.10](https://www.python.org/downloads/)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "> [!CAUTION]\n",
+    "> Please make sure all input files are valid before trying to merge them. You can check validity of each file with a [`validate_input_file`](./validate_input_file.ipynb) notebook.\n",
+    "\n",
+    "\n",
+    "> [!IMPORTANT]\n",
+    "> Only common `tasks` across all input files and associated documents, models and evaluations are preserved in the resultant file.\n",
+    "\n",
+    "### Merge function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Dict, Set\n",
+    "import json\n",
+    "\n",
+    "\n",
+    "# =========================================================\n",
+    "#                   HELPER FUNCTIONS\n",
+    "# =========================================================\n",
+    "def read_json(filename: str, encoding=\"utf-8\"):\n",
+    "    with open(filename, mode=\"r\", encoding=encoding) as fp:\n",
+    "        return json.load(fp)\n",
+    "\n",
+    "\n",
+    "def write_json(filename: str, content: dict, encoding=\"utf-8\"):\n",
+    "    with open(filename, mode=\"w\", encoding=encoding) as fp:\n",
+    "        return json.dump(content, fp)\n",
+    "\n",
+    "\n",
+    "# =========================================================\n",
+    "#                   MAIN FUNCTION\n",
+    "# =========================================================\n",
+    "def merge(inputs: list[dict]) -> dict:\n",
+    "    # Step 1: Return, if single JSON\n",
+    "    if len(inputs) == 1:\n",
+    "        return inputs[0]\n",
+    "\n",
+    "    # Step 2: When multiple input JSONs\n",
+    "    # Step 2.a: Initialize necessary variables\n",
+    "    merged_tasks: Dict[str, dict] = {}\n",
+    "    tasks_to_models: Dict[str, Set[str]] = {}\n",
+    "    evaluations: Dict[str, dict] = {}\n",
+    "    all_models = {}\n",
+    "    all_filters = set()\n",
+    "\n",
+    "    # Step 2.b: Iterate over each input JSON\n",
+    "    for entry in inputs:\n",
+    "        # Step 2.b.i: Add model to dictionary of all models, if not present already\n",
+    "        for model in entry[\"models\"]:\n",
+    "            if model[\"model_id\"] in all_models:\n",
+    "                if model[\"name\"] != all_models[model[\"model_id\"]][\"name\"]:\n",
+    "                    print(\n",
+    "                        f\"Mismatched model information for model with id: {model['model_id']}\"\n",
+    "                    )\n",
+    "            else:\n",
+    "                all_models[model[\"model_id\"]] = model\n",
+    "\n",
+    "        # Step 2.b.ii: Add filters to set of all filters\n",
+    "        if \"filters\" in entry and entry[\"filters\"]:\n",
+    "            for filter in entry[\"filters\"]:\n",
+    "                all_filters.add(filter)\n",
+    "\n",
+    "        # Step 2.b.iii: Iterate over each evaluation\n",
+    "        for evaluation in entry[\"evaluations\"]:\n",
+    "            # Step 2.b.iii.*: Extend map of task IDs to model IDs based on evaluations\n",
+    "            try:\n",
+    "                tasks_to_models[evaluation[\"task_id\"]].add(evaluation[\"model_id\"])\n",
+    "            except KeyError:\n",
+    "                tasks_to_models[evaluation[\"task_id\"]] = set([evaluation[\"model_id\"]])\n",
+    "\n",
+    "            # Step 2.b.iii.*: Extend evaluations map, if necessary\n",
+    "            if (\n",
+    "                f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\"\n",
+    "                not in evaluations\n",
+    "            ):\n",
+    "                evaluations[\n",
+    "                    f\"{evaluation['task_id']}<:SEP:>{evaluation['model_id']}\"\n",
+    "                ] = evaluation\n",
+    "\n",
+    "        # Step 2.b.iv: Create merged tasks as follows\n",
+    "        # 1. Merge comments for same task from different input JSONs\n",
+    "        # 2. Merge flagged status for same task from different input JSONs (preserved flagged=True, if any of the input JSONs has it to be 'True')\n",
+    "        for task in entry[\"tasks\"]:\n",
+    "            if task[\"task_id\"] in merged_tasks:\n",
+    "                if \"comments\" in task and task[\"comments\"]:\n",
+    "                    try:\n",
+    "                        merged_tasks[task[\"task_id\"]][\"comments\"].extend(\n",
+    "                            task[\"comments\"]\n",
+    "                        )\n",
+    "                    except KeyError:\n",
+    "                        merged_tasks[task[\"task_id\"]][\"comments\"] = list(task[\"comments\"])\n",
+    "\n",
+    "                if \"flagged\" in task:\n",
+    "                    try:\n",
+    "                        merged_tasks[task[\"task_id\"]][\"flagged\"] = (\n",
+    "                            merged_tasks[task[\"task_id\"]][\"flagged\"] or task[\"flagged\"]\n",
+    "                        )\n",
+    "                    except KeyError:\n",
+    "                        merged_tasks[task[\"task_id\"]][\"flagged\"] = task[\"flagged\"]\n",
+    "            else:\n",
+    "                merged_tasks[task[\"task_id\"]] = task\n",
+    "\n",
+    "    # Step 3: Find candidate models\n",
+    "    # Criterion: A group of models which has evaluations for all tasks\n",
+    "    candidate_models = {\n",
+    "        model_id: all_models[model_id]\n",
+    "        for model_id in set.intersection(*list(tasks_to_models.values()))\n",
+    "    }\n",
+    "\n",
+    "    # Step 4: Create potential filters\n",
+    "    candidate_filters = all_filters\n",
+    "    for task in merged_tasks.values():\n",
+    "        candidate_filters  = candidate_filters.intersection(task.keys())\n",
+    "\n",
+    "    # Step 5: Return\n",
+    "    if candidate_models:\n",
+    "        return {\n",
+    "            \"name\": f\"Merged from {len(inputs)} files\",\n",
+    "            \"filters\": list(candidate_filters),\n",
+    "            \"models\": list(candidate_models.values()),\n",
+    "            \"metrics\": inputs[0][\"metrics\"],\n",
+    "            \"documents\": inputs[0][\"documents\"],\n",
+    "            \"tasks\": inputs[0][\"tasks\"],\n",
+    "            \"evaluations\": [\n",
+    "                evaluations[f\"{task['task_id']}<:SEP:>{model_id}\"]\n",
+    "                for task in inputs[0][\"tasks\"]\n",
+    "                for model_id in candidate_models\n",
+    "            ],\n",
+    "        }\n",
+    "    else:\n",
+    "        print(\"Failed to find models with evaluations for all tasks.\")\n",
+    "        return None\n",
+    "\n",
+    "\n",
+    "# =========================================================\n",
+    "#                   EXECUTE\n",
+    "# =========================================================\n",
+    "# Step 1: Load input files to be merged\n",
+    "inputs = [\n",
+    "    read_json(\n",
+    "        filename=\"<PATH TO INPUT JSON 1>\"\n",
+    "    ),\n",
+    "    read_json(\n",
+    "        filename=\"<PATH TO INPUT JSON 2>\"\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "# Step 2: Run merging function\n",
+    "output = merge(inputs=inputs)\n",
+    "\n",
+    "# Step 3: Save merged output\n",
+    "if output:\n",
+    "    write_json(\n",
+    "        filename=\"<PATH TO MERGED FILE>\",\n",
+    "        content=output,\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

notebooks/validate_input_file.ipynb ADDED Viewed

	@@ -0,0 +1,487 @@

+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "# Validate analytics JSON\n",
+    "\n",
+    "### ✅ Prerequisites\n",
+    "\n",
+    "[Python 3.10](https://www.python.org/downloads/)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import Literal\n",
+    "import json\n",
+    "\n",
+    "def read_json(filename: str, encoding=\"utf-8\"):\n",
+    "    with open(filename, mode=\"r\", encoding=encoding) as fp:\n",
+    "        return json.load(fp)\n",
+    "\n",
+    "\n",
+    "def is_valid_model(model: dict) -> bool:\n",
+    "    if \"model_id\" not in model:\n",
+    "        raise ValueError(f\"Missing mandatory 'model_id' field in {model}\")\n",
+    "    if \"name\" not in model:\n",
+    "        raise ValueError(f\"Missing mandatory 'name' field in {model}\")\n",
+    "    if \"owner\" not in model:\n",
+    "        raise ValueError(f\"Missing mandatory 'owner' field in {model}\")\n",
+    "\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "def is_valid_metric(metric: dict) -> bool:\n",
+    "    def is_valid_metric_value(metric_value: dict) -> bool:\n",
+    "        # Validate \"value\" field\n",
+    "        if \"value\" not in metric_value or not metric_value[\"value\"]:\n",
+    "            raise ValueError(f\"Missing mandatory 'value' field in {metric_value}\")\n",
+    "\n",
+    "        if not (\n",
+    "            isinstance(metric_value[\"value\"], str)\n",
+    "            or isinstance(metric_value[\"value\"], float)\n",
+    "            or isinstance(metric_value[\"value\"], int)\n",
+    "        ):\n",
+    "            raise ValueError(\n",
+    "                f\"Invalid type: {type(metric_value['value'])} for 'value' field in {metric_value}\"\n",
+    "            )\n",
+    "\n",
+    "        return True\n",
+    "\n",
+    "    # Validate \"name\" field\n",
+    "    if \"name\" not in metric:\n",
+    "        raise ValueError(f\"Missing mandatory 'name' field in {metric}\")\n",
+    "\n",
+    "    if not isinstance(metric[\"name\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(metric['name'])} for 'name' field in {metric}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"author\" field\n",
+    "    if \"author\" not in metric:\n",
+    "        raise ValueError(f\"Missing mandatory 'author' field in {metric}\")\n",
+    "\n",
+    "    if not isinstance(metric[\"author\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(metric['author'])} for 'author' field in {metric}\"\n",
+    "        )\n",
+    "\n",
+    "    if metric[\"author\"] not in [\"human\", \"algorithm\"]:\n",
+    "        raise ValueError(f\"Unsupported author: {metric['author']} in {metric}\")\n",
+    "\n",
+    "    # Validate \"type\" field\n",
+    "    if \"type\" not in metric:\n",
+    "        raise ValueError(f\"Missing mandatory 'type' field in {metric}\")\n",
+    "\n",
+    "    if metric[\"type\"] not in [\"categorical\", \"numerical\", \"text\"]:\n",
+    "        raise ValueError(f\"Unsupported type: {metric['type']} in {metric}\")\n",
+    "\n",
+    "    # Validate \"categorical\" type  metric\n",
+    "    if metric[\"type\"] == \"categorical\" and (\n",
+    "        \"values\" not in metric or not metric[\"values\"]\n",
+    "    ):\n",
+    "        raise ValueError(\n",
+    "            f\"Missing mandatory 'values' field for 'categorical' type metric in {metric}\"\n",
+    "        )\n",
+    "\n",
+    "    if metric[\"type\"] == \"categorical\" and not all(\n",
+    "        [\n",
+    "            is_valid_metric_value(metric_value=metric_value)\n",
+    "            for metric_value in metric[\"values\"]\n",
+    "        ]\n",
+    "    ):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid metric values for 'categorical' type of metric in {metric}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"numerical\" type metric\n",
+    "    if metric[\"type\"] == \"numerical\" and not (\n",
+    "        \"range\" in metric and metric[\"range\"] and 2 <= len(metric[\"range\"]) <= 3\n",
+    "    ):\n",
+    "        raise ValueError(\n",
+    "            f\"Missing or invalid 'range' field for 'numerical' type of metric in {metric}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"aggregator\" field\n",
+    "    if metric[\"type\"] != \"text\" and \"aggregator\" not in metric:\n",
+    "        raise ValueError(f\"Missing mandatory 'aggregator' field in {metric}\")\n",
+    "\n",
+    "    if metric[\"type\"] == \"numerical\" and metric[\"aggregator\"] != \"average\":\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid 'aggregator' field for 'numerical' type of metric in {metric}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate 'display_name' field, if present\n",
+    "    if \"display_name\" in metric and not isinstance(metric[\"display_name\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(metric['display_name'])} for 'display_name' field in {metric}\"\n",
+    "        )\n",
+    "\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "def is_valid_document(document: dict) -> bool:\n",
+    "    # Validate \"document_id\" field\n",
+    "    if \"document_id\" not in document:\n",
+    "        raise ValueError(f\"Missing mandatory 'document_id' field in {document}\")\n",
+    "\n",
+    "    if not isinstance(document[\"document_id\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(document['document_id'])} for 'document_id' field in {document}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"text\" field\n",
+    "    if \"text\" not in document:\n",
+    "        raise ValueError(f\"Missing mandatory 'text' field in {document}\")\n",
+    "\n",
+    "    if not isinstance(document[\"text\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(document['text'])} for 'text' field in {document}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate 'title' field, if present\n",
+    "    if \"title\" in document and not isinstance(document[\"title\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(document['title'])} for 'title' field in {document}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate 'url' field, if present\n",
+    "    if \"url\" in document and not isinstance(document[\"url\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(document['url'])} for 'url' field in {document}\"\n",
+    "        )\n",
+    "\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "def is_valid_task(task: dict) -> bool:\n",
+    "    def is_valid_context(context: dict) -> bool:\n",
+    "        # Validate \"document_id\" field\n",
+    "        if \"document_id\" not in context:\n",
+    "            raise ValueError(f\"Missing mandatory 'document_id' field in {context}\")\n",
+    "\n",
+    "        if not isinstance(context[\"document_id\"], str):\n",
+    "            raise ValueError(\n",
+    "                f\"Invalid type: {type(context['document_id'])} for 'document_id' field in {context}\"\n",
+    "            )\n",
+    "\n",
+    "        return True\n",
+    "\n",
+    "    # Validate \"task_id\" field\n",
+    "    if \"task_id\" not in task:\n",
+    "        raise ValueError(f\"Missing mandatory 'task_id' field in {task}\")\n",
+    "\n",
+    "    if not isinstance(task[\"task_id\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(task['task_id'])} for 'task_id' field in {task}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"task_type\" field\n",
+    "    if \"task_type\" not in task:\n",
+    "        raise ValueError(f\"Missing mandatory 'task_type' field in {task}\")\n",
+    "\n",
+    "    if not isinstance(task[\"task_type\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(task['task_type'])} for 'task_type' field in {task}\"\n",
+    "        )\n",
+    "\n",
+    "    if task[\"task_type\"] not in [\"question_answering\", \"conversation\", \"rag\", \"text_generation\", \"json_generation\"]:\n",
+    "        raise ValueError(f\"Invalid task_type: {task['task_type']} in {task}\")\n",
+    "\n",
+    "    # Validate `contexts` field\n",
+    "    if not all([is_valid_context(context=context) for context in task[\"contexts\"]]):\n",
+    "        raise ValueError(f\"Invalid context values in {task}\")\n",
+    "\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "def is_valid_evaluation(\n",
+    "    evaluation: dict, metrics: list[str], models: list[str]\n",
+    ") -> bool:\n",
+    "    def is_valid_annotations(annotations: dict, metric: str) -> bool:\n",
+    "        for annotator_id, rating in annotations.items():\n",
+    "            if not isinstance(annotator_id, str):\n",
+    "                raise ValueError(\n",
+    "                    f\"Invalid type: {type(annotator_id)} for 'annotator_id' in {annotations} for '{metric}' metric in evaluation with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
+    "                )\n",
+    "\n",
+    "            if not isinstance(rating, dict):\n",
+    "                raise ValueError(\n",
+    "                    f\"Invalid type: {type(rating)} for 'rating' in {annotations} for '{metric}' metric in evaluation with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
+    "                )\n",
+    "\n",
+    "            # Validate \"value\" field\n",
+    "            if \"value\" not in rating:\n",
+    "                raise ValueError(\n",
+    "                    f\"Missing mandatory 'value' field in {rating} for '{metric}' metric in evaluation with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
+    "                )\n",
+    "\n",
+    "            if not (\n",
+    "                isinstance(rating[\"value\"], str)\n",
+    "                or isinstance(rating[\"value\"], float)\n",
+    "                or isinstance(rating[\"value\"], int)\n",
+    "            ):\n",
+    "                raise ValueError(\n",
+    "                    f\"Invalid type: {type(rating['value'])} for 'value' in {rating} for '{metric}' metric in evaluation with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
+    "                )\n",
+    "\n",
+    "        return True\n",
+    "\n",
+    "    # Validate \"task_id\" field\n",
+    "    if \"task_id\" not in evaluation:\n",
+    "        raise ValueError(f\"Missing mandatory 'task_id' field in {evaluation}\")\n",
+    "\n",
+    "    if not isinstance(evaluation[\"task_id\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(evaluation['task_id'])} for 'task_id' field in {evaluation}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"model_id\" field\n",
+    "    if \"model_id\" not in evaluation:\n",
+    "        raise ValueError(f\"Missing mandatory 'model_id' field in {evaluation}\")\n",
+    "\n",
+    "    if not isinstance(evaluation[\"model_id\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(evaluation['model_id'])} for 'model_id' field in {evaluation}\"\n",
+    "        )\n",
+    "\n",
+    "    if evaluation[\"model_id\"] not in models:\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid model with model_id: {evaluation['model_id']} for evaluation with task_id: {evaluation['task_id']}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"model_response\" field\n",
+    "    if \"model_response\" not in evaluation:\n",
+    "        raise ValueError(f\"Missing mandatory 'model_response' field in {evaluation}\")\n",
+    "\n",
+    "    if not isinstance(evaluation[\"model_response\"], str):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid type: {type(evaluation['model_response'])} for 'model_response' field in {evaluation}\"\n",
+    "        )\n",
+    "\n",
+    "    # Validate \"annotations\" field\n",
+    "    if \"annotations\" not in evaluation:\n",
+    "        raise ValueError(f\"Missing mandatory 'annotations' field in {evaluation}\")\n",
+    "\n",
+    "    if not all(\n",
+    "        is_valid_annotations(annotations=annotations, metric=metric)\n",
+    "        for metric, annotations in evaluation[\"annotations\"].items()\n",
+    "    ):\n",
+    "        raise ValueError(\n",
+    "            f\"Invalid annotations in evaluation with task_id: {evaluation['task_id']} and model_id: {evaluation['model_id']}\"\n",
+    "        )\n",
+    "\n",
+    "    return True\n",
+    "\n",
+    "\n",
+    "def validate(data: dict, level: Literal[\"minimal\", \"aggresive\"] = \"minimal\") -> None:\n",
+    "    # Validate \"models\" field\n",
+    "    if \"models\" not in data:\n",
+    "        raise ValueError(f\"Missing mandatory 'models' field in {data}\")\n",
+    "\n",
+    "    if not all(is_valid_model(model) for model in data[\"models\"]):\n",
+    "        raise ValueError(f\"Invalid model in {data['models']}\")\n",
+    "\n",
+    "    # Validate \"metrics\" field\n",
+    "    if \"metrics\" not in data:\n",
+    "        raise ValueError(f\"Missing mandatory 'metrics' field in {data}\")\n",
+    "\n",
+    "    if not all(is_valid_metric(metric) for metric in data[\"metrics\"]):\n",
+    "        raise ValueError(f\"Invalid metric in {data['metrics']}\")\n",
+    "\n",
+    "    # Validate \"documents\" field\n",
+    "    if \"documents\" not in data:\n",
+    "        raise ValueError(f\"Missing mandatory 'documents' field in {data}\")\n",
+    "\n",
+    "    if not all(is_valid_document(document) for document in data[\"documents\"]):\n",
+    "        raise ValueError(f\"Invalid document in {data['documents']}\")\n",
+    "\n",
+    "    # Validate \"tasks\" field\n",
+    "    if \"tasks\" not in data:\n",
+    "        raise ValueError(f\"Missing mandatory 'tasks' field in {data}\")\n",
+    "\n",
+    "    if not all(is_valid_task(task) for task in data[\"tasks\"]):\n",
+    "        raise ValueError(f\"Invalid task in {data['tasks']}\")\n",
+    "\n",
+    "    # Warn about duplicate task IDs\n",
+    "    task_ids = set()\n",
+    "    for task in data[\"tasks\"]:\n",
+    "        task_id = task[\"task_id\"]\n",
+    "        if task_id in task_ids:\n",
+    "            print(f\"Duplicate task_id: {task_id} found in 'tasks' field\")\n",
+    "        else:\n",
+    "            task_ids.add(task_id)\n",
+    "\n",
+    "    # Validate \"evaluations\" field\n",
+    "    if \"evaluations\" not in data:\n",
+    "        raise ValueError(f\"Missing mandatory 'evaluations' field in {data}\")\n",
+    "\n",
+    "    applicable_metrics = [metric[\"name\"] for metric in data[\"metrics\"]]\n",
+    "    applicable_models = [model[\"model_id\"] for model in data[\"models\"]]\n",
+    "    if not all(\n",
+    "        is_valid_evaluation(\n",
+    "            evaluation, metrics=applicable_metrics, models=applicable_models\n",
+    "        )\n",
+    "        for evaluation in data[\"evaluations\"]\n",
+    "    ):\n",
+    "        raise ValueError(f\"Invalid evaluation in {data['evaluations']}\")\n",
+    "\n",
+    "    # Validate evaluations exists for all task for all models with all metrics\n",
+    "    evaluated_models_per_task = {}\n",
+    "    evaluated_metrics_per_model_per_task = {}\n",
+    "    for evaluation in data[\"evaluations\"]:\n",
+    "        task_id = evaluation[\"task_id\"]\n",
+    "        model_id = evaluation[\"model_id\"]\n",
+    "        try:\n",
+    "            evaluated_models_per_task[task_id].append(model_id)\n",
+    "        except KeyError:\n",
+    "            evaluated_models_per_task[task_id] = [model_id]\n",
+    "\n",
+    "        for metric in evaluation[\"annotations\"].keys():\n",
+    "            try:\n",
+    "                evaluated_metrics_per_model_per_task[f\"{task_id}:++:{model_id}\"].append(\n",
+    "                    metric\n",
+    "                )\n",
+    "            except KeyError:\n",
+    "                evaluated_metrics_per_model_per_task[f\"{task_id}:++:{model_id}\"] = [\n",
+    "                    metric\n",
+    "                ]\n",
+    "\n",
+    "    evaluated_task_ids = set(evaluated_models_per_task.keys())\n",
+    "    if evaluated_task_ids != task_ids:\n",
+    "        if len(evaluated_task_ids) > len(task_ids):\n",
+    "            print(\n",
+    "                f\"Evaluations found for following additional tasks: {evaluated_task_ids - task_ids}\"\n",
+    "            )\n",
+    "        elif len(task_ids) > len(evaluated_task_ids):\n",
+    "            print(\n",
+    "                f\"Missing evaluations for following tasks: {task_ids - evaluated_task_ids}\"\n",
+    "            )\n",
+    "        else:\n",
+    "            print(\n",
+    "                f\"Missing evaluations for following tasks: {task_ids - evaluated_task_ids}\"\n",
+    "            )\n",
+    "            print(\n",
+    "                f\"Evaluations found for following additional tasks: {evaluated_task_ids - task_ids}\"\n",
+    "            )\n",
+    "\n",
+    "    evaluations_with_missing_models = {}\n",
+    "    evaluations_with_additional_models = {}\n",
+    "    for task_id, models in evaluated_models_per_task.items():\n",
+    "        if set(models) != set(applicable_models):\n",
+    "            if set(applicable_models) - set(models):\n",
+    "                evaluations_with_missing_models[task_id] = set(applicable_models) - set(\n",
+    "                    models\n",
+    "                )\n",
+    "            elif set(models) - set(applicable_models):\n",
+    "                evaluations_with_additional_models[task_id] = set(models) - set(\n",
+    "                    applicable_models\n",
+    "                )\n",
+    "\n",
+    "    if evaluations_with_missing_models:\n",
+    "        for task_id, missing_models in evaluations_with_missing_models.items():\n",
+    "            print(\n",
+    "                f\"Missing following models: {missing_models} for task with task_id: {task_id}\"\n",
+    "            )\n",
+    "\n",
+    "    evaluations_per_model_with_missing_metrics = {}\n",
+    "    evaluations_per_model_with_additional_metrics = {}\n",
+    "    for key, metrics in evaluated_metrics_per_model_per_task.items():\n",
+    "        if set(metrics) != set(applicable_metrics):\n",
+    "            if set(applicable_metrics) - set(metrics):\n",
+    "                evaluations_per_model_with_missing_metrics[key] = set(\n",
+    "                    applicable_metrics\n",
+    "                ) - set(metrics)\n",
+    "            elif set(metrics) - set(applicable_metrics):\n",
+    "                evaluations_per_model_with_additional_metrics[key] = set(metrics) - set(\n",
+    "                    applicable_metrics\n",
+    "                )\n",
+    "\n",
+    "    if evaluations_per_model_with_missing_metrics:\n",
+    "        for key, missing_metrics in evaluations_per_model_with_missing_metrics.items():\n",
+    "            segments = key.split(\":++:\")\n",
+    "            print(\n",
+    "                f\"Missing following metrics: {missing_metrics} for task with task_id: {segments[0]} and model_id: {segments[1]}\"\n",
+    "            )\n",
+    "\n",
+    "    # Additional checks\n",
+    "    if level == \"aggresive\":\n",
+    "        if evaluations_with_additional_models:\n",
+    "            print(\"====================================================\")\n",
+    "            print(\"Evaluations with additional models\")\n",
+    "            print(\"====================================================\")\n",
+    "            for (\n",
+    "                task_id,\n",
+    "                additional_models,\n",
+    "            ) in evaluations_with_additional_models.items():\n",
+    "                print(f\"Task ID: {task_id}\\tAdditional models: {additional_models}\")\n",
+    "\n",
+    "        if evaluations_per_model_with_additional_metrics:\n",
+    "            print(\"====================================================\")\n",
+    "            print(\"Evaluations with additional metrics\")\n",
+    "            print(\"====================================================\")\n",
+    "            for (\n",
+    "                key,\n",
+    "                additional_metrics,\n",
+    "            ) in evaluations_per_model_with_additional_metrics.items():\n",
+    "                segments = key.split(\":++:\")\n",
+    "                print(\n",
+    "                    f\"Task ID: {segments[0]}\\tModel: {segments[1]}\\tAdditional metrics: {additional_metrics}\"\n",
+    "                )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run validator\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "validate(\n",
+    "    data=read_json(\n",
+    "        filename=\"<PATH_TO_INPUT_FILE>\"\n",
+    "    ),\n",
+    "    level=\"aggresive\",\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}

src/processor.ts CHANGED Viewed

@@ -18,7 +18,7 @@
 import { isEmpty, isNumber } from 'lodash';
 import { hash } from '@/src/utilities/strings';
 import {
   Data,
   MetricValue,
@@ -350,7 +350,15 @@ export function exportData(
         documents: data.documents,
       }),
       tasks: data.tasks,
-      evaluations: data.evaluations,
     };
     // Step 1: If tasks are defined
@@ -401,9 +409,17 @@ export function exportData(
             documents: Array.from(relevantDocuments),
           }),
           tasks: tasks,
-          evaluations: data.evaluations.filter((evaluation) =>
-            relevantTaskIds.has(evaluation.taskId),
-          ),
         };
       } else {
         // Step 1.b: Create an object to be exported by copying over tasks information
@@ -416,7 +432,15 @@ export function exportData(
             documents: data.documents,
           }),
           tasks: tasks,
-          evaluations: data.evaluations,
         };
       }
     }
@@ -428,7 +452,7 @@ export function exportData(
     element.setAttribute(
       'href',
       'data:application/json;charset=utf-8, ' +
-        encodeURIComponent(JSON.stringify(dataToExport)),
     );
     element.setAttribute('download', 'analytics.json');

 import { isEmpty, isNumber } from 'lodash';
 import { hash } from '@/src/utilities/strings';
+import { snakeCaseKeys } from '@/src/utilities/objects';
 import {
   Data,
   MetricValue,
         documents: data.documents,
       }),
       tasks: data.tasks,
+      evaluations: data.evaluations.map((evaluation) => {
+        return {
+          taskId: evaluation.taskId,
+          modelId: evaluation.modelId,
+          modelResponse: evaluation.modelResponse,
+          annotations: evaluation.annotations,
+          ...(evaluation.contexts && { contexts: evaluation.contexts }),
+        };
+      }),
     };
     // Step 1: If tasks are defined
             documents: Array.from(relevantDocuments),
           }),
           tasks: tasks,
+          evaluations: data.evaluations
+            .filter((evaluation) => relevantTaskIds.has(evaluation.taskId))
+            .map((evaluation) => {
+              return {
+                taskId: evaluation.taskId,
+                modelId: evaluation.modelId,
+                modelResponse: evaluation.modelResponse,
+                annotations: evaluation.annotations,
+                ...(evaluation.contexts && { contexts: evaluation.contexts }),
+              };
+            }),
         };
       } else {
         // Step 1.b: Create an object to be exported by copying over tasks information
             documents: data.documents,
           }),
           tasks: tasks,
+          evaluations: data.evaluations.map((evaluation) => {
+            return {
+              taskId: evaluation.taskId,
+              modelId: evaluation.modelId,
+              modelResponse: evaluation.modelResponse,
+              annotations: evaluation.annotations,
+              ...(evaluation.contexts && { contexts: evaluation.contexts }),
+            };
+          }),
         };
       }
     }
     element.setAttribute(
       'href',
       'data:application/json;charset=utf-8, ' +
+        encodeURIComponent(JSON.stringify(snakeCaseKeys(dataToExport))),
     );
     element.setAttribute('download', 'analytics.json');

src/types.ts CHANGED Viewed

@@ -182,6 +182,7 @@ export interface Annotation {
   readonly timestamp?: number;
   readonly duration?: number;
 }
 export interface TaskEvaluation {
   readonly taskId: string;
   readonly modelId: string;

   readonly timestamp?: number;
   readonly duration?: number;
 }
 export interface TaskEvaluation {
   readonly taskId: string;
   readonly modelId: string;

src/utilities/objects.ts CHANGED Viewed

@@ -16,7 +16,7 @@
  *
  **/
-import { camelCase, isPlainObject, isArray, isEmpty } from 'lodash';
 export function camelCaseKeys(
   obj: { [key: string]: any },
@@ -52,6 +52,37 @@ export function camelCaseKeys(
   return obj;
 }
 function areArraysIntersecting(
   a: string | string[],
   b: string | string[],

  *
  **/
+import { camelCase, snakeCase, isPlainObject, isArray, isEmpty } from 'lodash';
 export function camelCaseKeys(
   obj: { [key: string]: any },
   return obj;
 }
+/**
+ * Recursively renames selected camelCase keys to snake_case.
+ *
+ * Arrays are mapped element by element and plain objects are rebuilt key by
+ * key; any other value (primitives, null, non-plain objects) is returned
+ * unchanged. Only keys listed in `keys` are renamed (via lodash `snakeCase`);
+ * all other keys are kept verbatim, but their values are still traversed.
+ *
+ * @param obj - Value to convert (array, plain object, or leaf value).
+ * @param keys - Key names to rename; defaults to the export-related fields.
+ * @returns A new structure with the listed keys renamed; the input is not mutated.
+ */
+export function snakeCaseKeys(
+  obj: { [key: string]: any },
+  keys: string[] = [
+    'taskId',
+    'modelId',
+    'modelResponse',
+    'displayValue',
+    'numericValue',
+    'minValue',
+    'maxValue',
+    'taskType',
+    'documentId',
+    'displayName',
+  ],
+) {
+  if (isArray(obj)) {
+    // Forward `keys` so a caller-supplied list also applies to nested values.
+    return obj.map((v) => snakeCaseKeys(v, keys));
+  } else if (isPlainObject(obj)) {
+    // Build the result in place instead of re-spreading the accumulator per key.
+    const result: { [key: string]: any } = {};
+    for (const key of Object.keys(obj)) {
+      const outputKey = keys.includes(key) ? snakeCase(key) : key;
+      result[outputKey] = snakeCaseKeys(obj[key], keys);
+    }
+    return result;
+  }
+  return obj;
+}
 function areArraysIntersecting(
   a: string | string[],
   b: string | string[],