davebulaval committed
Commit 8fa3acc · 1 Parent(s): 2857720
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. Dockerfile +69 -0
  2. frontend/README.md +38 -0
  3. frontend/cole.pdf +0 -0
  4. frontend/eslint.config.mjs +14 -0
  5. frontend/jsconfig.json +7 -0
  6. frontend/next.config.mjs +4 -0
  7. frontend/package-lock.json +0 -0
  8. frontend/postcss.config.mjs +5 -0
  9. frontend/src/app/FAQ/page.js +45 -0
  10. frontend/src/app/benchmarks/page.js +181 -0
  11. frontend/src/app/components/BigBlueButton.js +17 -0
  12. frontend/src/app/components/ClientHeader.js +17 -0
  13. frontend/src/app/components/CodeBlock.js +10 -0
  14. frontend/src/app/components/ErrorMessage.js +14 -0
  15. frontend/src/app/components/LanguageSwitcher.js +22 -0
  16. frontend/src/app/components/Modal.js +20 -0
  17. frontend/src/app/components/ModalManager.js +26 -0
  18. frontend/src/app/components/ModelDetailsModal.js +44 -0
  19. frontend/src/app/components/SubmitForm.js +177 -0
  20. frontend/src/app/components/UploadButton.js +27 -0
  21. frontend/src/app/components/taskbar.js +64 -0
  22. frontend/src/app/contact/page.js +32 -0
  23. frontend/src/app/en/translation.json +132 -0
  24. frontend/src/app/fr/translation.json +135 -0
  25. frontend/src/app/globals.css +26 -0
  26. frontend/src/app/guide/page.js +76 -0
  27. frontend/src/app/i18n.js +28 -0
  28. frontend/src/app/layout.js +37 -0
  29. frontend/src/app/leaderboard/page.js +272 -0
  30. frontend/src/app/leaderboard/util.js +47 -0
  31. frontend/src/app/page.js +74 -0
  32. frontend/src/app/papers/page.js +31 -0
  33. frontend/src/app/resources/BenchmarksResource.js +35 -0
  34. frontend/src/app/resources/ResourcesPaths.js +2 -0
  35. frontend/src/app/results/[id]/page.js +128 -0
  36. frontend/src/app/results/page.js +31 -0
  37. src/__init__.py +3 -0
  38. src/backend/__init__.py +0 -0
  39. src/backend/evaluation.py +36 -0
  40. src/backend/results/leaderboard.json +0 -0
  41. src/backend/submission_api.py +224 -0
  42. src/backend/submit_tools.py +19 -0
  43. src/backend/validation_tools.py +93 -0
  44. src/dataset/__init__.py +0 -0
  45. src/dataset/dataset.py +96 -0
  46. src/dataset/datasets_data.py +602 -0
  47. src/dataset/prompt_builder.py +43 -0
  48. src/docker_requirements.txt +17 -0
  49. src/evaluation/__init__.py +0 -0
  50. src/evaluation/evaluation_pipeline.py +138 -0
Dockerfile ADDED
@@ -0,0 +1,69 @@
+ # Build frontend
+ FROM node:18 as frontend-build
+ WORKDIR /app/frontend
+ COPY frontend/package*.json ./
+ RUN npm install
+ COPY frontend/ ./
+ RUN npm run build
+
+ # Build backend
+ FROM python:3.12-slim as backend
+ WORKDIR /app
+
+ # Install dependencies including nginx
+ RUN apt-get update && apt-get install -y nginx \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY src/docker_requirements.txt /app/src/
+ RUN pip install --upgrade pip wheel
+ RUN pip install --cache-dir=~/.cache/pip --prefer-binary pyarrow pandas numpy scipy fsspec aiohttp tqdm --progress-bar off -v
+ RUN pip install --cache-dir=~/.cache/pip -r /app/src/docker_requirements.txt -v --prefer-binary && rm -rf ~/.cache/pip
+ COPY src/ /app/src/
+
+ # Copy Nginx config (adjust path if needed)
+ COPY nginx.conf /etc/nginx/nginx.conf
+
+ COPY --from=frontend-build /app/frontend /app/frontend
+
+ # Create non-root user
+ RUN useradd -m -u 1000 user
+
+ # Create and configure cache directory
+ RUN mkdir -p /app/.cache && \
+     chown -R user:user /app
+
+ # Environment variables
+ ENV HF_HOME=/app/.cache \
+     HF_DATASETS_CACHE=/app/.cache \
+     INTERNAL_API_PORT=7861 \
+     PORT=7860
+
+ WORKDIR /app
+ COPY nginx.conf /etc/nginx/nginx.conf
+ COPY start.sh /start.sh
+ RUN chmod +x /start.sh
+ RUN chown -R user:user /var/lib/nginx
+
+ RUN apt-get update && apt-get install -y nginx \
+     && groupadd -r nginx && useradd -r -g nginx nginx
+
+ # Give user nginx write permissions
+ RUN mkdir -p /var/lib/nginx && chown -R user:user /var/lib/nginx
+ RUN mkdir -p /var/log/nginx && chown -R user:user /var/log/nginx
+ RUN mkdir -p /app/logs && chown -R user:user /app/logs
+ RUN mkdir -p /run && touch /run/nginx.pid && chown -R user:user /run
+
+ RUN apt-get update && apt-get install -y \
+     curl \
+     netcat-openbsd \
+     && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+     && apt-get install -y nodejs \
+     && rm -rf /var/lib/apt/lists/*
+
+
+ # Note: HF_TOKEN should be provided at runtime, not build time
+ USER user
+ EXPOSE 7860
+
+ # Start both servers with wait-for
+ CMD ["sh", "/start.sh"]
frontend/README.md ADDED
@@ -0,0 +1,38 @@
+ This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
+
+ ## Getting Started
+
+ First, run the development server:
+
+ ```bash
+ npm install
+
+ npm run dev
+ # or
+ yarn dev
+ # or
+ pnpm dev
+ # or
+ bun dev
+ ```
+
+ Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
+
+ You can start editing the page by modifying `app/page.js`. The page auto-updates as you edit the file.
+
+ This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel.
+
+ ## Learn More
+
+ To learn more about Next.js, take a look at the following resources:
+
+ - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
+ - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
+
+ You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
+
+ ## Deploy on Vercel
+
+ The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
+
+ Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.
frontend/cole.pdf ADDED
The diff for this file is too large to render.
frontend/eslint.config.mjs ADDED
@@ -0,0 +1,14 @@
+ import { dirname } from "path";
+ import { fileURLToPath } from "url";
+ import { FlatCompat } from "@eslint/eslintrc";
+
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = dirname(__filename);
+
+ const compat = new FlatCompat({
+   baseDirectory: __dirname,
+ });
+
+ const eslintConfig = [...compat.extends("next/core-web-vitals")];
+
+ export default eslintConfig;
frontend/jsconfig.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "compilerOptions": {
+     "paths": {
+       "@/*": ["./src/*"]
+     }
+   }
+ }
frontend/next.config.mjs ADDED
@@ -0,0 +1,4 @@
+ /** @type {import('next').NextConfig} */
+ const nextConfig = {};
+
+ export default nextConfig;
frontend/package-lock.json ADDED
The diff for this file is too large to render.
frontend/postcss.config.mjs ADDED
@@ -0,0 +1,5 @@
+ const config = {
+   plugins: ["@tailwindcss/postcss"],
+ };
+
+ export default config;
frontend/src/app/FAQ/page.js ADDED
@@ -0,0 +1,45 @@
+ 'use client';
+
+ import '../i18n';
+ import { useState } from 'react';
+ import { useTranslation } from 'react-i18next';
+
+ export default function FAQ() {
+   const { t } = useTranslation();
+   const faqs = t('faqs', { returnObjects: true });
+   const [openIndex, setOpenIndex] = useState(null);
+
+   const toggle = (index) => {
+     setOpenIndex(openIndex === index ? null : index);
+   };
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('faq_title')}
+       </h2>
+
+       <div className="space-y-4">
+         {faqs.map((faq, i) => (
+           <div
+             key={i}
+             className="p-6 bg-white border border-gray-200 rounded-lg shadow-sm transition"
+           >
+             <button
+               className="w-full text-left text-xl font-semibold text-gray-800 flex justify-between items-center"
+               onClick={() => toggle(i)}
+             >
+               <span>{`${i + 1}. ${faq.question}`}</span>
+               <span className="text-2xl text-gray-500">
+                 {openIndex === i ? '▴' : '▾'}
+               </span>
+             </button>
+             {openIndex === i && (
+               <p className="mt-4 text-gray-600 text-sm">{faq.answer}</p>
+             )}
+           </div>
+         ))}
+       </div>
+     </div>
+   );
+ }
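A note on the `returnObjects` option used in the FAQ page above: `t()` normally returns a string, so fetching the structured `faqs` array from `translation.json` needs this flag. A minimal sketch, assuming the `i18n` instance configured later in this commit (`src/app/i18n.js`):

```js
// With returnObjects, i18next hands back the raw array from translation.json
// instead of coercing the value to a string.
import i18n from './i18n';

const faqs = i18n.t('faqs', { returnObjects: true });
// e.g. [{ question: 'How can I evaluate my model?', answer: '...' }, ...]
faqs.forEach(({ question }) => console.log(question));
```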
frontend/src/app/benchmarks/page.js ADDED
@@ -0,0 +1,181 @@
+ 'use client';
+
+ import '../i18n';
+ import { useTranslation } from 'react-i18next';
+ import Link from 'next/link';
+
+ export default function Benchmarks() {
+   const { t } = useTranslation();
+
+   return (
+     <div suppressHydrationWarning>
+       <div className="max-w-3xl mx-auto px-2 py-3">
+         <p className="text-1.5xl text-left text-gray-800">
+           {t('benchmarksIntro')}
+         </p>
+       </div>
+       <div className="space-y-8">
+         <Benchmark
+           title={t('benchmark_alloCine_title')}
+           link="https://huggingface.co/datasets/CATIE-AQ/allocine_fr_prompt_sentiment_analysis"
+           description={t('benchmark_alloCine_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_daccord_title')}
+           link="https://huggingface.co/datasets/maximoss/daccord-contradictions"
+           description={t('benchmark_daccord_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_fquad_title')}
+           link="https://arxiv.org/pdf/2002.06071"
+           description={t('benchmark_fquad_description')}
+           metrics="F1 Score, Exact Match Ratio"
+         />
+         <Benchmark
+           title={t('benchmark_french_boolq_title')}
+           link="https://huggingface.co/datasets/manu/french_boolq"
+           description={t('benchmark_french_boolq_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_fracas_title')}
+           link="https://huggingface.co/datasets/maximoss/fracas"
+           description={t('benchmark_fracas_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_gqnli_title')}
+           link="https://huggingface.co/datasets/maximoss/gqnli-fr"
+           description={t('benchmark_gqnli_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_lingnli_title')}
+           link="https://huggingface.co/datasets/maximoss/lingnli-multi-mt"
+           description={t('benchmark_lingnli_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_mms_title')}
+           link="https://huggingface.co/datasets/Brand24/mms"
+           description={t('benchmark_mms_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_mnli_nineeleven_fr_mt_title')}
+           link="https://huggingface.co/datasets/maximoss/mnli-nineeleven-fr-mt"
+           description={t('benchmark_mnli_nineeleven_fr_mt_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_multiblimp_title')}
+           link="https://huggingface.co/datasets/jumelet/multiblimp"
+           description={t('benchmark_multiblimp_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_paws_title')}
+           link="https://huggingface.co/datasets/google-research-datasets/paws-x"
+           description={t('benchmark_paws_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_piaf_title')}
+           link="https://aclanthology.org/2020.lrec-1.673/"
+           description={t('benchmark_piaf_description')}
+           metrics="F1 Score, Exact Match Ratio"
+         />
+         <Benchmark
+           title={t('benchmark_qfrblimp_title')}
+           link="https://github.com/davebulaval/FrBLiMP"
+           description={t('benchmark_qfrblimp_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_qfrcola_title')}
+           link="https://github.com/davebulaval/qfrcola"
+           description={t('benchmark_qfrcola_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_qfrcore_title')}
+           link=""
+           description={t('benchmark_qfrcore_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_qfrcort_title')}
+           link=""
+           description={t('benchmark_qfrcort_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_rte3_french_title')}
+           link="https://huggingface.co/datasets/maximoss/rte3-french"
+           description={t('benchmark_rte3_french_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_sickfr_title')}
+           link="https://huggingface.co/datasets/Lajavaness/SICK-fr"
+           description={t('benchmark_sickfr_description')}
+           metrics="Pearson"
+         />
+         <Benchmark
+           title={t('benchmark_sts22_title')}
+           link="https://huggingface.co/datasets/mteb/sts22-crosslingual-sts/viewer/fr"
+           description={t('benchmark_sts22_description')}
+           metrics="Pearson"
+         />
+         <Benchmark
+           title={t('benchmark_wino_x_lm_title')}
+           link="https://huggingface.co/datasets/demelin/wino_x/viewer/lm_en_fr?views%5B%5D=lm_en_fr"
+           description={t('benchmark_wino_x_lm_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_wino_x_mt_title')}
+           link="https://huggingface.co/datasets/demelin/wino_x/viewer/mt_en_fr"
+           description={t('benchmark_wino_x_mt_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_wsd_title')}
+           link="https://huggingface.co/datasets/GETALP/flue"
+           description={t('benchmark_wsd_description')}
+           metrics="Exact Match Ratio"
+         />
+         <Benchmark
+           title={t('benchmark_xnli_title')}
+           link="https://github.com/facebookresearch/XNLI"
+           description={t('benchmark_xnli_description')}
+           metrics="Accuracy"
+         />
+       </div>
+     </div>
+   );
+ }
+
+ function Benchmark({ title, description, metrics, link }) {
+   const { t } = useTranslation();
+
+   return (
+     <div className="p-6 bg-white border border-gray-200 rounded-lg shadow-sm">
+       <h3 className="text-xl font-semibold text-blue-700 mb-2 border-b-2 border-blue-500 inline-block">
+         {link ? (
+           <Link href={link} className="hover:underline">
+             {title}
+           </Link>
+         ) : (
+           title
+         )}
+       </h3>
+       <p className="text-gray-700 mb-2">{description}</p>
+       <p className="text-sm text-gray-500">
+         <span className="font-medium">{t('metrics')}</span> {metrics}
+       </p>
+     </div>
+   );
+ }
frontend/src/app/components/BigBlueButton.js ADDED
@@ -0,0 +1,17 @@
+
+
+ export default function BigBlueButton({ children, onClick, disabled }) {
+   return (
+     <button
+       onClick={onClick}
+       disabled={disabled}
+       className={`px-4 py-2 text-white text-base font-medium rounded-md shadow-sm focus:outline-none focus:ring-2 ${
+         disabled
+           ? "bg-gray-400 cursor-not-allowed"
+           : "bg-blue-500 hover:bg-blue-600 focus:ring-blue-300"
+       }`}
+     >
+       {children}
+     </button>
+   );
+ }
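One behavior to keep in mind: `BigBlueButton` only accepts `children`, `onClick`, and `disabled`, so a `className` passed by a caller (as `SubmitForm` does later in this commit) is silently dropped. A hypothetical usage sketch:

```jsx
import BigBlueButton from './BigBlueButton';

export function SaveButton({ onSave }) {
  // The className prop below has no effect; the component does not forward it.
  return (
    <BigBlueButton onClick={onSave} className="mt-4">
      Save
    </BigBlueButton>
  );
}
```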
frontend/src/app/components/ClientHeader.js ADDED
@@ -0,0 +1,17 @@
+ 'use client';
+
+ import '../i18n';
+ import { useTranslation } from 'react-i18next';
+ import Taskbar from './taskbar';
+ import { LanguageSwitcher } from './LanguageSwitcher';
+
+ export default function ClientHeader() {
+   useTranslation();
+
+   return (
+     <header className="flex items-center justify-between px-4 py-3 shadow">
+       <Taskbar />
+       <LanguageSwitcher />
+     </header>
+   );
+ }
frontend/src/app/components/CodeBlock.js ADDED
@@ -0,0 +1,10 @@
+ export default function CodeBlock({ children }) {
+   return (
+     <pre className="bg-gray-100 p-4 rounded-md overflow-x-auto text-sm text-gray-800 mt-4">
+       <code className="font-mono">
+         {children}
+       </code>
+     </pre>
+
+   );
+ }
frontend/src/app/components/ErrorMessage.js ADDED
@@ -0,0 +1,14 @@
+ export default function ErrorMessage({ children, condition }) {
+   return (
+     <div className="pt-2">
+       <div className="pt-2 space-y-2">
+         {condition && (
+           <div className="text-red-600 text-sm font-medium">
+             {children}
+           </div>
+         )}
+       </div>
+     </div>
+   );
+
+ }
frontend/src/app/components/LanguageSwitcher.js ADDED
@@ -0,0 +1,22 @@
+ 'use client';
+
+ import { useTranslation } from 'react-i18next';
+
+ export function LanguageSwitcher() {
+   const { i18n } = useTranslation();
+
+   const changeLanguage = (lng) => {
+     i18n.changeLanguage(lng);
+   };
+
+   return (
+     <div className="flex space-x-2">
+       <button onClick={() => changeLanguage('en')} className="px-2 py-1 rounded-lg border">
+         EN
+       </button>
+       <button onClick={() => changeLanguage('fr')} className="px-2 py-1 rounded-lg border">
+         FR
+       </button>
+     </div>
+   );
+ }
frontend/src/app/components/Modal.js ADDED
@@ -0,0 +1,20 @@
+ "use client";
+ import BigBlueButton from "./BigBlueButton";
+
+ export default function Modal({ children, onClose }) {
+   return (
+     <div className="fixed inset-0 bg-gray-600 bg-opacity-25 overflow-y-auto h-full w-full flex items-center justify-center z-50"
+       style={{ backgroundColor: 'rgba(75, 85, 99, 0.55)' }}>
+       <div className="p-8 border w-96 shadow-lg rounded-md bg-white">
+         <div className="text-center text-black">
+           {children}
+           <div className="flex justify-center mt-4">
+             <BigBlueButton onClick={onClose}>
+               Close
+             </BigBlueButton>
+           </div>
+         </div>
+       </div>
+     </div>
+   );
+ }
frontend/src/app/components/ModalManager.js ADDED
@@ -0,0 +1,26 @@
+ 'use client';
+
+ import { useSearchParams, useRouter } from 'next/navigation';
+ import Modal from './Modal';
+ import SubmitForm from './SubmitForm';
+
+ export default function ModalManager() {
+   const searchParams = useSearchParams();
+   const submitModal = searchParams.get("show") === "submit";
+   const router = useRouter();
+
+   const handleClose = () => {
+     const newUrl = window.location.pathname;
+     router.push(newUrl);
+   };
+
+   return (
+     <>
+       {submitModal && (
+         <Modal onClose={handleClose}>
+           <SubmitForm />
+         </Modal>
+       )}
+     </>
+   );
+ }
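`ModalManager` makes the submission form URL-driven: rendering any page with `?show=submit` mounts the modal, and closing it navigates back to the bare pathname. A sketch of a link that opens it (the taskbar later in this commit builds the same href from `usePathname()`):

```jsx
import Link from 'next/link';

// Navigating to a page with ?show=submit causes ModalManager to mount the form.
export function OpenSubmitModalLink() {
  return <Link href="/leaderboard?show=submit">Submit your results</Link>;
}
```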
frontend/src/app/components/ModelDetailsModal.js ADDED
@@ -0,0 +1,44 @@
+ "use client";
+ import Modal from "./Modal";
+
+ // Utility function to clean up the benchmark name
+ const getCleanBenchmarkName = (name) => {
+   const parts = name.split("|");
+   if (parts.length >= 2) return parts[1];
+   return name;
+ };
+
+ export default function ModelDetailsModal({ entry, onClose }) {
+   const modelName = entry.display_name || entry.name?.replace(".json", "") || "Unknown Model";
+
+   return (
+     <Modal onClose={onClose}>
+       <div className="space-y-4 max-w-2xl mx-auto">
+         {/* Header with the model name */}
+         <div className="text-center sticky top-0 bg-white z-10 pb-2">
+           <h3 className="text-2xl font-bold text-blue-700">{modelName}</h3>
+           <p className="text-sm text-gray-500">Model Details</p>
+         </div>
+
+         {/* Benchmark results */}
+         <div className="space-y-4">
+           {Object.entries(entry.results || {}).map(([benchmark, metrics]) => (
+             <div key={benchmark} className="p-4 border rounded-md bg-white shadow-md">
+               <h4 className="text-lg font-medium text-blue-600 mb-2">
+                 📊 {getCleanBenchmarkName(benchmark)}
+               </h4>
+               <ul className="ml-4 text-sm text-gray-700 list-disc">
+                 {Object.entries(metrics).map(([metric, value]) => (
+                   <li key={metric}>
+                     <strong>{metric}</strong>:{" "}
+                     {typeof value === "number" ? value.toFixed(4) : value}
+                   </li>
+                 ))}
+               </ul>
+             </div>
+           ))}
+         </div>
+       </div>
+     </Modal>
+   );
+ }
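`getCleanBenchmarkName` assumes benchmark keys may carry a `|`-separated prefix and keeps the second segment; keys without a `|` pass through unchanged. A small illustration (the key format shown is an assumption, not part of this commit):

```js
getCleanBenchmarkName('classification|allocine'); // -> 'allocine'
getCleanBenchmarkName('allocine');                // -> 'allocine' (no '|', returned unchanged)
```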
frontend/src/app/components/SubmitForm.js ADDED
@@ -0,0 +1,177 @@
+ 'use client';
+
+ import { useState } from "react";
+ import { useRouter } from "next/navigation";
+ import ErrorMessage from "./ErrorMessage";
+ import { BACKEND_ADDRESS } from "@/app/resources/ResourcesPaths";
+ import { Trans } from 'react-i18next';
+ import BigBlueButton from "./BigBlueButton";
+ import { useTranslation } from 'react-i18next';
+
+ export default function SubmitForm() {
+   const { t } = useTranslation();
+   const router = useRouter();
+
+   const [requiredVisible, setRequiredVisible] = useState(false);
+   const [email, setEmail] = useState('');
+   const [displayName, setDisplayName] = useState('');
+   const [file, setFile] = useState(null);
+   const [isSubmitting, setIsSubmitting] = useState(false);
+   const [submitStatus, setSubmitStatus] = useState(null); // 'success' | 'error'
+   const [errorMessage, setErrorMessage] = useState('');
+   const [submissionId, setSubmissionId] = useState(null);
+
+   const handleFileChange = (e) => {
+     setFile(e.target.files[0]);
+   };
+
+   const submitResults = async () => {
+     if (!email || !displayName || !file) {
+       setRequiredVisible(true);
+       return;
+     }
+     if (!file.name.toLowerCase().endsWith('.zip')) {
+       alert(t('submit_zipAlert'));
+       return;
+     }
+
+     setRequiredVisible(false);
+     setIsSubmitting(true);
+
+     const formData = new FormData();
+     formData.append('email', email);
+     formData.append('display_name', displayName);
+     formData.append('predictions_zip', file);
+
+     try {
+       const res = await fetch(`${BACKEND_ADDRESS}/submit`, {
+         method: "POST",
+         body: formData,
+       });
+       if (!res.ok) {
+         const err = await res.json().catch(() => null);
+         throw new Error(err?.detail || `HTTP ${res.status}`);
+       }
+       const json = await res.json();
+       const id = json.submission_id;
+       setSubmissionId(id);
+       localStorage.setItem('last_result_file', `${id}.json`);
+       localStorage.setItem('just_submitted', 'true');
+       setSubmitStatus('success');
+     } catch (err) {
+       setErrorMessage(err.message);
+       setSubmitStatus('error');
+     } finally {
+       setIsSubmitting(false);
+     }
+   };
+
+   const renderModal = () => {
+     if (submitStatus === 'success') {
+       return (
+         <div className="fixed inset-0 flex items-center justify-center bg-black bg-opacity-50">
+           <div className="bg-white p-6 rounded-2xl shadow-lg max-w-sm text-center">
+             <h3 className="text-xl font-semibold text-green-600">
+               {t('submit_successTitle')}
+             </h3>
+             <p className="mt-2">{t('submit_successMessage')}</p>
+             <BigBlueButton
+               className="mt-4 px-4 py-2 rounded-full shadow hover:shadow-md"
+               onClick={() => router.push(`/results/${submissionId}`)}
+             >
+               {t('submit_checkResults')}
+             </BigBlueButton>
+           </div>
+         </div>
+       );
+     }
+     if (submitStatus === 'error') {
+       return (
+         <div className="fixed inset-0 flex items-center justify-center bg-black bg-opacity-50">
+           <div className="bg-white p-6 rounded-2xl shadow-lg max-w-sm text-center">
+             <h3 className="text-xl font-semibold text-red-600">
+               {t('submit_errorTitle')}
+             </h3>
+             <p className="mt-2">
+               <Trans i18nKey="submit_errorMessage" values={{ errorMessage }}>
+                 Submission error: {{ errorMessage }}
+               </Trans>
+             </p>
+             <button
+               className="mt-4 px-4 py-2 rounded-full shadow hover:shadow-md"
+               onClick={() => setSubmitStatus(null)}
+             >
+               {t('submit_closeButton')}
+             </button>
+           </div>
+         </div>
+       );
+     }
+     return null;
+   };
+
+   return (
+     <div className="relative">
+       <div className="space-y-6 bg-white rounded-xl shadow-md p-6 w-full max-w-xl mx-auto border border-gray-200">
+         <h2 className="text-2xl font-semibold text-gray-800 text-center">
+           {t('submit_formTitle')}
+         </h2>
+
+         <div className="space-y-2">
+           <label htmlFor="email" className="block text-sm font-medium text-gray-700">
+             {t('submit_labelEmail')}
+           </label>
+           <input
+             id="email"
+             type="email"
+             placeholder={t('submit_placeholderEmail')}
+             value={email}
+             onChange={(e) => setEmail(e.target.value)}
+             className="border border-gray-300 p-3 rounded-md w-full focus:ring-2 focus:ring-blue-500"
+           />
+         </div>
+
+         <div className="space-y-2">
+           <label htmlFor="displayname" className="block text-sm font-medium text-gray-700">
+             {t('submit_labelDisplayName')}
+           </label>
+           <input
+             id="displayname"
+             type="text"
+             placeholder={t('submit_placeholderDisplayName')}
+             value={displayName}
+             onChange={(e) => setDisplayName(e.target.value)}
+             className="border border-gray-300 p-3 rounded-md w-full focus:ring-2 focus:ring-blue-500"
+           />
+         </div>
+
+         <div className="space-y-2">
+           <label htmlFor="zipfile" className="block text-sm font-medium text-gray-700">
+             {t('submit_labelZip')}
+           </label>
+           <input
+             id="zipfile"
+             type="file"
+             accept=".zip"
+             onChange={handleFileChange}
+             className="border border-gray-300 p-3 rounded-md w-full focus:ring-2 focus:ring-blue-500"
+           />
+         </div>
+
+         <ErrorMessage condition={requiredVisible}>
+           ⚠️ Email, display name & ZIP are required.
+         </ErrorMessage>
+
+         <button
+           onClick={submitResults}
+           disabled={isSubmitting}
+           className="w-full bg-blue-600 text-white py-3 rounded-xl hover:bg-blue-700 mt-4"
+         >
+           {isSubmitting ? t('submit_submitting') : t('submit_button')}
+         </button>
+
+         {renderModal()}
+       </div>
+     </div>
+   );
+ }
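`submitResults` pins down the backend contract: a multipart POST to `${BACKEND_ADDRESS}/submit` carrying `email`, `display_name`, and `predictions_zip`, answered with JSON containing a `submission_id` (or a `detail` message on failure). A standalone sketch of the same call, with the backend URL as an assumption (the Dockerfile's `INTERNAL_API_PORT` suggests 7861):

```js
// Hypothetical test client for the /submit endpoint; the URL is assumed.
const BACKEND_ADDRESS = 'http://localhost:7861';

async function submitZip(email, displayName, zipFile) {
  const formData = new FormData();
  formData.append('email', email);
  formData.append('display_name', displayName);
  formData.append('predictions_zip', zipFile);

  const res = await fetch(`${BACKEND_ADDRESS}/submit`, { method: 'POST', body: formData });
  if (!res.ok) {
    const err = await res.json().catch(() => null);
    throw new Error(err?.detail || `HTTP ${res.status}`);
  }
  return (await res.json()).submission_id;
}
```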
frontend/src/app/components/UploadButton.js ADDED
@@ -0,0 +1,27 @@
+
+
+ import { useState } from "react";
+ import BigBlueButton from "./BigBlueButton";
+
+ export default function UploadButton({ children, uploaded }) {
+   const [file, setFile] = useState(null);
+
+   function handleFileChange(e) {
+     const selectedFile = e.target.files?.[0];
+     if (selectedFile) {
+       setFile(selectedFile);
+       uploaded(selectedFile);
+     }
+   }
+   return (
+     <div>
+       <label htmlFor="file_upload">{children}</label>
+       <input type="file" id="file_upload" accept=".zip" onChange={handleFileChange}
+         className="bg-gray-500 text-white text-base
+           font-medium rounded-md shadow-sm hover:bg-gray-400 focus:outline-none focus:ring-2 focus:ring-gray-300"></input>
+     </div>
+   );
+
+ }
+
+ const uploadFile = async () => {}
frontend/src/app/components/taskbar.js ADDED
@@ -0,0 +1,64 @@
+ 'use client';
+
+ import '../i18n';
+ import Link from 'next/link';
+ import { usePathname } from 'next/navigation';
+ import { FileText } from 'lucide-react';
+ import { useTranslation } from 'react-i18next';
+
+ export default function Taskbar() {
+   const { t } = useTranslation();
+   const pathname = usePathname();
+
+   const linkStyle = (path) =>
+     pathname === path
+       ? 'text-blue-500 font-semibold'
+       : 'text-gray-700 hover:text-blue-500';
+
+   return (
+     <nav className="w-full py-4 bg-none flex justify-between items-center mx-auto max-w-5xl">
+       <div className="flex items-center">
+         <Link href="/">
+           <span className="text-xl font-bold text-blue-600">{t('nav_home')}</span>
+         </Link>
+
+         <Link href="/papers" className="ml-2">
+           <FileText className="w-6 h-6 text-blue-600 hover:text-blue-500" />
+         </Link>
+       </div>
+
+       <div className="space-x-6">
+         <Link href="/guide" className={linkStyle('/guide')}>
+           {t('nav_guide')}
+         </Link>
+         <Link href="/FAQ" className={linkStyle('/FAQ')}>
+           {t('nav_faq')}
+         </Link>
+         <Link href="/contact" className={linkStyle('/contact')}>
+           {t('nav_contact')}
+         </Link>
+         <Link
+           href={`${pathname}?show=submit`}
+           className={linkStyle('/submit')}
+         >
+           {t('nav_submit')}
+         </Link>
+         <Link href="/benchmarks" className={linkStyle('/benchmarks')}>
+           {t('nav_tasks')}
+         </Link>
+         <Link href="/results" className={linkStyle('/results')}>
+           {t('nav_results')}
+         </Link>
+         <Link href="/leaderboard" className={linkStyle('/leaderboard')}>
+           {t('nav_leaderboard')}
+         </Link>
+         <Link
+           href="https://huggingface.co/datasets/graalul/COLE-public"
+           className={linkStyle('/hf')}
+         >
+           {t('nav_datasets')}
+         </Link>
+       </div>
+     </nav>
+   );
+ }
frontend/src/app/contact/page.js ADDED
@@ -0,0 +1,32 @@
+ 'use client';
+
+ import '../i18n';
+ import { useTranslation } from 'react-i18next';
+
+ export default function Contact() {
+   const { t } = useTranslation();
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('contact_title')}
+       </h2>
+
+       <p className="text-gray-700 mb-4 leading-relaxed">
+         {t('contact_paragraph')}
+       </p>
+
+       <div className="bg-gray-50 p-4 rounded-md border border-dashed border-blue-400">
+         <p className="text-sm text-gray-500 mb-2">
+           {t('contact_email_label')}
+         </p>
+         <a
+           href="mailto:david.beauchemin@ift.ulaval.ca"
+           className="text-blue-600 font-mono text-lg hover:underline"
+         >
+           david.beauchemin@ift.ulaval.ca
+         </a>
+       </div>
+     </div>
+   );
+ }
frontend/src/app/en/translation.json ADDED
@@ -0,0 +1,132 @@
+ {
+   "siteTitle": "COLE",
+   "welcome": "Welcome to COLE!",
+   "upload": "Upload",
+   "submit": "Submit",
+   "results": "Results",
+   "contact": "Contact",
+   "contactUs": "Contact us",
+   "guide": "Guide",
+   "faq": "FAQ",
+   "submitResults": "Submit your results",
+   "ourTasks": "Our tasks",
+   "ourDatasets": "Our datasets",
+   "leaderboard": "COLE Leaderboard",
+   "errorOccurred": "An error occurred",
+   "close": "Close",
+   "details": "Details",
+   "benchmarksIntro": "COLE consists of 23 tasks, each of which aims to test one or more facets of language understanding in machine learning. Each task is described in more detail below.",
+   "metrics": "Metric(s):",
+   "benchmark_alloCine_title": "Allo-ciné.ca",
+   "benchmark_alloCine_description": "Allo-ciné tests language understanding in sentiment classification using movie reviews that can be either positive or negative. The task consists of giving the correct sentiment for each review.",
+   "benchmark_lingnli_title": "LingNLI",
+   "benchmark_lingnli_description": "LingNLI is a Natural Language Inference corpus collected by putting a linguist 'in the loop' to dynamically introduce novel constraints during data collection, aiming to mitigate the systematic gaps and biases often found in crowdsourced datasets.",
+   "benchmark_daccord_title": "DACCORD",
+   "benchmark_daccord_description": "Predict whether the two sentences are compatible (0) or contradict each other (1).",
+   "benchmark_fquad_title": "FQuAD - French Question Answering Dataset",
+   "benchmark_fquad_description": "FQuAD is a set of question/answer pairs built from high-quality Wikipedia articles. The goal of this task is to accurately predict whether the answer to the question can be found in the provided article.",
+   "benchmark_french_boolq_title": "French BoolQ",
+   "benchmark_french_boolq_description": "Answer whether the context allows answering 'yes' to the question (1), or only 'no' or does not answer it (0).",
+   "benchmark_fracas_title": "FraCaS",
+   "benchmark_fracas_description": "Natural language inference task: predict the relation between two sentences (entailment, neutral, contradiction).",
+   "benchmark_gqnli_title": "GQNLI-Fr - The Generalized Quantifier NLI Challenge Dataset",
+   "benchmark_gqnli_description": "The dataset consists of carefully constructed premise-hypothesis pairs. Each hypothesis logically follows from the premise, contradicts it, or is neutral.",
+   "benchmark_mms_title": "MMS - Massive Multilingual Sentiment Corpora",
+   "benchmark_mms_description": "A massive multilingual sentiment analysis corpus in 27 languages.",
+   "benchmark_mnli_nineeleven_fr_mt_title": "MNLI-NineEleven-FR-MT",
+   "benchmark_mnli_nineeleven_fr_mt_description": "Predict the relation between two sentences (entailment, neutral, contradiction).",
+   "benchmark_paws_title": "PAWS: Paraphrase Adversaries from Word Scrambling",
+   "benchmark_paws_description": "This task aims to test paraphrase identification by giving two sentences and having the model determine whether they are equivalent in meaning.",
+   "benchmark_piaf_title": "PIAF - The French-Language Dataset of Questions-Answers",
+   "benchmark_piaf_description": "This task consists of pairs of questions and text answers, with information about where the truly relevant information appears in the answer.",
+   "benchmark_qfrblimp_title": "QFrBLiMP - Quebec-French Linguistic Minimal Pairs",
+   "benchmark_qfrblimp_description": "This task gives the model sentence pairs. The goal is to determine whether the sentences are semantically equivalent, even with slightly different syntax and words.",
+   "benchmark_qfrcola_title": "QFrCoLA - a Quebec-French Corpus of Linguistic Acceptability Judgments",
+   "benchmark_qfrcola_description": "QFrCoLA is a French dataset sourced from multiple linguistic sites such as académie-française.fr and vitrinelinguistique.com. It aims to test models’ ability to determine grammatical correctness. The answer is a binary label indicating if the sentence is correct or not.",
+   "benchmark_qfrcore_title": "QFRCoRE: Quebec-French Corpus of Regional Expressions",
+   "benchmark_qfrcore_description": "Match the Quebec expression with its definition from a list.",
+   "benchmark_qfrcort_title": "QFRCoRT: Quebec-French Corpus of Regional Terms",
+   "benchmark_qfrcort_description": "Match the Quebec term with its definition from a list.",
+   "benchmark_rte3_french_title": "RTE3-French",
+   "benchmark_rte3_french_description": "Predict the relation between two sentences (entailment, neutral, contradiction).",
+   "benchmark_sickfr_title": "Sick-FR - French Sentences Involving Compositional Knowledge",
+   "benchmark_sickfr_description": "This task also provides pairs of sentences annotated along two dimensions: relatedness (scored 1 to 5) and entailment (choices: entails, contradicts, neutral).",
+   "benchmark_sts22_title": "Sts22-Crosslingual - Multilingual News Article Similarity",
+   "benchmark_sts22_description": "This task evaluates whether pairs of news articles, written in different languages, cover the same story. It focuses on document-level similarity, where systems rate article pairs on a 4-point scale from most to least similar.",
+   "benchmark_wino_x_lm_title": "WiNo-X LM - Pronoun Resolution",
+   "benchmark_wino_x_lm_description": "Predict the correct referent (1 or 2) of a pronoun in a sentence by choosing between two candidates.",
+   "benchmark_wino_x_mt_title": "WiNo-X MT - Pronoun Resolution",
+   "benchmark_wino_x_mt_description": "Choose which of two French translations uses the correct pronoun (il/elle) based on the intended referent in the original English sentence.",
+   "benchmark_xnli_title": "XNLI - The Cross-Lingual NLI Corpus",
+   "benchmark_xnli_description": "This task consists of pairs of sentences where the goal is to determine the relation between the two: entailment, neutral, or contradiction.",
+   "benchmark_wsd_title": "WSD-Fr: Word Sense Disambiguation",
+   "benchmark_wsd_description": "WSD-Fr is a word sense disambiguation task where the model must identify the correct meaning of an ambiguous verb in context, as part of the FLUE benchmark.",
+   "benchmark_multiblimp_title": "MultiBLiMP-Fr - Multilingual Linguistic Minimal Pairs",
+   "benchmark_multiblimp_description": "A grammaticality judgment task using the French subset of the Multilingual Benchmark of Linguistic Minimal Pairs. Each instance is a minimal pair (one grammatical and one ungrammatical) differing by a single targeted feature. The model must select the grammatically correct sentence. This task probes fine-grained knowledge of French syntax, morphology, and agreement.",
+   "home_whatIsColleTitle": "What is COLE?",
+   "home_paragraph1": "COLE is a multidisciplinary French Natural Language Understanding benchmark (<1>NLU</1>). It takes inspiration from its predecessors <3>GLUE</3> and <5>SuperGLUE</5> to build a benchmark capable of evaluating models in the French language on multiple topics of language understanding. See <7>our paper</7> for more information.",
+   "home_paragraph2": "The COLE benchmark is built with multiple goals in mind. First, it aims to provide a solid and complete French alternative for benchmarking models on NLU tasks. Second, it provides the user with multiple datasets, all usable through HuggingFace’s libraries, to train or fine-tune models on specific tasks.",
+   "home_paragraph3": "We have made the choice to hide test labels to discourage cheating or overfitting on test data. To get results on your test data, you may send us your results as explained in <1>our guide</1>.",
+   "guide_title": "Using the COLE Benchmark",
+   "guide_section1_title": "Training and Testing",
+   "guide_section1_para1": "The COLE benchmark can be used to train and/or test models on multiple tasks. To train or fine-tune a model, you can fetch the train, validation and test data splits from our <0>Hugging Face public repository</0>. We recommend using Hugging Face’s libraries to simplify the process.",
+   "guide_section1_para2": "To test a model, you also need to fetch the data in the same way. Once done, your model should infer predictions for each line in the test split. Our repository includes benchmark evaluation scripts for each dataset. You only need to plug in your model's inference method using the HuggingFace Model interface. Our inference scripts are available on our <0>GitHub Repository</0>.",
+   "guide_section1_para3": "If you prefer to run inference separately, please ensure that the predictions are formatted correctly before submitting them for evaluation (see our \"Formatting the Dataset\" section).",
+   "guide_section2_title": "Formatting the Dataset",
+   "guide_section2_para1": "Before submitting your results, make sure your output is properly formatted so that our systems can process it. The expected format is a nested JSON dictionary as follows:",
+   "faq_title": "Frequently Asked Questions",
+   "faqs": [
+     {
+       "question": "How can I evaluate my model?",
+       "answer": "You can upload your model outputs in JSON format on the website. The system will automatically evaluate them, and you can view the results in the evaluation interface."
+     },
+     {
+       "question": "Is COLE multilingual?",
+       "answer": "No, COLE is available only in French. The benchmark is specifically designed to evaluate NLU models in the French language."
+     }
+   ],
+   "contact_title": "Contact us",
+   "contact_paragraph": "If you have any questions, feedback, or suggestions regarding the COLE benchmark, feel free to reach out to us. We are happy to help; please note that response times may vary.",
+   "contact_email_label": "Email us at:",
+
+   "submit_formTitle": "Submit Your Results",
+   "submit_labelEmail": "Your Email",
+   "submit_placeholderEmail": "you@example.com",
+   "submit_labelDisplayName": "Display Name",
+   "submit_placeholderDisplayName": "Leaderboard Name",
+   "submit_labelFile": "Predictions ZIP",
+   "submit_labelZip": "Select your results file",
+   "submit_requiredError": "⚠️ Email, display name & ZIP are required.",
+   "submit_zipAlert": "Please upload a ZIP (.zip) file.",
+   "submit_button": "Submit Your Results",
+   "submit_submitting": "Submitting...",
+   "submit_successTitle": "Success",
+   "submit_successMessage": "Your submission has been successfully sent!",
+   "submit_checkResults": "Check the results",
+   "submit_errorTitle": "Error ⚠️",
+   "submit_errorMessage": "Submission error: {{errorMessage}}",
+   "submit_closeButton": "Close",
+   "results_default_title": "No Results Yet",
+   "results_default_message": "Please submit a ZIP file to generate benchmark results.",
+   "results_loading": "⏳ Loading results...",
+   "results_page_title": "📊 Results for {{displayName}}",
+   "results_download": "Download JSON",
+   "results_no_results": "⚠️ No benchmark results found.",
+   "results_benchmark_label": "🧪 Benchmark: {{name}}",
+   "leaderboard_title": "Leaderboard",
+   "leaderboard_modelHeader": "Model Name",
+   "leaderboard_overallHeader": "Overall",
+   "leaderboard_avgScoreLabel": "(avg score)",
+   "leaderboard_notSpecified": "NS",
+   "leaderboard_modalTitle": "Results for {{name}}",
+   "leaderboard_closeButton": "Close",
+   "nav_home": "COLE",
+   "nav_guide": "Guide",
+   "nav_faq": "FAQ",
+   "nav_contact": "Contact us",
+   "nav_submit": "Submit your results",
+   "nav_tasks": "Our tasks",
+   "nav_results": "Results",
+   "nav_leaderboard": "COLE Leaderboard",
+   "nav_datasets": "Our datasets"
+ }
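Two conventions in these strings deserve a note: `{{displayName}}`-style placeholders are i18next interpolation slots filled at call time, and numbered tags such as `<1>…</1>` mark slots that `react-i18next`'s `Trans` component fills from a `components` array indexed by tag number (the guide page later in this commit does this with `<0>`). A minimal interpolation sketch:

```jsx
import { useTranslation } from 'react-i18next';

function ResultsTitle({ displayName }) {
  const { t } = useTranslation();
  // {{displayName}} in results_page_title is replaced at call time:
  // renders "📊 Results for my-model" when displayName is "my-model".
  return <h1>{t('results_page_title', { displayName })}</h1>;
}
```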
frontend/src/app/fr/translation.json ADDED
@@ -0,0 +1,135 @@
+ {
+   "siteTitle": "COLE",
+   "welcome": "Bienvenue sur COLE !",
+   "upload": "Téléverser",
+   "submit": "Soumettre",
+   "results": "Résultats",
+   "contact": "Contact",
+   "contactUs": "Nous contacter",
+   "guide": "Guide",
+   "faq": "FAQ",
+   "submitResults": "Soumettre vos résultats",
+   "ourTasks": "Nos tâches",
+   "ourDatasets": "Nos jeux de données",
+   "leaderboard": "Classement COLE",
+   "errorOccurred": "Une erreur est survenue",
+   "close": "Fermer",
+   "details": "Détails",
+   "benchmarksIntro": "COLE est constitué de 23 tâches, chacune visant à tester une ou plusieurs facettes de la compréhension du langage en apprentissage automatique. Ci-dessous, chaque tâche est décrite en détail.",
+   "metrics": "Métrique(s) :",
+   "benchmark_alloCine_title": "Allo-ciné.ca",
+   "benchmark_alloCine_description": "Allo-ciné teste la compréhension du langage dans la classification des sentiments en fournissant des critiques de films pouvant être positives ou négatives. La tâche consiste à donner le sentiment correct pour chaque critique.",
+   "benchmark_lingnli_title": "LingNLI",
+   "benchmark_lingnli_description": "LingNLI est un corpus d'inférence en langage naturel collecté en faisant appel à un linguiste afin d'introduire de manière dynamique de nouvelles contraintes pendant la collecte des données, dans le but d'atténuer les lacunes et les biais systématiques souvent présents dans les ensembles de données issus du crowdsourcing.",
+   "benchmark_daccord_title": "DACCORD",
+   "benchmark_daccord_description": "Prédisez si les deux phrases sont compatibles (0) ou se contredisent (1).",
+   "benchmark_fquad_title": "FQuAD - Corpus de questions-réponses français",
+   "benchmark_fquad_description": "FQuAD est un ensemble de paires question/réponse construit à partir d’articles Wikipédia de haute qualité. L’objectif est de prédire correctement si la réponse à la question se trouve réellement dans l’article fourni.",
+   "benchmark_french_boolq_title": "French BoolQ",
+   "benchmark_french_boolq_description": "Répondez si le contexte permet de répondre « oui » à la question (1) ou « non »/ne répond pas (0).",
+   "benchmark_fracas_title": "FraCaS",
+   "benchmark_fracas_description": "Tâche d'inférence en langage naturel : prédire la relation entre deux phrases (implication, neutralité, contradiction).",
+   "benchmark_gqnli_title": "GQNLI-Fr - Jeu de données Generalized Quantifier NLI Challenge",
+   "benchmark_gqnli_description": "Le jeu se compose de paires prémisse-hypothèse soigneusement construites. Chaque hypothèse découle logiquement de la prémisse, la contredit ou est neutre.",
+   "benchmark_mms_title": "MMS - Massive Multilingual Sentiment Corpora",
+   "benchmark_mms_description": "Un corpus multilingue massif d'analyse des sentiments en 27 langues.",
+   "benchmark_mnli_nineeleven_fr_mt_title": "MNLI-NineEleven-FR-MT",
+   "benchmark_mnli_nineeleven_fr_mt_description": "Prédisez la relation entre deux phrases (implication, neutre, contradiction).",
+   "benchmark_multiblimp_title": "MultiBLiMP-Fr - Paires minimales linguistiques en français",
+   "benchmark_multiblimp_description": "Une tâche de jugement de grammaticalité utilisant le sous-ensemble français du Multilingual Benchmark of Linguistic Minimal Pairs. Chaque instance est une paire minimale — l’une grammaticale et l’autre agrammaticale — ne différant que par une seule caractéristique ciblée. Le modèle doit sélectionner la phrase grammaticalement correcte. Cette tâche évalue les connaissances fines de la syntaxe, de la morphologie et des accords en français.",
+   "benchmark_paws_title": "PAWS : Paraphrase Adversaries from Word Scrambling",
+   "benchmark_paws_description": "Cette tâche vise à tester l’identification de paraphrases en donnant deux phrases et en demandant au modèle de définir si ces phrases sont équivalentes en sens ou non.",
+   "benchmark_piaf_title": "PIAF - Jeu de questions-réponses en français",
+   "benchmark_piaf_description": "Cette tâche consiste en paires de questions et de réponses textuelles avec l’indication de l’emplacement de l’information réellement pertinente dans la réponse.",
+   "benchmark_qfrblimp_title": "QFrBLiMP - Paires minimales linguistiques québécoises",
+   "benchmark_qfrblimp_description": "Cette tâche présente au modèle des paires de phrases. Le but est de déterminer si les phrases sont sémantiquement équivalentes, même avec une syntaxe et des mots légèrement différents.",
+   "benchmark_qfrcola_title": "QFrCoLA - Corpus québécois de jugements d’acceptabilité linguistique",
+   "benchmark_qfrcola_description": "QFrCoLA est un jeu de données français issu de plusieurs sites linguistiques tels qu’académie-française.fr et vitrinelinguistique.com. Il vise à tester la capacité des modèles à déterminer la correction grammaticale. La réponse est un label binaire indiquant si la phrase est correcte ou non.",
+   "benchmark_qfrcore_title": "QFRCoRE: Quebec-French Corpus of Regional Expressions",
+   "benchmark_qfrcore_description": "Associez l'expression québécoise à sa définition parmi une liste proposée.",
+   "benchmark_qfrcort_title": "QFRCoRT: Quebec-French Corpus of Regional Terms",
+   "benchmark_qfrcort_description": "Associez le terme québécois à sa définition parmi une liste proposée.",
+   "benchmark_rte3_french_title": "RTE3-Français",
+   "benchmark_rte3_french_description": "Prédisez la relation entre deux phrases (implication, neutre, contradiction).",
+   "benchmark_sickfr_title": "Sick-FR - Phrases françaises impliquant des connaissances compositionnelles",
+   "benchmark_sickfr_description": "Cette tâche propose des paires de phrases annotées selon deux dimensions : la similarité (1 à 5) et l’inférence (implique, contredit ou neutre).",
+   "benchmark_sts22_title": "Sts22-Crosslingual - Similarité d’articles d’actualités multilingues",
+   "benchmark_sts22_description": "Cette tâche évalue si des paires d’articles d’actualités, écrits dans différentes langues, couvrent la même histoire. Elle se concentre sur la similarité au niveau du document, où les systèmes notent les paires sur une échelle de 4 points, du plus similaire au moins similaire.",
+   "benchmark_wino_x_lm_title": "WiNo-X LM - Résolution de pronom",
+   "benchmark_wino_x_lm_description": "Prédire le bon référent (1 ou 2) d’un pronom dans une phrase en choisissant parmi deux candidats.",
+   "benchmark_wino_x_mt_title": "WiNo-X MT - Résolution de pronom",
+   "benchmark_wino_x_mt_description": "Choisir laquelle de deux traductions françaises utilise le bon pronom (il/elle) selon le référent correct de la phrase anglaise.",
+   "benchmark_xnli_title": "XNLI - Corpus NLI multilingue",
+   "benchmark_xnli_description": "Cette tâche consiste en paires de phrases où l’objectif est de déterminer la relation entre les deux : implication, neutre ou contradiction.",
+   "benchmark_wsd_title": "WSD-Fr : Désambiguïsation lexicale",
+   "benchmark_wsd_description": "WSD-Fr est une tâche de désambiguïsation lexicale dans laquelle le modèle doit identifier le sens correct d’un verbe ambigu en contexte, dans le cadre du benchmark FLUE.",
+   "home_whatIsColleTitle": "Qu’est-ce que COLE ?",
+   "home_paragraph1": "COLE est un benchmark multidisciplinaire de compréhension du langage naturel en français (<1>NLU</1>). Il s’inspire de ses prédécesseurs <3>GLUE</3> et <5>SuperGLUE</5> pour construire un benchmark capable d’évaluer les modèles en langue française sur plusieurs facettes de la compréhension du langage. Consultez <7>notre article</7> pour plus d’informations.",
+   "home_paragraph2": "Le benchmark COLE poursuit plusieurs objectifs : d’abord fournir une alternative solide et complète en français pour évaluer les modèles sur des tâches NLU, puis offrir à l’utilisateur plusieurs jeux de données, tous utilisables via les bibliothèques HuggingFace, pour entraîner ou affiner des modèles sur des tâches spécifiques.",
+   "home_paragraph3": "Nous avons choisi de masquer les étiquettes de test pour décourager la triche ou le sur-apprentissage sur les données de test. Pour obtenir des résultats sur vos données de test, vous pouvez nous envoyer vos résultats comme expliqué dans <1>notre guide</1>.",
+   "guide_title": "Utilisation du benchmark COLE",
+   "guide_section1_title": "Entraînement et tests",
+   "guide_section1_para1": "Le benchmark COLE peut être utilisé pour entraîner et/ou tester des modèles sur plusieurs tâches. Pour entraîner ou affiner un modèle, vous pouvez récupérer les jeux de données train, validation et test depuis notre <0>dépôt public Hugging Face</0>. Nous recommandons d’utiliser les bibliothèques Hugging Face pour simplifier le processus.",
+   "guide_section1_para2": "Pour tester un modèle, vous devez également récupérer les données de la même façon. Une fois fait, votre modèle doit inférer les prédictions pour chaque ligne de la partition de test. Notre dépôt inclut des scripts d’évaluation pour chaque dataset. Il vous suffit de connecter la méthode d’inférence de votre modèle via l’interface HuggingFace. Nos scripts d’inférence sont disponibles sur notre <0>dépôt GitHub</0>.",
+   "guide_section1_para3": "Si vous préférez lancer l’inférence séparément, assurez-vous que les prédictions sont correctement formatées avant de les soumettre pour évaluation (voir notre section « Formatting the Dataset »).",
+   "guide_section2_title": "Formatage du jeu de données",
+   "guide_section2_para1": "Avant de soumettre vos résultats, assurez-vous que votre sortie est correctement formatée afin que nos systèmes puissent la traiter. Le format attendu est un dictionnaire JSON imbriqué comme suit :",
+   "faq_title": "Foire aux questions",
+   "faqs": [
+     {
+       "question": "Comment évaluer mon modèle ?",
+       "answer": "Vous pouvez téléverser les sorties de votre modèle au format JSON sur le site. Le système les évaluera automatiquement, et vous pourrez consulter les résultats dans l’interface d’évaluation."
+     },
+     {
+       "question": "COLE est-il multilingue ?",
+       "answer": "Non, COLE est disponible uniquement en français. Le benchmark est spécifiquement conçu pour évaluer les modèles en compréhension de la langue française (NLU)."
+     }
+   ],
+   "contact_title": "Nous contacter",
+   "contact_paragraph": "Si vous avez des questions, des commentaires ou des suggestions concernant le benchmark COLE, n’hésitez pas à nous contacter. Nous serons ravis de vous aider ; veuillez noter que les délais de réponse peuvent varier.",
+   "contact_email_label": "Envoyez-nous un email à :",
+   "submit_formTitle": "Soumettre vos résultats",
+   "submit_labelEmail": "Votre email",
+   "submit_placeholderEmail": "vous@exemple.com",
+   "submit_labelDisplayName": "Nom affiché",
+   "submit_placeholderDisplayName": "Nom au classement",
+   "submit_labelFile": "Fichier ZIP de prédictions",
+   "submit_labelZip": "Sélectionnez votre fichier de résultats",
+   "submit_requiredError": "⚠️ Email, nom affiché et ZIP sont requis.",
+   "submit_zipAlert": "Veuillez téléverser un fichier ZIP (.zip).",
+   "submit_button": "Soumettre vos résultats",
+   "submit_submitting": "Envoi en cours...",
+   "submit_successTitle": "Succès",
+   "submit_successMessage": "Votre soumission a été envoyée avec succès !",
+   "submit_checkResults": "Voir les résultats",
+   "submit_errorTitle": "Erreur ⚠️",
+   "submit_errorMessage": "Erreur de soumission : {{errorMessage}}",
+   "submit_closeButton": "Fermer",
+   "results_default_title": "Pas encore de résultats",
+   "results_default_message": "Veuillez soumettre un fichier ZIP pour générer les résultats du benchmark.",
+   "results_loading": "⏳ Chargement des résultats...",
+   "results_page_title": "📊 Résultats pour {{displayName}}",
+   "results_download": "Télécharger le JSON",
+   "results_no_results": "⚠️ Aucun résultat de benchmark trouvé.",
+   "results_benchmark_label": "🧪 Benchmark : {{name}}",
+   "leaderboard_title": "Classement",
+   "leaderboard_modelHeader": "Nom du modèle",
+   "leaderboard_overallHeader": "Global",
+   "leaderboard_avgScoreLabel": "(score moyen)",
+   "leaderboard_notSpecified": "NS",
+   "leaderboard_modalTitle": "Résultats pour {{name}}",
+   "leaderboard_closeButton": "Fermer",
+   "nav_home": "COLE",
+   "nav_guide": "Guide",
+   "nav_faq": "FAQ",
+   "nav_contact": "Nous contacter",
+   "nav_submit": "Soumettre vos résultats",
+   "nav_tasks": "Nos tâches",
+   "nav_results": "Résultats",
+   "nav_leaderboard": "Classement COLE",
+   "nav_datasets": "Nos données"
+ }
+
+
+
+
frontend/src/app/globals.css ADDED
@@ -0,0 +1,26 @@
+ @import "tailwindcss";
+
+ :root {
+   --background: #ffffff;
+   --foreground: #6526ae;
+ }
+
+ @theme inline {
+   --color-background: var(--background);
+   --color-foreground: var(--foreground);
+   --font-sans: var(--font-geist-sans);
+   --font-mono: var(--font-geist-mono);
+ }
+
+ @media (prefers-color-scheme: dark) {
+   :root {
+     --background: #ffffff;
+     --foreground: #ededed;
+   }
+ }
+
+ body {
+   background: var(--background);
+   color: var(--foreground);
+   font-family: Arial, Helvetica, sans-serif;
+ }
frontend/src/app/guide/page.js ADDED
@@ -0,0 +1,76 @@
+ 'use client';
+
+ import '../i18n';
+ import {useTranslation, Trans} from 'react-i18next';
+ import CodeBlock from '../components/CodeBlock';
+
+ export default function Guide() {
+   const {t} = useTranslation();
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('guide_title')}
+       </h2>
+
+       <div className="space-y-8">
+         {/* SECTION TRAINING & TESTING */}
+         <div className="p-6 bg-white border border-gray-200 rounded-lg shadow-sm hover:shadow transition">
+           <h3 className="text-2xl font-semibold text-gray-900 mb-4 border-l-4 border-blue-600 pl-4">
+             {t('guide_section1_title')}
+           </h3>
+
+           <p className="text-gray-700">
+             <Trans
+               i18nKey="guide_section1_para1"
+               components={[
+                 <a
+                   key="hf-link"
+                   href="https://huggingface.co/datasets/graalul/COLE-public"
+                   target="_blank"
+                   rel="noopener noreferrer"
+                   className="text-blue-600 underline hover:text-blue-800"
+                 />,
+               ]}
+             />
+           </p>
+
+           <p className="text-gray-700 mt-4">
+             <Trans
+               i18nKey="guide_section1_para2"
+               components={[
+                 <a
+                   key="github-ref"
+                   href="https://github.com/GRAAL-Research/COLE"
+                   target="_blank"
+                   rel="noopener noreferrer"
+                   className="text-blue-600 underline hover:text-blue-800"
+                 >
+                   GitHub Repository
+                 </a>,
+               ]}
+             />
+           </p>
+
+           <p className="text-gray-700 mt-4">
+             <Trans i18nKey="guide_section1_para3" />
+           </p>
+
+           {/* SECTION FORMATTING */}
+           <h3 className="text-2xl font-semibold text-gray-900 mb-4 border-l-4 border-blue-600 pl-4">
+             {t('guide_section2_title')}
+           </h3>
+           <p className="text-gray-700 mb-4">
+             {t('guide_section2_para1')}
+           </p>
+
+           <CodeBlock>{`{
+   "model_name": "a_model_name",
+   "model_url": "a_model_url",
+   "tasks": [
+     {
+       "qfrcola": { "predictions": [1,1,1,1,1] }
+     },
+     {
+       "allocine": { "predictions": [1,1,1,1,1] }
+     }
+   ]
+ }`}</CodeBlock>
+         </div>
+       </div>
+     </div>
+   );
+ }
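The predictions.json format shown in the guide above is plain JSON, so any evaluation script can produce it. A minimal Python sketch that builds a submission in that format and packs it into the ZIP the backend expects; the model name, URL, and prediction values are placeholders:

import json
import zipfile

# Placeholder predictions; in practice these come from your own evaluation run.
submission = {
    "model_name": "a_model_name",
    "model_url": "a_model_url",
    "tasks": [
        {"qfrcola": {"predictions": [1, 0, 1, 1, 0]}},
        {"allocine": {"predictions": [0, 1, 1, 0, 1]}},
    ],
}

# The backend reads a file named exactly "predictions.json" from the ZIP.
with zipfile.ZipFile("submission.zip", "w") as zf:
    zf.writestr("predictions.json", json.dumps(submission, ensure_ascii=False))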
frontend/src/app/i18n.js ADDED
@@ -0,0 +1,28 @@
+ import i18n from 'i18next';
+ import { initReactI18next } from 'react-i18next';
+ import LanguageDetector from 'i18next-browser-languagedetector';
+
+ import en from './en/translation.json';
+ import fr from './fr/translation.json';
+
+ i18n
+   .use(LanguageDetector)
+   .use(initReactI18next)
+   .init({
+     resources: {
+       en: { translation: en },
+       fr: { translation: fr },
+     },
+     lng: 'en',
+     fallbackLng: 'en',
+     interpolation: {
+       escapeValue: false,
+     },
+     detection: {
+       order: ['localStorage', 'navigator'],
+       caches: ['localStorage'],
+     },
+   });
+
+ export default i18n;
frontend/src/app/layout.js ADDED
@@ -0,0 +1,37 @@
+ import { Geist, Geist_Mono } from "next/font/google";
+ import "./globals.css";
+
+ import ClientHeader from "./components/ClientHeader";
+ import ModalManager from "./components/ModalManager";
+ import { Suspense } from "react";
+
+ const geistSans = Geist({
+   variable: "--font-geist-sans",
+   subsets: ["latin"],
+ });
+
+ const geistMono = Geist_Mono({
+   variable: "--font-geist-mono",
+   subsets: ["latin"],
+ });
+
+ export const metadata = {
+   title: "COLE NLU",
+   description: "COLE: An NLU benchmark",
+ };
+
+ export default function RootLayout({ children }) {
+   return (
+     <html lang="en">
+       <body className={`${geistSans.variable} ${geistMono.variable} antialiased`}>
+         <ClientHeader />
+         <main className="w-full flex justify-center px-4 pt-8">
+           <div className="w-full max-w-3xl">{children}</div>
+         </main>
+         <Suspense fallback={null}>
+           <ModalManager />
+         </Suspense>
+       </body>
+     </html>
+   );
+ }
frontend/src/app/leaderboard/page.js ADDED
@@ -0,0 +1,272 @@
+ 'use client';
+
+ import React, { useEffect, useState } from "react";
+ import {
+   normalizeBenchmarkName,
+   computeAverageScore,
+ } from "./util";
+ import { useTranslation } from "react-i18next";
+ import { BACKEND_ADDRESS } from "@/app/resources/ResourcesPaths";
+
+ const allowedMetrics = [
+   'acc',
+   'accuracy',
+   'f1',
+   'pearson',
+   'pearsonr',
+   'spearman',
+   'fquad',
+   'exact_match',
+ ];
+
+ export default function LeaderboardPage() {
+   const { t } = useTranslation();
+   const [entries, setEntries] = useState([]);
+   const [benchmarks, setBenchmarks] = useState([]);
+   const [sortCol, setSortCol] = useState('overall');
+   const [sortOrder, setSortOrder] = useState('desc');
+   const [selectedEntry, setSelectedEntry] = useState(null);
+
+   const headerLabels = {
+     model: t('leaderboard_modelHeader'),
+     overall: t('leaderboard_overallHeader'),
+   };
+
+   useEffect(() => {
+     fetch(`${BACKEND_ADDRESS}/leaderboard`)
+       .then((res) => {
+         if (!res.ok) throw new Error(`HTTP ${res.status}`);
+         return res.json();
+       })
+       .then((data) => {
+         const withOverall = data.map((e) => ({
+           ...e,
+           averageScore: computeAverageScore(e),
+         }));
+         setEntries(withOverall);
+
+         const allBench = new Set();
+         withOverall.forEach((entry) => {
+           Object.keys(entry.results || {}).forEach((raw) => {
+             allBench.add(normalizeBenchmarkName(raw));
+           });
+         });
+         setBenchmarks(Array.from(allBench));
+       })
+       .catch((err) => console.error('Failed to load leaderboard:', err));
+   }, []);
+
+   const getCellValue = (entry, col) => {
+     if (col === 'model') return entry.display_name;
+     if (col === 'overall') return entry.averageScore ?? null;
+
+     const pair = Object.entries(entry.results || {}).find(
+       ([rawName]) => normalizeBenchmarkName(rawName) === col
+     );
+     if (!pair) return null;
+
+     const rawValues = [];
+     Object.values(pair[1]).forEach((metricGroup) => {
+       if (metricGroup && typeof metricGroup === 'object') {
+         Object.entries(metricGroup).forEach(([metricName, metricValue]) => {
+           if (
+             !metricName.includes('_warning') &&
+             typeof metricValue === 'number' &&
+             allowedMetrics.includes(metricName.toLowerCase())
+           ) {
+             rawValues.push(metricValue);
+           }
+         });
+       }
+     });
+
+     if (rawValues.length === 0) return null;
+     const normalized = rawValues.map((v) => (v > 1 ? v / 100 : v));
+     return normalized.reduce((a, b) => a + b, 0) / normalized.length;
+   };
+
+   const sorted = [...entries].sort((a, b) => {
+     const va = getCellValue(a, sortCol);
+     const vb = getCellValue(b, sortCol);
+     if (sortCol === 'model') {
+       if (va == null) return 1;
+       if (vb == null) return -1;
+       return sortOrder === 'asc'
+         ? va.localeCompare(vb)
+         : vb.localeCompare(va);
+     }
+     const na = va ?? -Infinity;
+     const nb = vb ?? -Infinity;
+     return sortOrder === 'asc' ? na - nb : nb - na;
+   });
+
+   const handleSort = (col) => {
+     if (sortCol === col) {
+       setSortOrder(sortOrder === 'asc' ? 'desc' : 'asc');
+     } else {
+       setSortCol(col);
+       setSortOrder('desc');
+     }
+   };
+
+   const renderHeader = (col) => {
+     const baseLabel = headerLabels[col] ?? col;
+     const arrow = sortCol === col ? (sortOrder === 'asc' ? ' ▲' : ' ▼') : '';
+
+     if (col === 'overall') {
+       return (
+         <div>
+           <div onClick={() => handleSort(col)} className="cursor-pointer">
+             {baseLabel}
+             {arrow}
+           </div>
+           <div className="text-xs text-gray-600 text-center">
+             {t('leaderboard_avgScoreLabel')}
+           </div>
+         </div>
+       );
+     }
+
+     if (col === 'model') {
+       return (
+         <div onClick={() => handleSort(col)} className="cursor-pointer">
+           {baseLabel}
+           {arrow}
+         </div>
+       );
+     }
+
+     let metricText = '';
+     const sample = entries[0];
+     if (sample && sample.results) {
+       const p = Object.entries(sample.results).find(
+         ([raw]) => normalizeBenchmarkName(raw) === col
+       );
+       if (p) {
+         const grp = Object.values(p[1])[0];
+         if (grp) {
+           const metrics = Object.keys(grp)
+             .filter((m) => allowedMetrics.includes(m.toLowerCase()));
+           if (metrics.length > 0) {
+             metricText = ` (${metrics.join(', ')})`;
+           }
+         }
+       }
+     }
+
+     return (
+       <div onClick={() => handleSort(col)} className="cursor-pointer">
+         {baseLabel}
+         {arrow}
+         {metricText}
+       </div>
+     );
+   };
+
+   return (
+     <div className="space-y-8">
+       <h3 className="text-2xl font-semibold text-gray-900 mb-4 border-l-4 border-blue-600 pl-4">
+         {t('leaderboard_title')}
+       </h3>
+       <div className="overflow-auto">
+         <table className="min-w-full border-collapse">
+           <thead>
+             <tr>
+               {['model', 'overall', ...benchmarks].map((b) => (
+                 <th
+                   key={b}
+                   className="border border-gray-300 px-2 py-1 bg-blue-100 text-left text-sm font-semibold text-blue-700"
+                 >
+                   {renderHeader(b)}
+                 </th>
+               ))}
+             </tr>
+           </thead>
+           <tbody>
+             {sorted.map((entry) => (
+               <tr
+                 key={entry.submission_id}
+                 className="bg-white hover:bg-gray-50 cursor-pointer"
+                 onClick={() => setSelectedEntry(entry)}
+               >
+                 <td className="border border-gray-300 px-2 py-1 font-medium text-blue-600">
+                   {entry.display_name}
+                 </td>
+                 <td className="border border-gray-300 px-2 py-1 text-center">
+                   {entry.averageScore == null
+                     ? t('leaderboard_notSpecified')
+                     : (entry.averageScore * 100).toFixed(1) + '%'}
+                 </td>
+                 {benchmarks.map((b) => {
+                   const val = getCellValue(entry, b);
+                   return (
+                     <td
+                       key={b}
+                       className="border border-gray-200 px-2 py-1 text-center text-purple-700"
+                     >
+                       {val == null
+                         ? t('leaderboard_notSpecified')
+                         : (val * 100).toFixed(1) + '%'}
+                     </td>
+                   );
+                 })}
+               </tr>
+             ))}
+           </tbody>
+         </table>
+       </div>
+
+       {selectedEntry && (
+         <div className="fixed inset-0 flex items-center justify-center bg-black bg-opacity-50">
+           <div className="bg-white p-6 rounded-2xl shadow-lg max-w-2xl w-full mx-4 max-h-[80vh] overflow-y-auto">
+             <h3 className="text-xl font-semibold text-gray-800 mb-4">
+               {t('leaderboard_modalTitle', {
+                 name: selectedEntry.display_name,
+               })}
+             </h3>
+             {Object.entries(selectedEntry.results || {}).map(
+               ([taskKey, metricsObj]) => {
+                 const prettyName = taskKey.split('|')[1] || taskKey;
+                 const [metricType, values] = Object.entries(metricsObj)[0];
+                 return (
+                   <div key={taskKey} className="mb-4">
+                     <h4 className="font-medium text-blue-700">
+                       {prettyName}
+                     </h4>
+                     <ul className="list-disc list-inside text-gray-700">
+                       {Object.entries(values)
+                         .filter(([k]) => !k.endsWith('_warning'))
+                         .map(([metricKey, value]) => (
+                           <li key={metricKey}>
+                             <strong>{metricKey.replace(/_/g, ' ')}</strong>:{' '}
+                             {typeof value === 'number'
+                               ? (value > 1
+                                   ? value.toFixed(1) + '%'
+                                   : (value * 100).toFixed(1) + '%')
+                               : value}
+                           </li>
+                         ))}
+                     </ul>
+                     {values[`${metricType}_warning`] && (
+                       <p className="text-sm text-yellow-700 mt-2">
+                         ⚠️ {values[`${metricType}_warning`]}
+                       </p>
+                     )}
+                   </div>
+                 );
+               }
+             )}
+             <button
+               className="mt-4 px-4 py-2 bg-gray-200 rounded-full hover:bg-gray-300"
+               onClick={() => setSelectedEntry(null)}
+             >
+               {t('leaderboard_closeButton')}
+             </button>
+           </div>
+         </div>
+       )}
+     </div>
+   );
+ }
frontend/src/app/leaderboard/util.js ADDED
@@ -0,0 +1,47 @@
+ export const normalizeBenchmarkName = (name) => {
+   const parts = name.toLowerCase().split("|");
+   if (parts.length >= 2) return parts[1].replace(/-/g, "_");
+   return name.toLowerCase();
+ };
+
+ export const computeAverageScore = (entry) => {
+   const allowedMetrics = [
+     "acc",
+     "accuracy",
+     "f1",
+     "exact_match",
+     "fquad",
+     "pearson",
+     "pearsonr",
+     "spearman",
+   ];
+
+   const perTaskAverages = [];
+
+   Object.values(entry.results || {}).forEach((taskData) => {
+     if (taskData && typeof taskData === "object") {
+       Object.values(taskData).forEach((metricGroup) => {
+         if (metricGroup && typeof metricGroup === "object") {
+           const taskMetrics = Object.entries(metricGroup)
+             .filter(([metric]) => allowedMetrics.includes(metric.toLowerCase()))
+             .map(([, value]) =>
+               typeof value === "number" ? value : null
+             )
+             .filter((v) => v !== null);
+
+           if (taskMetrics.length > 0) {
+             const normalized = taskMetrics.map((v) => (v > 1 ? v / 100 : v));
+             const taskAvg = normalized.reduce((a, b) => a + b, 0) / normalized.length;
+             perTaskAverages.push(taskAvg);
+           }
+         }
+       });
+     }
+   });
+
+   if (perTaskAverages.length === 0) return null;
+
+   return perTaskAverages.reduce((a, b) => a + b, 0) / perTaskAverages.length;
+ };
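Since computeAverageScore accepts metrics reported either as fractions or as percentages, it can help to re-derive a leaderboard number offline. A minimal Python sketch mirroring the same logic (values above 1 are treated as percentages); the sample entry is hypothetical:

ALLOWED_METRICS = {"acc", "accuracy", "f1", "exact_match", "fquad",
                   "pearson", "pearsonr", "spearman"}


def average_score(entry):
    """Mirror of computeAverageScore from util.js."""
    per_task = []
    for task_data in entry.get("results", {}).values():
        for metric_group in task_data.values():
            values = [v for k, v in metric_group.items()
                      if k.lower() in ALLOWED_METRICS and isinstance(v, (int, float))]
            if values:
                normalized = [v / 100 if v > 1 else v for v in values]  # % -> [0, 1]
                per_task.append(sum(normalized) / len(normalized))
    return sum(per_task) / len(per_task) if per_task else None


# Hypothetical entry shaped like one item of the /leaderboard response.
entry = {"results": {"qfrcola": {"accuracy": {"accuracy": 81.3}},
                     "allocine": {"accuracy": {"accuracy": 0.92}}}}
print(average_score(entry))  # (0.813 + 0.92) / 2 = 0.8665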
frontend/src/app/page.js ADDED
@@ -0,0 +1,74 @@
+ 'use client';
+
+ import Link from "next/link";
+ import { Trans, useTranslation } from 'react-i18next';
+
+ export default function Home() {
+   const { t } = useTranslation();
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('home_whatIsColleTitle')}
+       </h2>
+
+       <p className="text-gray-700 mb-4 leading-relaxed space-y-4">
+         <Trans i18nKey="home_paragraph1">
+           COLE is a multidisciplinary French Natural Language Understanding benchmark (
+           <a
+             href="https://en.wikipedia.org/wiki/Natural_language_understanding"
+             target="_blank"
+             rel="noopener noreferrer"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             NLU
+           </a>
+           ). It takes inspiration from its predecessors&nbsp;
+           <a
+             href="https://gluebenchmark.com/"
+             target="_blank"
+             rel="noopener noreferrer"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             GLUE
+           </a>
+           &nbsp;and&nbsp;
+           <a
+             href="https://super.gluebenchmark.com/"
+             target="_blank"
+             rel="noopener noreferrer"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             SuperGLUE
+           </a>
+           &nbsp;to build a benchmark capable of evaluating models in the French language on multiple topics of language understanding. See&nbsp;
+           <Link
+             href="https://arxiv.org/abs/2510.05046"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             our paper
+           </Link>
+           &nbsp;for more information.
+         </Trans>
+       </p>
+
+       <p className="text-gray-700 leading-relaxed">
+         {t('home_paragraph2')}
+       </p>
+
+       <p className="text-gray-700 leading-relaxed mt-4">
+         <Trans i18nKey="home_paragraph3">
+           We have made the choice to hide test labels to discourage cheating or overfitting on test data. To get results on your test data, you may send us your results as explained in&nbsp;
+           <Link
+             href="/guide"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             our guide
+           </Link>
+           .
+         </Trans>
+       </p>
+     </div>
+   );
+ }
frontend/src/app/papers/page.js ADDED
@@ -0,0 +1,31 @@
+ 'use client';
+
+ import React, { useState } from 'react';
+
+ export default function PapersPage() {
+   const [loaded, setLoaded] = useState(false);
+
+   return (
+     <div className="relative h-screen">
+       {!loaded && (
+         <div className="absolute inset-0 flex items-center justify-center bg-white z-10">
+           <div className="animate-spin h-12 w-12 border-4 border-blue-600 border-t-transparent rounded-full" />
+         </div>
+       )}
+       <div className="max-w-3xl mx-auto px-6 py-3">
+         <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+           Our papers
+         </h2>
+       </div>
+
+       <iframe
+         onLoad={() => setLoaded(true)}
+         src="cole.pdf"
+         title="COLE paper"
+         width="100%"
+         height="100%"
+         className="border-none"
+       />
+     </div>
+   );
+ }
frontend/src/app/resources/BenchmarksResource.js ADDED
@@ -0,0 +1,35 @@
+ // Note: the original string had a leading space ("  http://..."), which broke the URL.
+ const BASE_PATH = "http://127.0.0.1:8000";
+
+ const send_results = async (email, labels) => {
+   const path = `${BASE_PATH}/submit`;
+   if (!email || !labels) {
+     alert("email and results must be present");
+     return;
+   }
+   const formData = new FormData();
+   formData.append("email", email);
+   formData.append("labels", labels);
+   console.log(email);
+   console.log(labels);
+   try {
+     const response = await fetch(path, {
+       method: "POST",
+       body: formData,
+     });
+
+     if (!response.ok) {
+       console.log(response);
+       throw new Error("Failed to submit");
+     }
+     const result = await response.json();
+     console.log("Server response:", result);
+     alert("Submission successful!");
+   } catch (err) {
+     console.error("Error submitting results:", err);
+     alert("There was a problem with the submission.");
+   }
+ };
+
+ export { send_results };
frontend/src/app/resources/ResourcesPaths.js ADDED
@@ -0,0 +1,2 @@
+ const BACKEND_ADDRESS = "/api";
+ export { BACKEND_ADDRESS };
frontend/src/app/results/[id]/page.js ADDED
@@ -0,0 +1,128 @@
+ 'use client';
+
+ import '../../i18n';
+ import { useTranslation } from 'react-i18next';
+ import React, { useEffect, useState } from 'react';
+ import { useParams } from 'next/navigation';
+ import { BACKEND_ADDRESS } from '@/app/resources/ResourcesPaths';
+
+ export default function ResultsPage() {
+   const { t } = useTranslation();
+   const { id: submissionId } = useParams();
+   const [data, setData] = useState(null);
+
+   // Metric display names are fixed in English.
+   const metricLabel = {
+     accuracy: 'Accuracy',
+     exact_match: 'Exact Match',
+     f1: 'F1 Score',
+     pearsonr: 'Pearson Correlation',
+   };
+   const getReadableMetricName = (metricKey) =>
+     metricLabel[metricKey] ||
+     metricKey.replace(/_/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
+
+   useEffect(() => {
+     // Use the shared backend address; the original hardcoded localhost here
+     // while handleDownload referenced BACKEND_ADDRESS without importing it.
+     fetch(`${BACKEND_ADDRESS}/results/${submissionId}.json`)
+       .then((res) => {
+         if (!res.ok) throw new Error(`HTTP ${res.status}`);
+         return res.json();
+       })
+       .then(setData)
+       .catch(() => setData({ error: true }));
+   }, [submissionId]);
+
+   const handleDownload = async () => {
+     if (!data) return;
+     try {
+       const res = await fetch(`${BACKEND_ADDRESS}/results/${submissionId}.json`);
+       if (!res.ok) throw new Error(`HTTP ${res.status}`);
+       const blob = await res.blob();
+       const url = URL.createObjectURL(blob);
+       const link = document.createElement('a');
+       link.href = url;
+       link.download = `${submissionId}.json`;
+       document.body.appendChild(link);
+       link.click();
+       document.body.removeChild(link);
+       URL.revokeObjectURL(url);
+     } catch {
+       console.error('Download failed');
+     }
+   };
+
+   if (!data) {
+     return (
+       <main className="max-w-3xl mx-auto px-6 py-6 text-center">
+         <p className="text-gray-600">{t('results_loading')}</p>
+       </main>
+     );
+   }
+
+   const tasksArray = data.tasks || [];
+   const displayName = data.display_name || data.config_general?.display_name;
+
+   return (
+     <main className="max-w-3xl mx-auto px-6 py-6">
+       <h2 className="text-2xl font-bold text-center mb-4">
+         {t('results_page_title', { displayName })}
+       </h2>
+
+       <div className="flex justify-center mb-6">
+         <button
+           onClick={handleDownload}
+           className="px-4 py-2 bg-blue-600 text-white rounded-lg shadow hover:bg-blue-700 transition"
+         >
+           {t('results_download')}
+         </button>
+       </div>
+
+       {tasksArray.length === 0 ? (
+         <p className="text-blue-700 text-center">
+           {t('results_no_results')}
+         </p>
+       ) : (
+         <div className="space-y-6">
+           {tasksArray.map((taskObj) => {
+             const [taskName, metricsObj] = Object.entries(taskObj)[0];
+             const [metricType, metricValues] = Object.entries(metricsObj)[0];
+             const prettyName = taskName.split('|')[1] || taskName;
+             const warningKey = `${metricType}_warning`;
+
+             return (
+               <div
+                 key={taskName}
+                 className="p-5 border border-purple-400 rounded-xl shadow-md bg-white"
+               >
+                 <h3 className="text-xl font-semibold text-blue-700 mb-3">
+                   {t('results_benchmark_label', { name: prettyName })}
+                 </h3>
+                 <ul className="list-disc ml-6 text-gray-700">
+                   {Object.entries(metricValues)
+                     .filter(([k]) => !k.endsWith('_warning'))
+                     .map(([metricKey, value]) => (
+                       <li key={metricKey}>
+                         <strong>{getReadableMetricName(metricKey)}</strong>:{' '}
+                         {typeof value === 'number' ? (
+                           (metricKey === 'exact_match' || metricKey === 'f1'
+                             ? value
+                             : value * 100
+                           ).toFixed(1) + '%'
+                         ) : (
+                           value
+                         )}
+                       </li>
+                     ))}
+                 </ul>
+                 {metricValues[warningKey] && (
+                   <p className="text-sm text-yellow-700 mt-2">
+                     ⚠️ {metricValues[warningKey]}
+                   </p>
+                 )}
+               </div>
+             );
+           })}
+         </div>
+       )}
+     </main>
+   );
+ }
frontend/src/app/results/page.js ADDED
@@ -0,0 +1,31 @@
+ 'use client';
+
+ import '../i18n';
+ import { useEffect } from 'react';
+ import { useRouter } from 'next/navigation';
+ import { useTranslation } from 'react-i18next';
+
+ export default function ResultsDefaultPage() {
+   const router = useRouter();
+   const { t } = useTranslation();
+
+   useEffect(() => {
+     const justSubmitted = localStorage.getItem('just_submitted');
+     const savedFile = localStorage.getItem('last_result_file');
+
+     if (justSubmitted && savedFile) {
+       const id = savedFile.replace('.json', '');
+       localStorage.removeItem('just_submitted');
+       router.push(`/results/${id}`);
+     }
+   }, [router]);
+
+   return (
+     <main className="max-w-2xl mx-auto px-6 py-12 text-center">
+       <h1 className="text-3xl font-bold text-blue-700 mb-4">
+         {t('results_default_title')}
+       </h1>
+       <p className="text-gray-700">{t('results_default_message')}</p>
+     </main>
+   );
+ }
src/__init__.py ADDED
@@ -0,0 +1,3 @@
+ REPO_ID = "COLE-Graal/COLEGraal"
+ WANDB_PROJECT = "COLE-final"
+ NA_VALUE = -1
src/backend/__init__.py ADDED
File without changes
src/backend/evaluation.py ADDED
@@ -0,0 +1,36 @@
+ import copy
+ import operator
+ from functools import reduce
+ from typing import List, Dict
+
+ from src.task.task_factory import Task
+
+
+ def compute_tasks_ratings(tasks: List[Task], submission: Dict) -> Dict:
+     """
+     Compute the ratings of each task in a submission.
+     :param tasks: List of tasks.
+     :param submission: Submission dictionary.
+     """
+
+     # We merge the task dictionaries into one for simpler handling.
+     submission_copy = copy.deepcopy(submission)
+     submission_response = reduce(operator.ior, submission_copy.get("tasks"), {})
+
+     for task in tasks:
+         task_name = task.task_name
+
+         # We remove the predictions since we do not keep them in the response.
+         predictions = submission_response.get(task_name).pop("predictions")
+
+         ratings, warning = task.compute(predictions=predictions)
+         ratings.update({f"{task.metric_name}_warning": warning})
+         submission_response.get(task_name).update({f"{task.metric_name}": ratings})
+
+     # Final submission response, where we unwrap the merged tasks dictionary back into a list of dictionaries.
+     submission_response = {
+         "model_name": submission.get("model_name"),
+         "model_url": submission.get("model_url"),
+         "tasks": [{key: value} for key, value in submission_response.items()],
+     }
+     return submission_response
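The reduce(operator.ior, ...) call above merges the list of single-key task dictionaries into one flat dictionary before scoring, and the last step unwraps it back into a list. A minimal sketch of just that reshaping, with placeholder task data:

import operator
from functools import reduce

tasks = [{"qfrcola": {"predictions": [1, 0]}},
         {"allocine": {"predictions": [0, 1]}}]

# Merge via dict union (the in-place OR operator), as in compute_tasks_ratings.
merged = reduce(operator.ior, tasks, {})
# {'qfrcola': {'predictions': [1, 0]}, 'allocine': {'predictions': [0, 1]}}

# Unwrap back into a list of single-key dicts, as done for the final response.
unwrapped = [{key: value} for key, value in merged.items()]
assert unwrapped == tasks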
src/backend/results/leaderboard.json ADDED
The diff for this file is too large to render. See raw diff
 
src/backend/submission_api.py ADDED
@@ -0,0 +1,224 @@
+ import glob
+ import json
+ import logging
+ import os
+ import sys
+ import uuid
+ from contextlib import asynccontextmanager
+ from datetime import datetime
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Dict, List, Any, Union
+
+ import huggingface_hub
+ from fastapi import FastAPI, UploadFile, Form, File
+ from fastapi.responses import JSONResponse
+ from fastapi.staticfiles import StaticFiles
+
+ from starlette.middleware.cors import CORSMiddleware
+
+ from src.backend.evaluation import compute_tasks_ratings
+ from src.backend.submit_tools import unzip_predictions_from_zip
+ from src.dataset.datasets_data import preload_all_datasets
+ from src.backend.validation_tools import (
+     validate_submission_tasks_name,
+     validate_submission_json,
+     validate_submission_template,
+ )
+ from src.task.task import Task
+ from src.task.task_factory import (
+     tasks_factory,
+ )
+
+
+ BASE_DIR = Path(__file__).resolve().parents[2]
+ SRC_DIR = BASE_DIR / "src"
+ sys.path.insert(0, str(SRC_DIR))
+
+ RESULTS_DIR = BASE_DIR / "src" / "backend" / "results"
+ RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+ FRONTEND_DIR = BASE_DIR / "frontend"
+
+
+ @asynccontextmanager
+ async def lifespan(application: FastAPI = None):  # pylint: disable=unused-argument
+     """Called before the backend comes online; used to load the datasets in memory."""
+     try:
+         token = os.environ.get("HF_TOKEN")
+         huggingface_hub.login(token=token)
+         preload_all_datasets()
+     except Exception as e:
+         error_message = f"The datasets could not be loaded: {e}"
+         logging.critical(error_message)
+
+     yield
+
+
+ app = FastAPI(lifespan=lifespan)
+ app.mount("/results", StaticFiles(directory=str(RESULTS_DIR)), name="results")
+ front_end_info_message = f"The front-end directory is: {FRONTEND_DIR}"
+ logging.info(front_end_info_message)
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_credentials=True,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ @app.post("/submit")
+ async def submit(
+     email: str = Form(...),
+     predictions_zip: UploadFile = File(...),
+     display_name: str = Form(...),
+ ):
+     """Route for making submissions with user-generated results.
+     :param email: The email of the user's submission.
+     :param predictions_zip: The ZIP file of the user's predictions.
+     :param display_name: The display name associated with the user's submission.
+     """
+     logging.info("Starting submission")
+     info_message = f"Submission from {email!r} as {display_name!r}."
+     logging.info(info_message)
+     zip_bytes = await predictions_zip.read()
+     submission_json = unzip_predictions_from_zip(zip_bytes)
+
+     validate_submission_template(submission_json)
+     validate_submission_tasks_name(submission_json)
+     validate_submission_json(submission_json)
+
+     tasks: List[Task] = tasks_factory(submission_json)
+     logging.info("Computation started")
+     start = datetime.now()
+     submission_response = compute_tasks_ratings(tasks=tasks, submission=submission_json)
+     computation_time = datetime.now() - start
+     info_message = f"Computation ended in {computation_time}"
+     logging.info(info_message)
+     submission_id = str(uuid.uuid4())
+     submission_response.update(
+         {
+             "display_name": display_name,
+             "email": email,
+             "submission_id": submission_id,
+         }
+     )
+
+     out_path = RESULTS_DIR / f"{submission_id}.json"
+     with open(out_path, "w", encoding="utf-8") as f:
+         json.dump(submission_response, f, ensure_ascii=False, indent=2)
+
+     get_leaderboard_entries.cache_clear()
+
+     return JSONResponse(content=submission_response)
+
+
+ @lru_cache(maxsize=1)
+ def get_leaderboard_entries() -> List[Dict[str, Any]]:
+     """Returns all entries currently in the leaderboard.
+     Also supports JSON files that contain a LIST of entries,
+     and normalizes 'flat' metrics into nested groups for the frontend.
+     """
+
+     def _wrap_flat_metrics(task_payload: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         If task_payload is 'flat' (e.g. {"accuracy": 94.2}),
+         wrap it as {"<group>": {...}} so the frontend can aggregate it.
+         Group naming rules:
+         - exact_match/f1 present -> "fquad"
+         - else acc/accuracy present -> "accuracy"
+         - else pearson/pearsonr/spearman present -> "correlation"
+         - else -> "metrics"
+         Values > 1 are left as-is (the frontend already normalizes % -> [0, 1]).
+         """
+         if not isinstance(task_payload, dict):
+             return task_payload
+
+         # If the payload is already nested (some value is a dict), leave it untouched.
+         if any(isinstance(v, dict) for v in task_payload.values()):
+             return task_payload
+
+         keys = set(k.lower() for k in task_payload.keys())
+         if {"exact_match", "f1"} & keys:
+             group = "fquad"
+         elif {"accuracy", "acc"} & keys:
+             group = "accuracy"
+         elif {"pearson", "pearsonr", "spearman"} & keys:
+             group = "correlation"
+         else:
+             group = "metrics"
+
+         # Nothing special for warnings here: the frontend treats them as optional
+         # and expects "<group>_warning" in the inner object if one is provided.
+         return {group: task_payload}
+
+     entries: List[Dict[str, Any]] = []
+
+     for filepath in glob.glob(str(RESULTS_DIR / "*.json")):
+         try:
+             with open(filepath, encoding="utf-8") as f:
+                 data = json.load(f)
+
+             # Inner helper that converts ONE entry (dict) to the minimal expected format.
+             def process_entry(entry: Dict[str, Any]) -> Union[Dict[str, Any], None]:
+                 if not isinstance(entry, dict):
+                     return None
+                 if "model_name" not in entry or "tasks" not in entry:
+                     return None
+
+                 # Rebuild "results" in the shape the frontend expects.
+                 results = {}
+                 for task_obj in entry.get("tasks", []):
+                     if not isinstance(task_obj, dict) or len(task_obj) != 1:
+                         continue
+                     task_name, payload = list(task_obj.items())[0]
+                     normalized = _wrap_flat_metrics(payload)
+                     results[task_name] = normalized
+
+                 if not results:
+                     return None
+
+                 return {
+                     "submission_id": entry.get("submission_id") or str(uuid.uuid4()),
+                     "display_name": entry.get("display_name")
+                     or entry.get("model_name")
+                     or "Unnamed Model",
+                     "email": entry.get("email", "N/A"),
+                     "results": results,
+                 }
+
+             # The file may contain a single entry (dict) or several (list).
+             if isinstance(data, list):
+                 for item in data:
+                     processed = process_entry(item)
+                     if processed:
+                         entries.append(processed)
+             else:
+                 processed = process_entry(data)
+                 if processed:
+                     entries.append(processed)
+
+         except Exception as e:
+             logging_message = f"Error processing file '{filepath}': {e}"
+             logging.error(logging_message)
+             continue
+
+     return entries
+
+
+ @app.get("/leaderboard")
+ async def leaderboard() -> List[Dict[str, Any]]:
+     return get_leaderboard_entries()
+
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy", "message": "API is running."}
+
+
+ @app.get("/")
+ async def home():
+     return {"status": "working"}
src/backend/submit_tools.py ADDED
@@ -0,0 +1,19 @@
+ import io
+ import json
+ import zipfile
+
+ from fastapi import HTTPException
+
+
+ def unzip_predictions_from_zip(zip_bytes: bytes) -> dict:
+     """
+     Reads predictions.json directly from the ZIP in memory.
+     """
+     with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
+         if "predictions.json" not in z.namelist():
+             error_message = (
+                 "The uploaded ZIP file does not contain a predictions.json file."
+             )
+             raise HTTPException(400, error_message)
+         with z.open("predictions.json") as f:
+             return json.load(f)
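A small in-memory round trip shows the contract of unzip_predictions_from_zip: the archive must contain a file named exactly predictions.json. This sketch assumes the function is importable from src.backend.submit_tools:

import io
import json
import zipfile

from src.backend.submit_tools import unzip_predictions_from_zip

payload = {"model_name": "demo", "model_url": "n/a", "tasks": []}

buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w") as zf:
    zf.writestr("predictions.json", json.dumps(payload))

# Feeding the raw bytes back reproduces what the /submit route does.
assert unzip_predictions_from_zip(buffer.getvalue()) == payload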
src/backend/validation_tools.py ADDED
@@ -0,0 +1,93 @@
+ import logging
+ from typing import Dict, List
+
+ from fastapi import HTTPException
+
+ tasks_name = [
+     "allocine",
+     "fquad",
+     "gqnli",
+     "paws_x",
+     "piaf",
+     "qfrblimp",
+     "qfrcola",
+     "sickfr",
+     "sts22",
+     "xnli",
+ ]
+
+
+ def validate_submission_template(dictionary: Dict) -> None:
+     """Ensures the dictionary follows the correct format.
+     :param dictionary: Dictionary to validate."""
+     # 400: a malformed submission is a client error (the original raised these with status 200 OK).
+     if dictionary.get("model_name", None) is None:
+         error = "The submission is missing a model name."
+         logging.error(error)
+         raise HTTPException(400, error)
+     if dictionary.get("model_url", None) is None:
+         error = "The submission is missing a model URL."
+         logging.error(error)
+         raise HTTPException(400, error)
+     if dictionary.get("tasks", None) is None:
+         error = "The submission is missing a tasks keyword."
+         logging.error(error)
+         raise HTTPException(400, error)
+
+     tasks = dictionary.get("tasks")
+     if not isinstance(tasks, List):
+         error = (
+             "The tasks keyword value must be a list of dictionaries where the key is the task name "
+             "and the value is a dictionary of predictions (in a list format). See our documentation "
+             "for a template."
+         )
+         logging.error(error)
+         raise HTTPException(400, error)
+
+     for task in tasks:
+         if len(task.keys()) > 1:
+             error = (
+                 "Each task must be a dictionary of one element where the key is "
+                 "the task name and the value is a list."
+             )
+             logging.error(error)
+             raise HTTPException(400, error)
+
+
+ def validate_submission_tasks_name(dictionary: Dict) -> None:
+     """
+     Validate that the submission JSON keys are valid task names.
+     """
+     for task in dictionary.get("tasks"):
+         key = list(task.keys())[0]
+         if key not in tasks_name:
+             error = f"Unknown key '{key}' in the submission JSON. The expected tasks are: {tasks_name}."
+             logging.error(error)
+             raise HTTPException(400, error)
+
+
+ def validate_submission_json(dictionary: Dict) -> None:
+     """Validates that the submitted JSON is in the correct format.
+     :param dictionary: Dictionary to validate."""
+     task_payload = dictionary.get("tasks")
+
+     for task in task_payload:
+         for task_name, payload in task.items():
+             if not isinstance(payload, dict):
+                 error = (
+                     "The tasks payload must be a dictionary in the format '{'predictions': [<predictions>]}' "
+                     "for each task."
+                 )
+                 logging.error(error)
+                 raise HTTPException(400, error)
+             for key, value in payload.items():
+                 # Only "predictions" is accepted; the evaluation step pops exactly this key.
+                 if key != "predictions":
+                     error = f"The task '{task_name}' payload does not have the expected key: 'predictions'."
+                     logging.error(error)
+                     raise HTTPException(400, error)
+                 if not isinstance(value, list):
+                     error = (
+                         f"The task '{task_name}' predictions payload is not in a list format. "
+                         r"The expected format is: '{'predictions': [<predictions>]}'"
+                     )
+                     logging.error(error)
+                     raise HTTPException(400, error)
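The three validators run in sequence on the parsed submission and raise an HTTPException before any evaluation starts if the shape is wrong. A minimal sketch of a dictionary that passes all three checks; the task names must come from tasks_name above, and the model name and URL are placeholders:

from src.backend.validation_tools import (
    validate_submission_json,
    validate_submission_tasks_name,
    validate_submission_template,
)

submission = {
    "model_name": "demo-model",
    "model_url": "https://example.com/demo",  # placeholder URL
    "tasks": [
        {"allocine": {"predictions": [0, 1, 1]}},
        {"qfrcola": {"predictions": [1, 0]}},
    ],
}

validate_submission_template(submission)    # keys and overall shape
validate_submission_tasks_name(submission)  # task names against tasks_name
validate_submission_json(submission)        # per-task payload format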
src/dataset/__init__.py ADDED
File without changes
src/dataset/dataset.py ADDED
@@ -0,0 +1,96 @@
+ from typing import Callable, Any, Union, List
+
+ from datasets import load_dataset
+
+
+ class Dataset:
+     """Class representing a usable dataset.
+     Allows a dataset to be expressed in multiple forms, including as prompts, data, or answers.
+     :param name: Name of the dataset.
+     :param description: Description of the dataset.
+     :param possible_ground_truths: The forms that the ground truths can take.
+     :param hugging_face_repo: Where to download the dataset on Hugging Face.
+     :param line_to_truth_fn: A function converting a dataset line to its truth value.
+     :param line_to_prompt_fn: A function converting a dataset line to a prompt for LLM inference.
+     :param line_to_data_fn: A function converting a dataset line to its data value for non-LLM inference.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         description: str,
+         possible_ground_truths: Union[List[str], List[int], List[float]],
+         hugging_face_repo: str,
+         line_to_truth_fn: Callable,
+         line_to_prompt_fn: Callable,
+         line_to_data_fn: Callable,
+     ):
+         self._dataset = None
+         self.name = name
+         self.description = description
+         self.hugging_face_repo = hugging_face_repo
+         self.possible_ground_truths = possible_ground_truths
+         self.line_to_prompt_fn = line_to_prompt_fn
+         self.line_to_truth_fn = line_to_truth_fn
+         self.line_to_data_fn = line_to_data_fn
+
+     @property
+     def dataset(self):
+         self.load_data()
+         return self._dataset
+
+     def load_data(self):
+         if self._dataset is None:
+             self._dataset = load_dataset(
+                 self.hugging_face_repo, name=self.name, split="test"
+             )
+
+     @property
+     def ground_truths(self) -> Union[List[str], List[int], List[float]]:
+         """The dataset's ground truths as a list."""
+         return [self.line_to_truth_fn(line) for line in self.dataset]
+
+     @property
+     def prompts(self) -> List[str]:
+         """The dataset's prompts as a list."""
+         return [self.line_to_prompt_fn(line) for line in self.dataset]
+
+     @property
+     def data(self) -> List[str]:
+         """The dataset's data as a list."""
+         return [self.line_to_data_fn(line) for line in self.dataset]
+
+     @property
+     def metadata(self) -> dict[str, Any]:
+         """The dataset's metadata as a dict."""
+         return {
+             "name": self.name,
+             "description": self.description,
+             "possible_ground_truths": str(self.possible_ground_truths),
+             "Prompt template": self.line_to_prompt_fn(self.EchoDict()),
+         }
+
+     @property
+     def metadata_string(self) -> str:
+         """The dataset's metadata as a string."""
+         lines = []
+         for key, value in self.metadata.items():
+             lines.append(f"{key}: {value}")
+         return "\n".join(lines)
+
+     def __len__(self):
+         return len(self.ground_truths)
+
+     def __getitem__(self, index: Union[int, slice]):
+         if isinstance(index, slice):
+             get_item_data = self.ground_truths[index.start : index.stop]
+         else:
+             get_item_data = self.ground_truths[index]
+
+         return get_item_data
+
+     class EchoDict:
+         """Helper class for building prompt templates; always returns the accessed key."""
+
+         def __getitem__(self, key):
+             return key
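The EchoDict helper is what lets metadata render a prompt template without loading any data: every lookup simply echoes its key, so line_to_prompt_fn produces the prompt with field names in place of values. A small standalone sketch with a hypothetical prompt function:

class EchoDict:
    """Same trick as Dataset.EchoDict: return the accessed key itself."""

    def __getitem__(self, key):
        return key


# Hypothetical prompt function in the style used by datasets_data.py.
def line_to_prompt_fn(line):
    return f"Review: {line['review']}\nAnswer 0 or 1:"


print(line_to_prompt_fn(EchoDict()))
# Review: review
# Answer 0 or 1: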
src/dataset/datasets_data.py ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.dataset.dataset import Dataset
2
+ from src.dataset.prompt_builder import PromptBuilder
3
+ from src.task import COLE_REPOSITORY_NAME
4
+ from src.task.task_names import Tasks
5
+
6
+ datasets = {
7
+ Tasks.ALLOCINE.value: Dataset(
8
+ name=Tasks.ALLOCINE.value,
9
+ description="Binary classification on sentiment analysis"
10
+ " of movie reviews, with reviews being either positive (1) or negative (0).",
11
+ possible_ground_truths=["0", "1"],
12
+ hugging_face_repo=COLE_REPOSITORY_NAME,
13
+ line_to_truth_fn=lambda line: line["label"],
14
+ line_to_prompt_fn=lambda line: PromptBuilder()
15
+ .add_premise("Cette phrase possède-t-elle un sentiment positif ou négatif ?")
16
+ .add_data(line["review"])
17
+ .add_end(
18
+ (
19
+ "Réponds "
20
+ "uniquement par 1 si la phrase est positive, réponds par 0 sinon. La réponse est :"
21
+ )
22
+ )
23
+ .build(),
24
+ line_to_data_fn=lambda line: line["review"],
25
+ ),
26
+ Tasks.QFRCOLA.value: Dataset(
27
+ name=Tasks.QFRCOLA.value,
28
+ description="Binary grammatical judgement : "
29
+ "Predicts whether a sentence is grammatically correct (1) or not. (0).",
30
+ possible_ground_truths=["0", "1"],
31
+ hugging_face_repo=COLE_REPOSITORY_NAME,
32
+ line_to_truth_fn=lambda line: line["label"],
33
+ line_to_prompt_fn=lambda line: PromptBuilder()
34
+ .add_premise("Juge si cette phrase est grammaticalement correcte :")
35
+ .add_data(line["sentence"])
36
+ .add_end(
37
+ (
38
+ "Réponds avec seulement 1 si la phrase est grammaticalement correcte, 0 sinon. La réponse est :"
39
+ )
40
+ )
41
+ .build(),
42
+ line_to_data_fn=lambda line: line["sentence"],
43
+ ),
44
+ Tasks.QFRBLIMP.value: Dataset(
45
+ name=Tasks.QFRBLIMP.value,
46
+ description="Choice task between two sentences : Choose the one which is grammatically correct.",
47
+ possible_ground_truths=["0", "1"],
48
+ hugging_face_repo=COLE_REPOSITORY_NAME,
49
+ line_to_truth_fn=lambda line: str(
50
+ line["label"]
51
+ ), # The label is return as a string.
52
+ line_to_prompt_fn=lambda line: (
53
+ PromptBuilder()
54
+ .add_premise("Laquelle de ces phrases est grammaticalement correcte ?")
55
+ .add_data(f"Phrase 0:{line['sentence_a']}")
56
+ .add_data(f"Phrase 1:{line['sentence_b']}")
57
+ .add_end(
58
+ "Réponds avec seulement 0 si la phrase 0 "
59
+ "est grammaticalement correcte, et uniquement 1 si la phrase 1 est grammaticalement "
60
+ "correcte. La réponse est :"
61
+ )
62
+ .build()
63
+ ),
64
+ line_to_data_fn=lambda line: {line["sentence_a"], line["sentence_b"]},
65
+ ),
66
+ Tasks.GQNLI.value: Dataset(
67
+ name=Tasks.GQNLI.value,
68
+ description="Natural language inference task : "
69
+ "predict the relation between two sentences (implication, neutral, contradiction).",
70
+ possible_ground_truths=["0", "1", "2"],
71
+ hugging_face_repo=COLE_REPOSITORY_NAME,
72
+ line_to_truth_fn=lambda line: line["label"],
73
+ line_to_prompt_fn=lambda line: PromptBuilder()
74
+ .add_premise(
75
+ "Quelle est la relation de la deuxième phrase par rapport à la première ?"
76
+ )
77
+ .add_data(line["premise"])
78
+ .add_data(line["hypothesis"])
79
+ .add_end(
80
+ (
81
+ "Réponds uniquement par :\n"
82
+ "0 - si la deuxième phrase implique la première,\n"
83
+ "1 - si la relation est neutre,\n"
84
+ "2 - s'il y a contradiction.\n"
85
+ "Réponds uniquement par 0, 1 ou 2. La réponse est :"
86
+ )
87
+ )
88
+ .build(),
89
+ line_to_data_fn=lambda line: {
90
+ "premise": line["premise"],
91
+ "hypothesis": line["hypothesis"],
92
+ },
93
+ ),
94
+ Tasks.SICKFR.value: Dataset(
95
+ name=Tasks.SICKFR.value,
96
+ description="Natural language inference task : "
97
+ "predict the relation between two sentences (implication, neutral, contradiction).",
98
+ possible_ground_truths=["0", "1", "2"],
99
+ hugging_face_repo=COLE_REPOSITORY_NAME,
100
+ line_to_truth_fn=lambda line: str(line["label"]),
101
+ line_to_prompt_fn=lambda line: PromptBuilder()
102
+ .add_premise("Détermine la relation entre les deux phrases suivantes :")
103
+ .add_data(f"Phrase A : {line['sentence_A']}\nPhrase B : {line['sentence_B']}")
104
+ .add_end(
105
+ "Réponds uniquement par 0, 1 ou 2 :\n"
106
+ "0 - si la deuxième phrase découle logiquement de la première,\n"
107
+ "1 - si leur relation est neutre,\n"
108
+ "2 - si les phrases se contredisent.\n"
109
+ "La réponse est :"
110
+ )
111
+ .build(),
112
+ line_to_data_fn=lambda line: {
113
+ "sentence_A": line["sentence_A"],
114
+ "sentence_B": line["sentence_B"],
115
+ },
116
+ ),
117
+ Tasks.STS22.value: Dataset(
118
+ name=Tasks.STS22.value,
119
+ description="Semantic textual similarity task : "
120
+ "Predict how similar two sentences are to each other (1 to 4).",
121
+ possible_ground_truths=["1", "2", "3", "4"],
122
+ hugging_face_repo=COLE_REPOSITORY_NAME,
123
+ line_to_truth_fn=lambda line: str(line["score"]),
124
+ line_to_prompt_fn=lambda line: PromptBuilder()
125
+ .add_premise(
126
+ "À quel point les deux phrases suivantes sont-elles similaires ? Donne une note entière de 1 à 4."
127
+ )
128
+ .add_data(f"Phrase 1 : {line['sentence1']}\nPhrase 2 : {line['sentence2']}")
129
+ .add_end(
130
+ "Réponds uniquement avec un nombre entier entre 1 (aucune similarité) et 4 (équivalence parfaite). "
131
+ "La réponse est :"
132
+ )
133
+ .build(),
134
+ line_to_data_fn=lambda line: {
135
+ "sentence1": line["sentence1"],
136
+ "sentence2": line["sentence2"],
137
+ },
138
+ ),
139
+ Tasks.PAWS_X.value: Dataset(
140
+ name=Tasks.PAWS_X.value,
141
+ description="Binary classification task : "
142
+ "Predict if two sentences have the same meaning (1) or not (0).",
143
+ possible_ground_truths=["0", "1"],
144
+ hugging_face_repo=COLE_REPOSITORY_NAME,
145
+ line_to_truth_fn=lambda line: line["label"],
146
+ line_to_prompt_fn=lambda line: PromptBuilder()
147
+ .add_premise(
148
+ "Les deux phrases suivantes veulent-elles dire la même chose, ou ont-elles des significations différentes ?"
149
+ )
150
+ .add_data(line["sentence1"])
151
+ .add_data(line["sentence2"])
152
+ .add_end(
153
+ (
154
+ "Réponds seulement 1 si les deux phrases ont la même signification, 0 sinon. La réponse est :"
155
+ )
156
+ )
157
+ .build(),
158
+ line_to_data_fn=lambda line: {
159
+ "sentence1": line["sentence1"],
160
+ "sentence2": line["sentence2"],
161
+ },
162
+ ),
163
+ Tasks.PIAF.value: Dataset(
164
+ name=Tasks.PIAF.value,
165
+ description="Extractive question answering task : Extract a question's answer from a given context.",
166
+ possible_ground_truths=[],
167
+ hugging_face_repo=COLE_REPOSITORY_NAME,
168
+ line_to_truth_fn=lambda line: line["answers"],
169
+ line_to_prompt_fn=lambda line: PromptBuilder()
170
+ .add_premise(
171
+ "Tu vas recevoir un contexte suivi d'une question.\n"
172
+ "Ta tâche est d'extraire **mot pour mot** le passage du contexte qui répond le mieux à la question.\n"
173
+ "N'invente rien. Ne reformule pas.\n"
174
+ "Réponds **en copiant uniquement** un extrait exact du texte ci-dessus."
175
+ )
176
+ .add_data(f"Contexte : {line['context']}")
177
+ .add_data(f"Question : {line['question']}")
178
+ .add_end(
179
+ "Réponds uniquement par un passage extrait du contexte. La réponse est :"
180
+ )
181
+ .build(),
182
+ line_to_data_fn=lambda line: {
183
+ "context": line["context"],
184
+ "question": line["question"],
185
+ },
186
+ ),
187
+ Tasks.FQUAD.value: Dataset(
188
+ name=Tasks.FQUAD.value,
189
+ description="Extractive question answering task : Extract a question's answer from a given context.",
190
+ possible_ground_truths=[],
191
+ hugging_face_repo=COLE_REPOSITORY_NAME,
192
+ line_to_truth_fn=lambda line: line["answers"],
193
+ line_to_prompt_fn=lambda line: PromptBuilder()
194
+ .add_premise(
195
+ "Tu vas recevoir un contexte suivi d'une question.\n"
196
+ "Ta tâche est d'extraire **mot pour mot** le passage du contexte qui répond le mieux à la question.\n"
197
+ "N'invente rien. Ne reformule pas.\n"
198
+ "Réponds **en copiant uniquement** un extrait exact du texte ci-dessus."
199
+ )
200
+ .add_data(f"Contexte : {line['context']}")
201
+ .add_data(f"Question : {line['question']}")
202
+ .add_end(
203
+ "Réponds uniquement par un passage extrait du contexte. La réponse est :"
204
+ )
205
+ .build(),
206
+ line_to_data_fn=lambda line: {
207
+ "context": line["context"],
208
+ "question": line["question"],
209
+ },
210
+ ),
211
+ Tasks.XNLI.value: Dataset(
212
+ name=Tasks.XNLI.value,
213
+ description="Natural language inference task : "
214
+ "predict the relation between two sentences (implication, neutral, contradiction).",
215
+ possible_ground_truths=["0", "1", "2"],
216
+ hugging_face_repo=COLE_REPOSITORY_NAME,
217
+ line_to_truth_fn=lambda line: str(line["label"]),
218
+ line_to_prompt_fn=lambda line: PromptBuilder()
219
+ .add_premise(
220
+ "Quelle est la relation de la deuxième phrase par rapport à la première ?"
221
+ )
222
+ .add_data(rf"premise : {line['premise']}\n" f"sentence 2: {line['hypothesis']}")
223
+ .add_end(
224
+ (
225
+ "Réponds uniquement par :\n"
226
+ "0 - si la deuxième phrase implique la première,\n"
227
+ "1 - si la relation est neutre,\n"
228
+ "2 - s'il y a contradiction.\n"
229
+ "Réponds uniquement par 0, 1 ou 2. La réponse est :"
230
+ )
231
+ )
232
+ .build(),
233
+ line_to_data_fn=lambda line: {
234
+ "premise": line["premise"],
235
+ "hypothesis": line["hypothesis"],
236
+ },
237
+ ),
238
+ Tasks.QFRCORE.value: Dataset(
239
+ name=Tasks.QFRCORE.value,
240
+ description="Definition matching task : "
241
+ "Match the Quebec expression with its definition from a list.",
242
+ possible_ground_truths=[str(i) for i in range(10)],
243
+ hugging_face_repo=COLE_REPOSITORY_NAME,
244
+ line_to_truth_fn=lambda line: str(line["correct_index"]),
245
+ line_to_prompt_fn=lambda line: PromptBuilder()
246
+ .add_premise(
247
+ f"Que veut dire cette expression québécoise « {line['expression']} » ?"
248
+ )
249
+ .add_data(
250
+ "\n".join(
251
+ f"{idx} - {definition}"
252
+ for idx, definition in enumerate(line["choices"])
253
+ )
254
+ )
255
+ .add_end(
256
+ (
257
+ "Réponds uniquement par l'index, débutant à zéro, "
258
+ "de la bonne définition parmi la liste ci-dessus. Par exemple, si la "
259
+ "troisième phrase correspond à l'expression, la réponse sera 2. La réponse est :"
260
+ )
261
+ )
262
+ .build(),
263
+ line_to_data_fn=lambda line: {
264
+ "expression": line["expression"],
265
+ "choices": line["choices"],
266
+ },
267
+ ),
268
+ Tasks.QFRCORT.value: Dataset(
269
+ name=Tasks.QFRCORT.value,
270
+ description="Definition matching task : "
271
+ "Match the Quebec term with its definition from a list.",
272
+ possible_ground_truths=[str(i) for i in range(10)],
273
+ hugging_face_repo=COLE_REPOSITORY_NAME,
274
+ line_to_truth_fn=lambda line: str(line["correct_index"]),
275
+ line_to_prompt_fn=lambda line: PromptBuilder()
276
+ .add_premise(
277
+ f"Qu'est-ce que ça veut dire ce terme québécois « {line['terme']} » ?"
278
+ )
279
+ .add_data(
280
+ "\n".join(
281
+ f"{idx} - {definition}"
282
+ for idx, definition in enumerate(line["choices"])
283
+ )
284
+ )
285
+ .add_end(
286
+ (
287
+ "Réponds uniquement par l'index, débutant à zéro, "
288
+ "de la bonne définition parmi la liste ci-dessus. La réponse est :"
289
+ )
290
+ )
291
+ .build(),
292
+ line_to_data_fn=lambda line: {
293
+ "terme": line["terme"],
294
+ "choices": line["choices"],
295
+ },
296
+ ),
297
+ Tasks.DACCORD.value: Dataset(
298
+ name=Tasks.DACCORD.value,
299
+ description="Paraphrase detection task :"
300
+ "Predict whether the two sentences are compatible (0) "
301
+ "or contradict each other (1).",
302
+ possible_ground_truths=["0", "1"],
303
+ hugging_face_repo=COLE_REPOSITORY_NAME,
304
+ line_to_truth_fn=lambda line: str(line["label"]),
305
+ line_to_prompt_fn=lambda line: (
306
+ PromptBuilder()
307
+ .add_premise("Détermine la relation entre les deux phrases suivantes :")
308
+ .add_data(f"Première phrase : {line['premise']}")
309
+ .add_data(f"Deuxième phrase : {line['hypothesis']}")
310
+ .add_end(
311
+ "Réponds uniquement par :\n"
312
+ "0 - si les deux phrases sont compatibles (elles expriment la même information ou sont cohérentes),\n"
313
+ "1 - s'il y a contradiction entre les deux phrases.\n"
314
+ "Réponds uniquement par 0 ou 1. La réponse est :"
315
+ )
316
+ .build()
317
+ ),
318
+ line_to_data_fn=lambda line: {
319
+ "premise": line["premise"],
320
+ "hypothesis": line["hypothesis"],
321
+ },
322
+ ),
323
+ Tasks.FRENCH_BOOLQ.value: Dataset(
324
+ name=Tasks.FRENCH_BOOLQ.value,
325
+ description="Binary question answering task : "
326
+ "Answer whether the context allows answering 'yes' to the question (1)"
327
+ "or, if the context only allows answering 'no' "
328
+ "to the question or does not answer the question. (0).",
329
+ possible_ground_truths=["0", "1"],
330
+ hugging_face_repo=COLE_REPOSITORY_NAME,
331
+ line_to_truth_fn=lambda line: str(line["label"]),
332
+ line_to_prompt_fn=lambda line: (
333
+ PromptBuilder()
334
+ .add_premise(
335
+ "Lis le passage suivant et réponds à la question en te basant uniquement sur le texte :\n"
336
+ "- Si le passage permet d'affirmer que la réponse à la question est oui, réponds 1.\n"
337
+ "- Sinon, si la réponse est non ou que le passage ne permet pas de répondre à la question, réponds 0."
338
+ )
339
+ .add_data(f"Passage : {line['passage']}")
340
+ .add_data(f"Question : {line['question']}")
341
+ .add_end("La réponse est :")
342
+ .build()
343
+ ),
344
+ line_to_data_fn=lambda line: {
345
+ "question": line["question"],
346
+ "passage": line["passage"],
347
+ },
348
+ ),
349
+     Tasks.MNLI_NINEELEVEN_FR_MT.value: Dataset(
+         name=Tasks.MNLI_NINEELEVEN_FR_MT.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Quelle est la relation de la deuxième phrase par rapport à la première ?"
+         )
+         .add_data(line["premise"])
+         .add_data(line["hypothesis"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la deuxième phrase implique la première,\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+     Tasks.RTE3_FRENCH.value: Dataset(
+         name=Tasks.RTE3_FRENCH.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(line["label"]),
+         line_to_prompt_fn=lambda line: (
+             PromptBuilder()
+             .add_premise(
+                 "Lis le texte suivant et détermine la relation de l'énoncé par rapport au texte."
+             )
+             .add_data(f"Texte : {line['premise']}")
+             .add_data(f"Énoncé : {line['hypothesis']}")
+             .add_end(
+                 "Réponds uniquement par 0, 1 ou 2 :\n"
+                 "0 - si l'énoncé découle logiquement du texte (entailment),\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "La réponse est :"
+             )
+             .build()
+         ),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+     Tasks.WINO_X_LM.value: Dataset(
+         name=Tasks.WINO_X_LM.value,
+         description=(
+             "Pronoun resolution task: predict the correct referent (1 or 2) "
+             "of a pronoun in a sentence by choosing between two candidates."
+         ),
+         possible_ground_truths=["1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(line["answer"]),
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             'Voici une phrase en anglais contenant le pronom "it" dans un sens ambigu et sa traduction en français.'
+         )
+         .add_data(f"Phrase (originale en anglais) : {line['sentence']}")
+         .add_data(
+             f"Traduction en français (le pronom est caché par '_') : {line['context_fr']}"
+         )
+         .add_data("À quoi renvoie ce pronom ? Voici les choix :")
+         .add_data(f"1 : {line['option1_fr']}")
+         .add_data(f"2 : {line['option2_fr']}")
+         .add_end("Réponds uniquement par 1 ou 2. La réponse est :")
+         .build(),
+         line_to_data_fn=lambda line: {
+             "sentence": line["sentence"],
+             "translation": line["context_fr"],
+             "referent1": line["option1_fr"],
+             "referent2": line["option2_fr"],
+         },
+     ),
+     Tasks.WINO_X_MT.value: Dataset(
+         name=Tasks.WINO_X_MT.value,
+         description=(
+             "Pronoun resolution based on translations: choose between two French translations of an English "
+             "sentence with an ambiguous pronoun. The goal is to identify which of the two translations uses "
+             "the correct pronoun (he or she) based on the correct referent."
+         ),
+         possible_ground_truths=["1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(line["answer"]),
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Voici deux traductions d’une phrase anglaise contenant un pronom ambigu :"
+         )
+         .add_data(f"Phrase originale : {line['sentence']}")
+         .add_data(f"Traduction 1 (avec '{line['pronoun1']}') : {line['translation1']}")
+         .add_data(f"Traduction 2 (avec '{line['pronoun2']}') : {line['translation2']}")
+         .add_end(
+             "Quelle traduction utilise le bon pronom en fonction du référent visé dans la phrase originale ?\n"
+             "Réponds uniquement par 1 si la traduction 1 est correcte, ou 2 si la traduction 2 est correcte.\n"
+             "La réponse est :"
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "sentence": line["sentence"],
+             "translation1": line["translation1"],
+             "translation2": line["translation2"],
+             "pronoun1": line["pronoun1"],
+             "pronoun2": line["pronoun2"],
+         },
+     ),
+     Tasks.MULTIBLIMP.value: Dataset(
+         name=Tasks.MULTIBLIMP.value,
+         description="Choice task between two sentences: choose the one that is grammatically correct.",
+         possible_ground_truths=["0", "1"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(
+             line["label"]
+         ),  # The label is returned as a string.
+         line_to_prompt_fn=lambda line: (
+             PromptBuilder()
+             .add_premise("Laquelle de ces phrases est grammaticalement correcte ?")
+             .add_data(f"Phrase 0 : {line['sentence_a']}")
+             .add_data(f"Phrase 1 : {line['sentence_b']}")
+             .add_end(
+                 "Réponds avec seulement 0 si la phrase 0 "
+                 "est grammaticalement correcte, et uniquement 1 si la phrase 1 est grammaticalement "
+                 "correcte. La réponse est :"
+             )
+             .build()
+         ),
+         line_to_data_fn=lambda line: {
+             "sentence_a": line["sentence_a"],
+             "sentence_b": line["sentence_b"],
+         },  # A dict, consistent with the other entries (was a set literal).
+     ),
+     Tasks.FRACAS.value: Dataset(
+         name=Tasks.FRACAS.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Quelle est la relation de la deuxième phrase par rapport à la première ?"
+         )
+         .add_data(line["premise"])
+         .add_data(line["hypothesis"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la deuxième phrase implique la première,\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+     Tasks.MMS.value: Dataset(
+         name=Tasks.MMS.value,
+         description="A sentiment analysis task for classifying text as positive (2), negative (0), or neutral (1).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise("Quel est le sentiment de cette phrase ?")
+         .add_data(line["text"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la phrase est négative,\n"
+                 "1 - si la phrase est neutre,\n"
+                 "2 - si la phrase est positive.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "text": line["text"],
+         },
+     ),
+     Tasks.WSD.value: Dataset(
+         name=Tasks.WSD.value,
+         description="Extractive word sense disambiguation: extract the ambiguous word from a sentence.",
+         possible_ground_truths=[],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Tu vas recevoir une phrase contenant un mot ambigu ainsi que les étiquettes du 'part-of-speech tagging "
+             "(PoS)' pour chaque mot de la phrase. Le mot ambigu peut être un verbe ou un adjectif.\n"
+             "Ta tâche est d’indiquer **exactement** ce mot ambigu dans la phrase, sans rien ajouter ni reformuler.\n"
+             "Réponds uniquement avec le mot ambigu identifié."
+         )
+         .add_data(f"Phrase : {line['sentence']}")
+         .add_data(f"Part-of-speech tagging : {line['pos_tag_labels']}")
+         .add_end("La réponse est :")
+         .build(),
+         line_to_data_fn=lambda line: {
+             "sentence": line["sentence"],
+             "pos_tag_labels": line["pos_tag_labels"],
+         },
+     ),
+     Tasks.LINGNLI.value: Dataset(
+         name=Tasks.LINGNLI.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Quelle est la relation de la deuxième phrase par rapport à la première ?"
+         )
+         .add_data(line["premise"])
+         .add_data(line["hypothesis"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la deuxième phrase implique la première,\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+ }
+
+
+ def preload_all_datasets():
+     """Loads all datasets into the cache for later usage."""
+     for dataset in datasets.values():
+         dataset.load_data()
+
+
+ def generate_metadata_dict():
+     """Generates a dictionary with all the datasets' metadata."""
+     metadata_dict = {}
+     for dataset in datasets.values():
+         metadata_dict[dataset.name] = dataset.metadata
+     return metadata_dict
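Editor's note: a minimal sketch of how these registry entries are meant to be consumed. The sample line and its field values are hypothetical, and the import path assumes this file is importable as src.dataset.datasets_data:

    from src.dataset.datasets_data import datasets
    from src.task.task_names import Tasks

    # A hypothetical DACCORD-style line, mirroring the fields the entry above reads.
    sample_line = {"premise": "Il pleut.", "hypothesis": "Il fait beau.", "label": 1}

    daccord = datasets[Tasks.DACCORD.value]
    print(daccord.line_to_prompt_fn(sample_line))  # The full French prompt string.
    print(daccord.line_to_truth_fn(sample_line))   # "1"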
src/dataset/prompt_builder.py ADDED
@@ -0,0 +1,43 @@
+ import logging
+ from typing import List
+
+
+ class PromptBuilder:
+     """Builder class for creating prompt strings with dynamic data."""
+
+     def __init__(self):
+         self.premise: List[str] = []
+         self.end: List[str] = []
+         self.data: List[str] = []
+         self.data_only = False
+
+     def add_data(self, data):
+         self.data.append(data)
+         return self
+
+     def add_end(self, end):
+         self.end.append(end)
+         return self
+
+     def set_data_only(self, data_only):
+         self.data_only = data_only
+         return self
+
+     def add_premise(self, premise):
+         self.premise.append(premise)
+         return self
+
+     def build(self):
+         """Builds and returns the prompt string from the premise, data, and end added to the builder."""
+         if len(self.data) == 0:
+             logging.warning(
+                 "This prompt did not contain any data; was that intentional?"
+             )
+
+         data = "\n".join(self.data)
+         if self.data_only:
+             return data
+
+         end = "".join(self.end)
+         premise = "".join(self.premise)
+         return f"{premise}\n{data}\n{end}"
src/docker_requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # Core
+ python-dotenv
+ numpy
+ python-multipart
+ scikit-learn
+ # Web backend (if you use FastAPI/Flask; otherwise, ignore)
+ fastapi
+ uvicorn
+ # Optional: pretty printing, progress bars, etc.
+ tqdm
+ aenum
+
+ evaluate
+ wheel
+
+ # For backward compatibility
+ protobuf<=3.20.3
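Editor's note: outside the Docker image, these pinned requirements can be installed the usual way, e.g. `pip install -r src/docker_requirements.txt`. Note that torch and wandb, which the evaluation pipeline below imports, are not pinned here.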
src/evaluation/__init__.py ADDED
File without changes
src/evaluation/evaluation_pipeline.py ADDED
@@ -0,0 +1,138 @@
+ import argparse
+ import gc
+ import logging
+ from datetime import datetime
+
+ import torch
+ import wandb
+ from tqdm import tqdm
+
+ from predictions.all_llms import llms
+ from src import WANDB_PROJECT
+ from src.evaluation.llm_evaluator import ModelEvaluator
+ from src.evaluation.llm_factory import model_factory
+ from src.evaluation.tools import split_llm_list
+ from src.task.task_factory import tasks_factory
+ from src.task.task_names import Tasks
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+     "--test",
+     help="If set, the system will default to testing only a small model with a few examples.",
+     # store_true avoids the argparse type=bool pitfall, where any non-empty
+     # string (including "False") is parsed as True.
+     action="store_true",
+ )
+ parser.add_argument(
+     "--max_examples",
+     "-m",
+     help="The maximum number of examples to use; defaults to None.",
+     type=int,
+     default=None,
+ )
+ parser.add_argument(
+     "--models_name",
+     "-mn",
+     help="The name of the model(s) to load: a key of the predefined LLM groups, or a comma-separated list.",
+     type=str,
+     default=None,
+ )
+
+ parser.add_argument(
+     "--batch_size",
+     help="The batch size to use during the evaluation.",
+     type=int,
+     default=32,
+ )
+
+ parser.add_argument(
+     "--llm_split",
+     help="The split of the LLMs list to use. It can be 1, 2 or 3.",
+     type=int,
+     default=None,
+     choices=[1, 2, 3],
+ )
+
+ parser.add_argument(
+     "--skip_first_n",
+     help="The number of LLMs to skip at the start of the split list.",
+     type=int,
+     default=None,
+ )
+
+ args = parser.parse_args()
+
+ tasks_names = list(Tasks)
+
+ tasks = tasks_factory(tasks_names)
+
+ models = []
+ if args.models_name is not None:
+     if args.models_name in llms:
+         models = llms[args.models_name]
+     else:
+         models = args.models_name.split(",")
+ else:
+     models = llms["all"]
+
+ models = split_llm_list(models=models, llm_split=args.llm_split)
+
+ if args.skip_first_n is not None:
+     models = models[args.skip_first_n :]
+
+ logging.info("Starting Evaluation")
+
+ time_start = datetime.now()
+
+ for model_name in tqdm(
+     models, total=len(models), desc="Processing LLM inference on tasks."
+ ):
+     try:
+         logging.info("Creating model")
+         model = model_factory(model_name, batch_size=args.batch_size)
+         evaluator = ModelEvaluator()
+         logging.info("Evaluating model")
+
+         exp_name = model_name
+         wandb.init(
+             project=WANDB_PROJECT,
+             entity="doctorate",
+             config={
+                 "model_name": model_name,
+                 "tasks": "; ".join(task.value for task in tasks_names),
+                 "batch_size": args.batch_size,
+             },
+             name=exp_name,
+         )
+
+         predictions_payload = evaluator.evaluate_subset(model, tasks, args.max_examples)
+         wandb.log(predictions_payload)
+
+         logging.info("Saving results")
+         evaluator.save_results("./results")
+
+         metrics_payload = evaluator.compute_metrics()
+         evaluator.save_metrics("./results")
+         wandb.log(metrics_payload)
+
+     except Exception as e:
+         error_message = f"Evaluation failed for model {model_name}: {e}"
+         logging.error(error_message)
+         wandb.finish(exit_code=1)
+         continue
+     finally:
+         # Memory cleanup
+         if "model" in locals():
+             del model
+         if "evaluator" in locals():
+             del evaluator
+         gc.collect()
+         torch.cuda.empty_cache()
+         # No-op if the run was already finished with a failure exit code above.
+         wandb.finish(exit_code=0)
+
+ time_end = datetime.now()
+ info_message = f"End time: {time_end}"
+ logging.info(info_message)
+ elapsed_time = time_end - time_start
+ info_message = f"Elapsed time: {elapsed_time}"
+ logging.info(info_message)
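Editor's note: reusing the same factories, a single model can also be evaluated programmatically without wandb. A minimal sketch, with the call signatures taken from the pipeline above and a hypothetical model name:

    from src.evaluation.llm_evaluator import ModelEvaluator
    from src.evaluation.llm_factory import model_factory
    from src.task.task_factory import tasks_factory
    from src.task.task_names import Tasks

    tasks = tasks_factory([Tasks.DACCORD])                 # A single task instead of list(Tasks).
    model = model_factory("my-small-model", batch_size=8)  # Hypothetical model name.
    evaluator = ModelEvaluator()
    evaluator.evaluate_subset(model, tasks, 10)            # Cap the run at 10 examples.
    evaluator.save_results("./results")
    metrics = evaluator.compute_metrics()
    evaluator.save_metrics("./results")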