davebulaval committed
Commit 8fa3acc · 1 Parent(s): 2857720
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. Dockerfile +69 -0
  2. frontend/README.md +38 -0
  3. frontend/cole.pdf +0 -0
  4. frontend/eslint.config.mjs +14 -0
  5. frontend/jsconfig.json +7 -0
  6. frontend/next.config.mjs +4 -0
  7. frontend/package-lock.json +0 -0
  8. frontend/postcss.config.mjs +5 -0
  9. frontend/src/app/FAQ/page.js +45 -0
  10. frontend/src/app/benchmarks/page.js +181 -0
  11. frontend/src/app/components/BigBlueButton.js +17 -0
  12. frontend/src/app/components/ClientHeader.js +17 -0
  13. frontend/src/app/components/CodeBlock.js +10 -0
  14. frontend/src/app/components/ErrorMessage.js +14 -0
  15. frontend/src/app/components/LanguageSwitcher.js +22 -0
  16. frontend/src/app/components/Modal.js +20 -0
  17. frontend/src/app/components/ModalManager.js +26 -0
  18. frontend/src/app/components/ModelDetailsModal.js +44 -0
  19. frontend/src/app/components/SubmitForm.js +177 -0
  20. frontend/src/app/components/UploadButton.js +27 -0
  21. frontend/src/app/components/taskbar.js +64 -0
  22. frontend/src/app/contact/page.js +32 -0
  23. frontend/src/app/en/translation.json +132 -0
  24. frontend/src/app/fr/translation.json +135 -0
  25. frontend/src/app/globals.css +26 -0
  26. frontend/src/app/guide/page.js +76 -0
  27. frontend/src/app/i18n.js +28 -0
  28. frontend/src/app/layout.js +37 -0
  29. frontend/src/app/leaderboard/page.js +272 -0
  30. frontend/src/app/leaderboard/util.js +47 -0
  31. frontend/src/app/page.js +74 -0
  32. frontend/src/app/papers/page.js +31 -0
  33. frontend/src/app/resources/BenchmarksResource.js +35 -0
  34. frontend/src/app/resources/ResourcesPaths.js +2 -0
  35. frontend/src/app/results/[id]/page.js +128 -0
  36. frontend/src/app/results/page.js +31 -0
  37. src/__init__.py +3 -0
  38. src/backend/__init__.py +0 -0
  39. src/backend/evaluation.py +36 -0
  40. src/backend/results/leaderboard.json +0 -0
  41. src/backend/submission_api.py +224 -0
  42. src/backend/submit_tools.py +19 -0
  43. src/backend/validation_tools.py +93 -0
  44. src/dataset/__init__.py +0 -0
  45. src/dataset/dataset.py +96 -0
  46. src/dataset/datasets_data.py +602 -0
  47. src/dataset/prompt_builder.py +43 -0
  48. src/docker_requirements.txt +17 -0
  49. src/evaluation/__init__.py +0 -0
  50. src/evaluation/evaluation_pipeline.py +138 -0
Dockerfile ADDED
@@ -0,0 +1,69 @@
+ # Build frontend
+ FROM node:18 as frontend-build
+ WORKDIR /app/frontend
+ COPY frontend/package*.json ./
+ RUN npm install
+ COPY frontend/ ./
+ RUN npm run build
+
+ # Build backend
+ FROM python:3.12-slim as backend
+ WORKDIR /app
+
+ # Install dependencies including nginx
+ RUN apt-get update && apt-get install -y nginx \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY src/docker_requirements.txt /app/src/
+ RUN pip install --upgrade pip wheel
+ RUN pip install --cache-dir=~/.cache/pip --prefer-binary pyarrow pandas numpy scipy fsspec aiohttp tqdm --progress-bar off -v
+ RUN pip install --cache-dir=~/.cache/pip -r /app/src/docker_requirements.txt -v --prefer-binary && rm -rf ~/.cache/pip
+ COPY src/ /app/src/
+
+ # Copy Nginx config (adjust path if needed)
+ COPY nginx.conf /etc/nginx/nginx.conf
+
+ COPY --from=frontend-build /app/frontend /app/frontend
+
+ # Create non-root user
+ RUN useradd -m -u 1000 user
+
+ # Create and configure cache directory
+ RUN mkdir -p /app/.cache && \
+     chown -R user:user /app
+
+ # Environment variables
+ ENV HF_HOME=/app/.cache \
+     HF_DATASETS_CACHE=/app/.cache \
+     INTERNAL_API_PORT=7861 \
+     PORT=7860
+
+ WORKDIR /app
+ COPY nginx.conf /etc/nginx/nginx.conf
+ COPY start.sh /start.sh
+ RUN chmod +x /start.sh
+ RUN chown -R user:user /var/lib/nginx
+
+ RUN apt-get update && apt-get install -y nginx \
+     && groupadd -r nginx && useradd -r -g nginx nginx
+
+ # Give user nginx write permissions
+ RUN mkdir -p /var/lib/nginx && chown -R user:user /var/lib/nginx
+ RUN mkdir -p /var/log/nginx && chown -R user:user /var/log/nginx
+ RUN mkdir -p /app/logs && chown -R user:user /app/logs
+ RUN mkdir -p /run && touch /run/nginx.pid && chown -R user:user /run
+
+ RUN apt-get update && apt-get install -y \
+     curl \
+     netcat-openbsd \
+     && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+     && apt-get install -y nodejs \
+     && rm -rf /var/lib/apt/lists/*
+
+
+ # Note: HF_TOKEN should be provided at runtime, not build time
+ USER user
+ EXPOSE 7860
+
+ # Start both servers with wait-for
+ CMD ["sh", "/start.sh"]
frontend/README.md ADDED
@@ -0,0 +1,38 @@
+ This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://github.com/vercel/next.js/tree/canary/packages/create-next-app).
+
+ ## Getting Started
+
+ First, run the development server:
+
+ ```bash
+ npm install
+
+ npm run dev
+ # or
+ yarn dev
+ # or
+ pnpm dev
+ # or
+ bun dev
+ ```
+
+ Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
+
+ You can start editing the page by modifying `app/page.js`. The page auto-updates as you edit the file.
+
+ This project uses [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) to automatically optimize and load [Geist](https://vercel.com/font), a new font family for Vercel.
+
+ ## Learn More
+
+ To learn more about Next.js, take a look at the following resources:
+
+ - [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
+ - [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
+
+ You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
+
+ ## Deploy on Vercel
+
+ The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
+
+ Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.
frontend/cole.pdf ADDED
The diff for this file is too large to render.
frontend/eslint.config.mjs ADDED
@@ -0,0 +1,14 @@
+ import { dirname } from "path";
+ import { fileURLToPath } from "url";
+ import { FlatCompat } from "@eslint/eslintrc";
+
+ const __filename = fileURLToPath(import.meta.url);
+ const __dirname = dirname(__filename);
+
+ const compat = new FlatCompat({
+   baseDirectory: __dirname,
+ });
+
+ const eslintConfig = [...compat.extends("next/core-web-vitals")];
+
+ export default eslintConfig;
frontend/jsconfig.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "compilerOptions": {
+     "paths": {
+       "@/*": ["./src/*"]
+     }
+   }
+ }
frontend/next.config.mjs ADDED
@@ -0,0 +1,4 @@
+ /** @type {import('next').NextConfig} */
+ const nextConfig = {};
+
+ export default nextConfig;
frontend/package-lock.json ADDED
The diff for this file is too large to render.
frontend/postcss.config.mjs ADDED
@@ -0,0 +1,5 @@
+ const config = {
+   plugins: ["@tailwindcss/postcss"],
+ };
+
+ export default config;
frontend/src/app/FAQ/page.js ADDED
@@ -0,0 +1,45 @@
+ 'use client';
+
+ import '../i18n';
+ import { useState } from 'react';
+ import { useTranslation } from 'react-i18next';
+
+ export default function FAQ() {
+   const { t } = useTranslation();
+   const faqs = t('faqs', { returnObjects: true });
+   const [openIndex, setOpenIndex] = useState(null);
+
+   const toggle = (index) => {
+     setOpenIndex(openIndex === index ? null : index);
+   };
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('faq_title')}
+       </h2>
+
+       <div className="space-y-4">
+         {faqs.map((faq, i) => (
+           <div
+             key={i}
+             className="p-6 bg-white border border-gray-200 rounded-lg shadow-sm transition"
+           >
+             <button
+               className="w-full text-left text-xl font-semibold text-gray-800 flex justify-between items-center"
+               onClick={() => toggle(i)}
+             >
+               <span>{`${i + 1}. ${faq.question}`}</span>
+               <span className="text-2xl text-gray-500">
+                 {openIndex === i ? '▴' : '▾'}
+               </span>
+             </button>
+             {openIndex === i && (
+               <p className="mt-4 text-gray-600 text-sm">{faq.answer}</p>
+             )}
+           </div>
+         ))}
+       </div>
+     </div>
+   );
+ }
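A note on the `returnObjects` option used in the FAQ page above: `t()` normally returns a string, so fetching the structured `faqs` array from `translation.json` needs this flag. A minimal sketch, assuming the `i18n` instance configured later in this commit (`src/app/i18n.js`):

```js
// With returnObjects, i18next hands back the raw array from translation.json
// instead of coercing the value to a string.
import i18n from './i18n';

const faqs = i18n.t('faqs', { returnObjects: true });
// e.g. [{ question: 'How can I evaluate my model?', answer: '...' }, ...]
faqs.forEach(({ question }) => console.log(question));
```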
frontend/src/app/benchmarks/page.js ADDED
@@ -0,0 +1,181 @@
+ 'use client';
+
+ import '../i18n';
+ import { useTranslation } from 'react-i18next';
+ import Link from 'next/link';
+
+ export default function Benchmarks() {
+   const { t } = useTranslation();
+
+   return (
+     <div suppressHydrationWarning>
+       <div className="max-w-3xl mx-auto px-2 py-3">
+         <p className="text-1.5xl text-left text-gray-800">
+           {t('benchmarksIntro')}
+         </p>
+       </div>
+       <div className="space-y-8">
+         <Benchmark
+           title={t('benchmark_alloCine_title')}
+           link="https://huggingface.co/datasets/CATIE-AQ/allocine_fr_prompt_sentiment_analysis"
+           description={t('benchmark_alloCine_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_daccord_title')}
+           link="https://huggingface.co/datasets/maximoss/daccord-contradictions"
+           description={t('benchmark_daccord_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_fquad_title')}
+           link="https://arxiv.org/pdf/2002.06071"
+           description={t('benchmark_fquad_description')}
+           metrics="F1 Score, Exact Match Ratio"
+         />
+         <Benchmark
+           title={t('benchmark_french_boolq_title')}
+           link="https://huggingface.co/datasets/manu/french_boolq"
+           description={t('benchmark_french_boolq_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_fracas_title')}
+           link="https://huggingface.co/datasets/maximoss/fracas"
+           description={t('benchmark_fracas_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_gqnli_title')}
+           link="https://huggingface.co/datasets/maximoss/gqnli-fr"
+           description={t('benchmark_gqnli_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_lingnli_title')}
+           link="https://huggingface.co/datasets/maximoss/lingnli-multi-mt"
+           description={t('benchmark_lingnli_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_mms_title')}
+           link="https://huggingface.co/datasets/Brand24/mms"
+           description={t('benchmark_mms_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_mnli_nineeleven_fr_mt_title')}
+           link="https://huggingface.co/datasets/maximoss/mnli-nineeleven-fr-mt"
+           description={t('benchmark_mnli_nineeleven_fr_mt_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_multiblimp_title')}
+           link="https://huggingface.co/datasets/jumelet/multiblimp"
+           description={t('benchmark_multiblimp_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_paws_title')}
+           link="https://huggingface.co/datasets/google-research-datasets/paws-x"
+           description={t('benchmark_paws_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_piaf_title')}
+           link="https://aclanthology.org/2020.lrec-1.673/"
+           description={t('benchmark_piaf_description')}
+           metrics="F1 Score, Exact Match Ratio"
+         />
+         <Benchmark
+           title={t('benchmark_qfrblimp_title')}
+           link="https://github.com/davebulaval/FrBLiMP"
+           description={t('benchmark_qfrblimp_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_qfrcola_title')}
+           link="https://github.com/davebulaval/qfrcola"
+           description={t('benchmark_qfrcola_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_qfrcore_title')}
+           link=""
+           description={t('benchmark_qfrcore_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_qfrcort_title')}
+           link=""
+           description={t('benchmark_qfrcort_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_rte3_french_title')}
+           link="https://huggingface.co/datasets/maximoss/rte3-french"
+           description={t('benchmark_rte3_french_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_sickfr_title')}
+           link="https://huggingface.co/datasets/Lajavaness/SICK-fr"
+           description={t('benchmark_sickfr_description')}
+           metrics="Pearson"
+         />
+         <Benchmark
+           title={t('benchmark_sts22_title')}
+           link="https://huggingface.co/datasets/mteb/sts22-crosslingual-sts/viewer/fr"
+           description={t('benchmark_sts22_description')}
+           metrics="Pearson"
+         />
+         <Benchmark
+           title={t('benchmark_wino_x_lm_title')}
+           link="https://huggingface.co/datasets/demelin/wino_x/viewer/lm_en_fr?views%5B%5D=lm_en_fr"
+           description={t('benchmark_wino_x_lm_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_wino_x_mt_title')}
+           link="https://huggingface.co/datasets/demelin/wino_x/viewer/mt_en_fr"
+           description={t('benchmark_wino_x_mt_description')}
+           metrics="Accuracy"
+         />
+         <Benchmark
+           title={t('benchmark_wsd_title')}
+           link="https://huggingface.co/datasets/GETALP/flue"
+           description={t('benchmark_wsd_description')}
+           metrics="Exact Match Ratio"
+         />
+         <Benchmark
+           title={t('benchmark_xnli_title')}
+           link="https://github.com/facebookresearch/XNLI"
+           description={t('benchmark_xnli_description')}
+           metrics="Accuracy"
+         />
+       </div>
+     </div>
+   );
+ }
+
+ function Benchmark({ title, description, metrics, link }) {
+   const { t } = useTranslation();
+
+   return (
+     <div className="p-6 bg-white border border-gray-200 rounded-lg shadow-sm">
+       <h3 className="text-xl font-semibold text-blue-700 mb-2 border-b-2 border-blue-500 inline-block">
+         {link ? (
+           <Link href={link} className="hover:underline">
+             {title}
+           </Link>
+         ) : (
+           title
+         )}
+       </h3>
+       <p className="text-gray-700 mb-2">{description}</p>
+       <p className="text-sm text-gray-500">
+         <span className="font-medium">{t('metrics')}</span> {metrics}
+       </p>
+     </div>
+   );
+ }
frontend/src/app/components/BigBlueButton.js ADDED
@@ -0,0 +1,17 @@
+
+
+ export default function BigBlueButton({ children, onClick, disabled }) {
+   return (
+     <button
+       onClick={onClick}
+       disabled={disabled}
+       className={`px-4 py-2 text-white text-base font-medium rounded-md shadow-sm focus:outline-none focus:ring-2 ${
+         disabled
+           ? "bg-gray-400 cursor-not-allowed"
+           : "bg-blue-500 hover:bg-blue-600 focus:ring-blue-300"
+       }`}
+     >
+       {children}
+     </button>
+   );
+ }
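One behavior to keep in mind: `BigBlueButton` only accepts `children`, `onClick`, and `disabled`, so a `className` passed by a caller (as `SubmitForm` does later in this commit) is silently dropped. A hypothetical usage sketch:

```jsx
import BigBlueButton from './BigBlueButton';

export function SaveButton({ onSave }) {
  // The className prop below has no effect; the component does not forward it.
  return (
    <BigBlueButton onClick={onSave} className="mt-4">
      Save
    </BigBlueButton>
  );
}
```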
frontend/src/app/components/ClientHeader.js ADDED
@@ -0,0 +1,17 @@
+ 'use client';
+
+ import '../i18n';
+ import { useTranslation } from 'react-i18next';
+ import Taskbar from './taskbar';
+ import { LanguageSwitcher } from './LanguageSwitcher';
+
+ export default function ClientHeader() {
+   useTranslation();
+
+   return (
+     <header className="flex items-center justify-between px-4 py-3 shadow">
+       <Taskbar />
+       <LanguageSwitcher />
+     </header>
+   );
+ }
frontend/src/app/components/CodeBlock.js ADDED
@@ -0,0 +1,10 @@
+ export default function CodeBlock({ children }) {
+   return (
+     <pre className="bg-gray-100 p-4 rounded-md overflow-x-auto text-sm text-gray-800 mt-4">
+       <code className="font-mono">
+         {children}
+       </code>
+     </pre>
+
+   );
+ }
frontend/src/app/components/ErrorMessage.js ADDED
@@ -0,0 +1,14 @@
+ export default function ErrorMessage({ children, condition }) {
+   return (
+     <div className="pt-2">
+       <div className="pt-2 space-y-2">
+         {condition && (
+           <div className="text-red-600 text-sm font-medium">
+             {children}
+           </div>
+         )}
+       </div>
+     </div>
+   );
+
+ }
frontend/src/app/components/LanguageSwitcher.js ADDED
@@ -0,0 +1,22 @@
+ 'use client';
+
+ import { useTranslation } from 'react-i18next';
+
+ export function LanguageSwitcher() {
+   const { i18n } = useTranslation();
+
+   const changeLanguage = (lng) => {
+     i18n.changeLanguage(lng);
+   };
+
+   return (
+     <div className="flex space-x-2">
+       <button onClick={() => changeLanguage('en')} className="px-2 py-1 rounded-lg border">
+         EN
+       </button>
+       <button onClick={() => changeLanguage('fr')} className="px-2 py-1 rounded-lg border">
+         FR
+       </button>
+     </div>
+   );
+ }
frontend/src/app/components/Modal.js ADDED
@@ -0,0 +1,20 @@
+ "use client";
+ import BigBlueButton from "./BigBlueButton";
+
+ export default function Modal({ children, onClose }) {
+   return (
+     <div className="fixed inset-0 bg-gray-600 bg-opacity-25 overflow-y-auto h-full w-full flex items-center justify-center z-50"
+       style={{ backgroundColor: 'rgba(75, 85, 99, 0.55)' }}>
+       <div className="p-8 border w-96 shadow-lg rounded-md bg-white">
+         <div className="text-center text-black">
+           {children}
+           <div className="flex justify-center mt-4">
+             <BigBlueButton onClick={onClose}>
+               Close
+             </BigBlueButton>
+           </div>
+         </div>
+       </div>
+     </div>
+   );
+ }
frontend/src/app/components/ModalManager.js ADDED
@@ -0,0 +1,26 @@
+ 'use client';
+
+ import { useSearchParams, useRouter } from 'next/navigation';
+ import Modal from './Modal';
+ import SubmitForm from './SubmitForm';
+
+ export default function ModalManager() {
+   const searchParams = useSearchParams();
+   const submitModal = searchParams.get("show") === "submit";
+   const router = useRouter();
+
+   const handleClose = () => {
+     const newUrl = window.location.pathname;
+     router.push(newUrl);
+   };
+
+   return (
+     <>
+       {submitModal && (
+         <Modal onClose={handleClose}>
+           <SubmitForm />
+         </Modal>
+       )}
+     </>
+   );
+ }
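`ModalManager` makes the submission form URL-driven: rendering any page with `?show=submit` mounts the modal, and closing it navigates back to the bare pathname. A sketch of a link that opens it (the taskbar later in this commit builds the same href from `usePathname()`):

```jsx
import Link from 'next/link';

// Navigating to a page with ?show=submit causes ModalManager to mount the form.
export function OpenSubmitModalLink() {
  return <Link href="/leaderboard?show=submit">Submit your results</Link>;
}
```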
frontend/src/app/components/ModelDetailsModal.js ADDED
@@ -0,0 +1,44 @@
+ "use client";
+ import Modal from "./Modal";
+
+ // Utility function to clean up the benchmark name
+ const getCleanBenchmarkName = (name) => {
+   const parts = name.split("|");
+   if (parts.length >= 2) return parts[1];
+   return name;
+ };
+
+ export default function ModelDetailsModal({ entry, onClose }) {
+   const modelName = entry.display_name || entry.name?.replace(".json", "") || "Unknown Model";
+
+   return (
+     <Modal onClose={onClose}>
+       <div className="space-y-4 max-w-2xl mx-auto">
+         {/* Header with the model name */}
+         <div className="text-center sticky top-0 bg-white z-10 pb-2">
+           <h3 className="text-2xl font-bold text-blue-700">{modelName}</h3>
+           <p className="text-sm text-gray-500">Model Details</p>
+         </div>
+
+         {/* Benchmark results */}
+         <div className="space-y-4">
+           {Object.entries(entry.results || {}).map(([benchmark, metrics]) => (
+             <div key={benchmark} className="p-4 border rounded-md bg-white shadow-md">
+               <h4 className="text-lg font-medium text-blue-600 mb-2">
+                 📊 {getCleanBenchmarkName(benchmark)}
+               </h4>
+               <ul className="ml-4 text-sm text-gray-700 list-disc">
+                 {Object.entries(metrics).map(([metric, value]) => (
+                   <li key={metric}>
+                     <strong>{metric}</strong>:{" "}
+                     {typeof value === "number" ? value.toFixed(4) : value}
+                   </li>
+                 ))}
+               </ul>
+             </div>
+           ))}
+         </div>
+       </div>
+     </Modal>
+   );
+ }
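`getCleanBenchmarkName` assumes benchmark keys may carry a `|`-separated prefix and keeps the second segment; keys without a `|` pass through unchanged. A small illustration (the key format shown is an assumption, not part of this commit):

```js
getCleanBenchmarkName('classification|allocine'); // -> 'allocine'
getCleanBenchmarkName('allocine');                // -> 'allocine' (no '|', returned unchanged)
```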
frontend/src/app/components/SubmitForm.js ADDED
@@ -0,0 +1,177 @@
+ 'use client';
+
+ import { useState } from "react";
+ import { useRouter } from "next/navigation";
+ import ErrorMessage from "./ErrorMessage";
+ import { BACKEND_ADDRESS } from "@/app/resources/ResourcesPaths";
+ import { Trans } from 'react-i18next';
+ import BigBlueButton from "./BigBlueButton";
+ import { useTranslation } from 'react-i18next';
+
+ export default function SubmitForm() {
+   const { t } = useTranslation();
+   const router = useRouter();
+
+   const [requiredVisible, setRequiredVisible] = useState(false);
+   const [email, setEmail] = useState('');
+   const [displayName, setDisplayName] = useState('');
+   const [file, setFile] = useState(null);
+   const [isSubmitting, setIsSubmitting] = useState(false);
+   const [submitStatus, setSubmitStatus] = useState(null); // 'success' | 'error'
+   const [errorMessage, setErrorMessage] = useState('');
+   const [submissionId, setSubmissionId] = useState(null);
+
+   const handleFileChange = (e) => {
+     setFile(e.target.files[0]);
+   };
+
+   const submitResults = async () => {
+     if (!email || !displayName || !file) {
+       setRequiredVisible(true);
+       return;
+     }
+     if (!file.name.toLowerCase().endsWith('.zip')) {
+       alert(t('submit_zipAlert'));
+       return;
+     }
+
+     setRequiredVisible(false);
+     setIsSubmitting(true);
+
+     const formData = new FormData();
+     formData.append('email', email);
+     formData.append('display_name', displayName);
+     formData.append('predictions_zip', file);
+
+     try {
+       const res = await fetch(`${BACKEND_ADDRESS}/submit`, {
+         method: "POST",
+         body: formData,
+       });
+       if (!res.ok) {
+         const err = await res.json().catch(() => null);
+         throw new Error(err?.detail || `HTTP ${res.status}`);
+       }
+       const json = await res.json();
+       const id = json.submission_id;
+       setSubmissionId(id);
+       localStorage.setItem('last_result_file', `${id}.json`);
+       localStorage.setItem('just_submitted', 'true');
+       setSubmitStatus('success');
+     } catch (err) {
+       setErrorMessage(err.message);
+       setSubmitStatus('error');
+     } finally {
+       setIsSubmitting(false);
+     }
+   };
+
+   const renderModal = () => {
+     if (submitStatus === 'success') {
+       return (
+         <div className="fixed inset-0 flex items-center justify-center bg-black bg-opacity-50">
+           <div className="bg-white p-6 rounded-2xl shadow-lg max-w-sm text-center">
+             <h3 className="text-xl font-semibold text-green-600">
+               {t('submit_successTitle')}
+             </h3>
+             <p className="mt-2">{t('submit_successMessage')}</p>
+             <BigBlueButton
+               className="mt-4 px-4 py-2 rounded-full shadow hover:shadow-md"
+               onClick={() => router.push(`/results/${submissionId}`)}
+             >
+               {t('submit_checkResults')}
+             </BigBlueButton>
+           </div>
+         </div>
+       );
+     }
+     if (submitStatus === 'error') {
+       return (
+         <div className="fixed inset-0 flex items-center justify-center bg-black bg-opacity-50">
+           <div className="bg-white p-6 rounded-2xl shadow-lg max-w-sm text-center">
+             <h3 className="text-xl font-semibold text-red-600">
+               {t('submit_errorTitle')}
+             </h3>
+             <p className="mt-2">
+               <Trans i18nKey="submit_errorMessage" values={{ errorMessage }}>
+                 Submission error: {{ errorMessage }}
+               </Trans>
+             </p>
+             <button
+               className="mt-4 px-4 py-2 rounded-full shadow hover:shadow-md"
+               onClick={() => setSubmitStatus(null)}
+             >
+               {t('submit_closeButton')}
+             </button>
+           </div>
+         </div>
+       );
+     }
+     return null;
+   };
+
+   return (
+     <div className="relative">
+       <div className="space-y-6 bg-white rounded-xl shadow-md p-6 w-full max-w-xl mx-auto border border-gray-200">
+         <h2 className="text-2xl font-semibold text-gray-800 text-center">
+           {t('submit_formTitle')}
+         </h2>
+
+         <div className="space-y-2">
+           <label htmlFor="email" className="block text-sm font-medium text-gray-700">
+             {t('submit_labelEmail')}
+           </label>
+           <input
+             id="email"
+             type="email"
+             placeholder={t('submit_placeholderEmail')}
+             value={email}
+             onChange={(e) => setEmail(e.target.value)}
+             className="border border-gray-300 p-3 rounded-md w-full focus:ring-2 focus:ring-blue-500"
+           />
+         </div>
+
+         <div className="space-y-2">
+           <label htmlFor="displayname" className="block text-sm font-medium text-gray-700">
+             {t('submit_labelDisplayName')}
+           </label>
+           <input
+             id="displayname"
+             type="text"
+             placeholder={t('submit_placeholderDisplayName')}
+             value={displayName}
+             onChange={(e) => setDisplayName(e.target.value)}
+             className="border border-gray-300 p-3 rounded-md w-full focus:ring-2 focus:ring-blue-500"
+           />
+         </div>
+
+         <div className="space-y-2">
+           <label htmlFor="zipfile" className="block text-sm font-medium text-gray-700">
+             {t('submit_labelZip')}
+           </label>
+           <input
+             id="zipfile"
+             type="file"
+             accept=".zip"
+             onChange={handleFileChange}
+             className="border border-gray-300 p-3 rounded-md w-full focus:ring-2 focus:ring-blue-500"
+           />
+         </div>
+
+         <ErrorMessage condition={requiredVisible}>
+           ⚠️ Email, display name & ZIP are required.
+         </ErrorMessage>
+
+         <button
+           onClick={submitResults}
+           disabled={isSubmitting}
+           className="w-full bg-blue-600 text-white py-3 rounded-xl hover:bg-blue-700 mt-4"
+         >
+           {isSubmitting ? t('submit_submitting') : t('submit_button')}
+         </button>
+
+         {renderModal()}
+       </div>
+     </div>
+   );
+ }
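`submitResults` pins down the backend contract: a multipart POST to `${BACKEND_ADDRESS}/submit` carrying `email`, `display_name`, and `predictions_zip`, answered with JSON containing a `submission_id` (or a `detail` message on failure). A standalone sketch of the same call, with the backend URL as an assumption (the Dockerfile's `INTERNAL_API_PORT` suggests 7861):

```js
// Hypothetical test client for the /submit endpoint; the URL is assumed.
const BACKEND_ADDRESS = 'http://localhost:7861';

async function submitZip(email, displayName, zipFile) {
  const formData = new FormData();
  formData.append('email', email);
  formData.append('display_name', displayName);
  formData.append('predictions_zip', zipFile);

  const res = await fetch(`${BACKEND_ADDRESS}/submit`, { method: 'POST', body: formData });
  if (!res.ok) {
    const err = await res.json().catch(() => null);
    throw new Error(err?.detail || `HTTP ${res.status}`);
  }
  return (await res.json()).submission_id;
}
```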
frontend/src/app/components/UploadButton.js ADDED
@@ -0,0 +1,27 @@
+
+
+ import { useState } from "react";
+ import BigBlueButton from "./BigBlueButton";
+
+ export default function UploadButton({ children, uploaded }) {
+   const [file, setFile] = useState(null);
+
+   function handleFileChange(e) {
+     const selectedFile = e.target.files?.[0];
+     if (selectedFile) {
+       setFile(selectedFile);
+       uploaded(selectedFile);
+     }
+   }
+   return (
+     <div>
+       <label htmlFor="file_upload">{children}</label>
+       <input type="file" id="file_upload" accept=".zip" onChange={handleFileChange}
+         className="bg-gray-500 text-white text-base
+           font-medium rounded-md shadow-sm hover:bg-gray-400 focus:outline-none focus:ring-2 focus:ring-gray-300"></input>
+     </div>
+   );
+
+ }
+
+ const uploadFile = async () => {}
frontend/src/app/components/taskbar.js ADDED
@@ -0,0 +1,64 @@
+ 'use client';
+
+ import '../i18n';
+ import Link from 'next/link';
+ import { usePathname } from 'next/navigation';
+ import { FileText } from 'lucide-react';
+ import { useTranslation } from 'react-i18next';
+
+ export default function Taskbar() {
+   const { t } = useTranslation();
+   const pathname = usePathname();
+
+   const linkStyle = (path) =>
+     pathname === path
+       ? 'text-blue-500 font-semibold'
+       : 'text-gray-700 hover:text-blue-500';
+
+   return (
+     <nav className="w-full py-4 bg-none flex justify-between items-center mx-auto max-w-5xl">
+       <div className="flex items-center">
+         <Link href="/">
+           <span className="text-xl font-bold text-blue-600">{t('nav_home')}</span>
+         </Link>
+
+         <Link href="/papers" className="ml-2">
+           <FileText className="w-6 h-6 text-blue-600 hover:text-blue-500" />
+         </Link>
+       </div>
+
+       <div className="space-x-6">
+         <Link href="/guide" className={linkStyle('/guide')}>
+           {t('nav_guide')}
+         </Link>
+         <Link href="/FAQ" className={linkStyle('/FAQ')}>
+           {t('nav_faq')}
+         </Link>
+         <Link href="/contact" className={linkStyle('/contact')}>
+           {t('nav_contact')}
+         </Link>
+         <Link
+           href={`${pathname}?show=submit`}
+           className={linkStyle('/submit')}
+         >
+           {t('nav_submit')}
+         </Link>
+         <Link href="/benchmarks" className={linkStyle('/benchmarks')}>
+           {t('nav_tasks')}
+         </Link>
+         <Link href="/results" className={linkStyle('/results')}>
+           {t('nav_results')}
+         </Link>
+         <Link href="/leaderboard" className={linkStyle('/leaderboard')}>
+           {t('nav_leaderboard')}
+         </Link>
+         <Link
+           href="https://huggingface.co/datasets/graalul/COLE-public"
+           className={linkStyle('/hf')}
+         >
+           {t('nav_datasets')}
+         </Link>
+       </div>
+     </nav>
+   );
+ }
frontend/src/app/contact/page.js ADDED
@@ -0,0 +1,32 @@
+ 'use client';
+
+ import '../i18n';
+ import { useTranslation } from 'react-i18next';
+
+ export default function Contact() {
+   const { t } = useTranslation();
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('contact_title')}
+       </h2>
+
+       <p className="text-gray-700 mb-4 leading-relaxed">
+         {t('contact_paragraph')}
+       </p>
+
+       <div className="bg-gray-50 p-4 rounded-md border border-dashed border-blue-400">
+         <p className="text-sm text-gray-500 mb-2">
+           {t('contact_email_label')}
+         </p>
+         <a
+           href="mailto:david.beauchemin@ift.ulaval.ca"
+           className="text-blue-600 font-mono text-lg hover:underline"
+         >
+           david.beauchemin@ift.ulaval.ca
+         </a>
+       </div>
+     </div>
+   );
+ }
frontend/src/app/en/translation.json ADDED
@@ -0,0 +1,132 @@
+ {
+   "siteTitle": "COLE",
+   "welcome": "Welcome to COLE!",
+   "upload": "Upload",
+   "submit": "Submit",
+   "results": "Results",
+   "contact": "Contact",
+   "contactUs": "Contact us",
+   "guide": "Guide",
+   "faq": "FAQ",
+   "submitResults": "Submit your results",
+   "ourTasks": "Our tasks",
+   "ourDatasets": "Our datasets",
+   "leaderboard": "COLE Leaderboard",
+   "errorOccurred": "An error occurred",
+   "close": "Close",
+   "details": "Details",
+   "benchmarksIntro": "COLE consists of 23 tasks, each of which aims to test one or more facets of language understanding in machine learning. Each task is described in more detail below.",
+   "metrics": "Metric(s):",
+   "benchmark_alloCine_title": "Allo-ciné.ca",
+   "benchmark_alloCine_description": "Allo-ciné tests language understanding in sentiment classification using movie reviews that can be either positive or negative. The task consists of giving the correct sentiment for each review.",
+   "benchmark_lingnli_title": "LingNLI",
+   "benchmark_lingnli_description": "LingNLI is a Natural Language Inference corpus collected by putting a linguist 'in the loop' to dynamically introduce novel constraints during data collection, aiming to mitigate the systematic gaps and biases often found in crowdsourced datasets.",
+   "benchmark_daccord_title": "DACCORD",
+   "benchmark_daccord_description": "Predict whether the two sentences are compatible (0) or contradict each other (1).",
+   "benchmark_fquad_title": "FQuAD - French Question Answering Dataset",
+   "benchmark_fquad_description": "FQuAD is a set of question/answer pairs built from high-quality Wikipedia articles. The goal of this task is to accurately predict whether the answer to the question can be found in the provided article.",
+   "benchmark_french_boolq_title": "French BoolQ",
+   "benchmark_french_boolq_description": "Answer whether the context allows answering 'yes' to the question (1), or only 'no' or does not answer it (0).",
+   "benchmark_fracas_title": "FraCaS",
+   "benchmark_fracas_description": "Natural language inference task: predict the relation between two sentences (entailment, neutral, contradiction).",
+   "benchmark_gqnli_title": "GQNLI-Fr - The Generalized Quantifier NLI Challenge Dataset",
+   "benchmark_gqnli_description": "The dataset consists of carefully constructed premise-hypothesis pairs. Each hypothesis logically follows from the premise, contradicts it, or is neutral.",
+   "benchmark_mms_title": "MMS - Massive Multilingual Sentiment Corpora",
+   "benchmark_mms_description": "A massive multilingual sentiment analysis corpus in 27 languages.",
+   "benchmark_mnli_nineeleven_fr_mt_title": "MNLI-NineEleven-FR-MT",
+   "benchmark_mnli_nineeleven_fr_mt_description": "Predict the relation between two sentences (entailment, neutral, contradiction).",
+   "benchmark_paws_title": "PAWS: Paraphrase Adversaries from Word Scrambling",
+   "benchmark_paws_description": "This task aims to test paraphrase identification by giving two sentences and having the model determine whether they are equivalent in meaning.",
+   "benchmark_piaf_title": "PIAF - The French-Language Dataset of Questions-Answers",
+   "benchmark_piaf_description": "This task consists of pairs of questions and text answers, with information about where the truly relevant information appears in the answer.",
+   "benchmark_qfrblimp_title": "QFrBLiMP - Quebec-French Linguistic Minimal Pairs",
+   "benchmark_qfrblimp_description": "This task gives the model sentence pairs. The goal is to determine whether the sentences are semantically equivalent, even with slightly different syntax and words.",
+   "benchmark_qfrcola_title": "QFrCoLA - a Quebec-French Corpus of Linguistic Acceptability Judgments",
+   "benchmark_qfrcola_description": "QFrCoLA is a French dataset sourced from multiple linguistic sites such as académie-française.fr and vitrinelinguistique.com. It aims to test models’ ability to determine grammatical correctness. The answer is a binary label indicating if the sentence is correct or not.",
+   "benchmark_qfrcore_title": "QFRCoRE: Quebec-French Corpus of Regional Expressions",
+   "benchmark_qfrcore_description": "Match the Quebec expression with its definition from a list.",
+   "benchmark_qfrcort_title": "QFRCoRT: Quebec-French Corpus of Regional Terms",
+   "benchmark_qfrcort_description": "Match the Quebec term with its definition from a list.",
+   "benchmark_rte3_french_title": "RTE3-French",
+   "benchmark_rte3_french_description": "Predict the relation between two sentences (entailment, neutral, contradiction).",
+   "benchmark_sickfr_title": "Sick-FR - French Sentences Involving Compositional Knowledge",
+   "benchmark_sickfr_description": "This task also provides pairs of sentences annotated along two dimensions: relatedness (scored 1 to 5) and entailment (choices: entails, contradicts, neutral).",
+   "benchmark_sts22_title": "Sts22-Crosslingual - Multilingual News Article Similarity",
+   "benchmark_sts22_description": "This task evaluates whether pairs of news articles, written in different languages, cover the same story. It focuses on document-level similarity, where systems rate article pairs on a 4-point scale from most to least similar.",
+   "benchmark_wino_x_lm_title": "WiNo-X LM - Pronoun Resolution",
+   "benchmark_wino_x_lm_description": "Predict the correct referent (1 or 2) of a pronoun in a sentence by choosing between two candidates.",
+   "benchmark_wino_x_mt_title": "WiNo-X MT - Pronoun Resolution",
+   "benchmark_wino_x_mt_description": "Choose which of two French translations uses the correct pronoun (il/elle) based on the intended referent in the original English sentence.",
+   "benchmark_xnli_title": "XNLI - The Cross-Lingual NLI Corpus",
+   "benchmark_xnli_description": "This task consists of pairs of sentences where the goal is to determine the relation between the two: entailment, neutral, or contradiction.",
+   "benchmark_wsd_title": "WSD-Fr: Word Sense Disambiguation",
+   "benchmark_wsd_description": "WSD-Fr is a word sense disambiguation task where the model must identify the correct meaning of an ambiguous verb in context, as part of the FLUE benchmark.",
+   "benchmark_multiblimp_title": "MultiBLiMP-Fr - Multilingual Linguistic Minimal Pairs",
+   "benchmark_multiblimp_description": "A grammaticality judgment task using the French subset of the Multilingual Benchmark of Linguistic Minimal Pairs. Each instance is a minimal pair (one grammatical and one ungrammatical) differing by a single targeted feature. The model must select the grammatically correct sentence. This task probes fine-grained knowledge of French syntax, morphology, and agreement.",
+   "home_whatIsColleTitle": "What is COLE?",
+   "home_paragraph1": "COLE is a multidisciplinary French Natural Language Understanding benchmark (<1>NLU</1>). It takes inspiration from its predecessors <3>GLUE</3> and <5>SuperGLUE</5> to build a benchmark capable of evaluating models in the French language on multiple topics of language understanding. See <7>our paper</7> for more information.",
+   "home_paragraph2": "The COLE benchmark is built with multiple goals in mind. First, it aims to provide a solid and complete French alternative for benchmarking models on NLU tasks. Second, it provides the user with multiple datasets, all usable through HuggingFace’s libraries, to train or fine-tune models on specific tasks.",
+   "home_paragraph3": "We have made the choice to hide test labels to discourage cheating or overfitting on test data. To get results on your test data, you may send us your results as explained in <1>our guide</1>.",
+   "guide_title": "Using the COLE Benchmark",
+   "guide_section1_title": "Training and Testing",
+   "guide_section1_para1": "The COLE benchmark can be used to train and/or test models on multiple tasks. To train or fine-tune a model, you can fetch the train, validation and test data splits from our <0>Hugging Face public repository</0>. We recommend using Hugging Face’s libraries to simplify the process.",
+   "guide_section1_para2": "To test a model, you also need to fetch the data in the same way. Once done, your model should infer predictions for each line in the test split. Our repository includes benchmark evaluation scripts for each dataset. You only need to plug in your model's inference method using the HuggingFace Model interface. Our inference scripts are available on our <0>GitHub Repository</0>.",
+   "guide_section1_para3": "If you prefer to run inference separately, please ensure that the predictions are formatted correctly before submitting them for evaluation (see our \"Formatting the Dataset\" section).",
+   "guide_section2_title": "Formatting the Dataset",
+   "guide_section2_para1": "Before submitting your results, make sure your output is properly formatted so that our systems can process it. The expected format is a nested JSON dictionary as follows:",
+   "faq_title": "Frequently Asked Questions",
+   "faqs": [
+     {
+       "question": "How can I evaluate my model?",
+       "answer": "You can upload your model outputs in JSON format on the website. The system will automatically evaluate them, and you can view the results in the evaluation interface."
+     },
+     {
+       "question": "Is COLE multilingual?",
+       "answer": "No, COLE is available only in French. The benchmark is specifically designed to evaluate NLU models in the French language."
+     }
+   ],
+   "contact_title": "Contact us",
+   "contact_paragraph": "If you have any questions, feedback, or suggestions regarding the COLE benchmark, feel free to reach out to us. We are happy to help; please note that response times may vary.",
+   "contact_email_label": "Email us at:",
+
+   "submit_formTitle": "Submit Your Results",
+   "submit_labelEmail": "Your Email",
+   "submit_placeholderEmail": "you@example.com",
+   "submit_labelDisplayName": "Display Name",
+   "submit_placeholderDisplayName": "Leaderboard Name",
+   "submit_labelFile": "Predictions ZIP",
+   "submit_labelZip": "Select your results file",
+   "submit_requiredError": "⚠️ Email, display name & ZIP are required.",
+   "submit_zipAlert": "Please upload a ZIP (.zip) file.",
+   "submit_button": "Submit Your Results",
+   "submit_submitting": "Submitting...",
+   "submit_successTitle": "Success",
+   "submit_successMessage": "Your submission has been successfully sent!",
+   "submit_checkResults": "Check the results",
+   "submit_errorTitle": "Error ⚠️",
+   "submit_errorMessage": "Submission error: {{errorMessage}}",
+   "submit_closeButton": "Close",
+   "results_default_title": "No Results Yet",
+   "results_default_message": "Please submit a ZIP file to generate benchmark results.",
+   "results_loading": "⏳ Loading results...",
+   "results_page_title": "📊 Results for {{displayName}}",
+   "results_download": "Download JSON",
+   "results_no_results": "⚠️ No benchmark results found.",
+   "results_benchmark_label": "🧪 Benchmark: {{name}}",
+   "leaderboard_title": "Leaderboard",
+   "leaderboard_modelHeader": "Model Name",
+   "leaderboard_overallHeader": "Overall",
+   "leaderboard_avgScoreLabel": "(avg score)",
+   "leaderboard_notSpecified": "NS",
+   "leaderboard_modalTitle": "Results for {{name}}",
+   "leaderboard_closeButton": "Close",
+   "nav_home": "COLE",
+   "nav_guide": "Guide",
+   "nav_faq": "FAQ",
+   "nav_contact": "Contact us",
+   "nav_submit": "Submit your results",
+   "nav_tasks": "Our tasks",
+   "nav_results": "Results",
+   "nav_leaderboard": "COLE Leaderboard",
+   "nav_datasets": "Our datasets"
+ }
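Two conventions in these strings deserve a note: `{{displayName}}`-style placeholders are i18next interpolation slots filled at call time, and numbered tags such as `<1>…</1>` mark slots that `react-i18next`'s `Trans` component fills from a `components` array indexed by tag number (the guide page later in this commit does this with `<0>`). A minimal interpolation sketch:

```jsx
import { useTranslation } from 'react-i18next';

function ResultsTitle({ displayName }) {
  const { t } = useTranslation();
  // {{displayName}} in results_page_title is replaced at call time:
  // renders "📊 Results for my-model" when displayName is "my-model".
  return <h1>{t('results_page_title', { displayName })}</h1>;
}
```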
frontend/src/app/fr/translation.json ADDED
@@ -0,0 +1,135 @@
+ {
+   "siteTitle": "COLE",
+   "welcome": "Bienvenue sur COLE !",
+   "upload": "Téléverser",
+   "submit": "Soumettre",
+   "results": "Résultats",
+   "contact": "Contact",
+   "contactUs": "Nous contacter",
+   "guide": "Guide",
+   "faq": "FAQ",
+   "submitResults": "Soumettre vos résultats",
+   "ourTasks": "Nos tâches",
+   "ourDatasets": "Nos jeux de données",
+   "leaderboard": "Classement COLE",
+   "errorOccurred": "Une erreur est survenue",
+   "close": "Fermer",
+   "details": "Détails",
+   "benchmarksIntro": "COLE est constitué de 23 tâches, chacune visant à tester une ou plusieurs facettes de la compréhension du langage en apprentissage automatique. Ci-dessous, chaque tâche est décrite en détail.",
+   "metrics": "Métrique(s) :",
+   "benchmark_alloCine_title": "Allo-ciné.ca",
+   "benchmark_alloCine_description": "Allo-ciné teste la compréhension du langage dans la classification des sentiments en fournissant des critiques de films pouvant être positives ou négatives. La tâche consiste à donner le sentiment correct pour chaque critique.",
+   "benchmark_lingnli_title": "LingNLI",
+   "benchmark_lingnli_description": "LingNLI est un corpus d'inférence en langage naturel collecté en faisant appel à un linguiste afin d'introduire de manière dynamique de nouvelles contraintes pendant la collecte des données, dans le but d'atténuer les lacunes et les biais systématiques souvent présents dans les ensembles de données issus du crowdsourcing.",
+   "benchmark_daccord_title": "DACCORD",
+   "benchmark_daccord_description": "Prédisez si les deux phrases sont compatibles (0) ou se contredisent (1).",
+   "benchmark_fquad_title": "FQuAD - Corpus de questions-réponses français",
+   "benchmark_fquad_description": "FQuAD est un ensemble de paires question/réponse construit à partir d’articles Wikipédia de haute qualité. L’objectif est de prédire correctement si la réponse à la question se trouve réellement dans l’article fourni.",
+   "benchmark_french_boolq_title": "French BoolQ",
+   "benchmark_french_boolq_description": "Répondez si le contexte permet de répondre « oui » à la question (1) ou « non »/ne répond pas (0).",
+   "benchmark_fracas_title": "FraCaS",
+   "benchmark_fracas_description": "Tâche d'inférence en langage naturel : prédire la relation entre deux phrases (implication, neutralité, contradiction).",
+   "benchmark_gqnli_title": "GQNLI-Fr - Jeu de données Generalized Quantifier NLI Challenge",
+   "benchmark_gqnli_description": "Le jeu se compose de paires prémisse-hypothèse soigneusement construites. Chaque hypothèse découle logiquement de la prémisse, la contredit ou est neutre.",
+   "benchmark_mms_title": "MMS - Massive Multilingual Sentiment Corpora",
+   "benchmark_mms_description": "Un corpus multilingue massif d'analyse des sentiments en 27 langues.",
+   "benchmark_mnli_nineeleven_fr_mt_title": "MNLI-NineEleven-FR-MT",
+   "benchmark_mnli_nineeleven_fr_mt_description": "Prédisez la relation entre deux phrases (implication, neutre, contradiction).",
+   "benchmark_multiblimp_title": "MultiBLiMP-Fr - Paires minimales linguistiques en français",
+   "benchmark_multiblimp_description": "Une tâche de jugement de grammaticalité utilisant le sous-ensemble français du Multilingual Benchmark of Linguistic Minimal Pairs. Chaque instance est une paire minimale — l’une grammaticale et l’autre agrammaticale — ne différant que par une seule caractéristique ciblée. Le modèle doit sélectionner la phrase grammaticalement correcte. Cette tâche évalue les connaissances fines de la syntaxe, de la morphologie et des accords en français.",
+   "benchmark_paws_title": "PAWS : Paraphrase Adversaries from Word Scrambling",
+   "benchmark_paws_description": "Cette tâche vise à tester l’identification de paraphrases en donnant deux phrases et en demandant au modèle de définir si ces phrases sont équivalentes en sens ou non.",
+   "benchmark_piaf_title": "PIAF - Jeu de questions-réponses en français",
+   "benchmark_piaf_description": "Cette tâche consiste en paires de questions et de réponses textuelles avec l’indication de l’emplacement de l’information réellement pertinente dans la réponse.",
+   "benchmark_qfrblimp_title": "QFrBLiMP - Paires minimales linguistiques québécoises",
+   "benchmark_qfrblimp_description": "Cette tâche présente au modèle des paires de phrases. Le but est de déterminer si les phrases sont sémantiquement équivalentes, même avec une syntaxe et des mots légèrement différents.",
+   "benchmark_qfrcola_title": "QFrCoLA - Corpus québécois de jugements d’acceptabilité linguistique",
+   "benchmark_qfrcola_description": "QFrCoLA est un jeu de données français issu de plusieurs sites linguistiques tels qu’académie-française.fr et vitrinelinguistique.com. Il vise à tester la capacité des modèles à déterminer la correction grammaticale. La réponse est un label binaire indiquant si la phrase est correcte ou non.",
+   "benchmark_qfrcore_title": "QFRCoRE: Quebec-French Corpus of Regional Expressions",
+   "benchmark_qfrcore_description": "Associez l'expression québécoise à sa définition parmi une liste proposée.",
+   "benchmark_qfrcort_title": "QFRCoRT: Quebec-French Corpus of Regional Terms",
+   "benchmark_qfrcort_description": "Associez le terme québécois à sa définition parmi une liste proposée.",
+   "benchmark_rte3_french_title": "RTE3-Français",
+   "benchmark_rte3_french_description": "Prédisez la relation entre deux phrases (implication, neutre, contradiction).",
+   "benchmark_sickfr_title": "Sick-FR - Phrases françaises impliquant des connaissances compositionnelles",
+   "benchmark_sickfr_description": "Cette tâche propose des paires de phrases annotées selon deux dimensions : la similarité (1 à 5) et l’inférence (implique, contredit ou neutre).",
+   "benchmark_sts22_title": "Sts22-Crosslingual - Similarité d’articles d’actualités multilingues",
+   "benchmark_sts22_description": "Cette tâche évalue si des paires d’articles d’actualités, écrits dans différentes langues, couvrent la même histoire. Elle se concentre sur la similarité au niveau du document, où les systèmes notent les paires sur une échelle de 4 points, du plus similaire au moins similaire.",
+   "benchmark_wino_x_lm_title": "WiNo-X LM - Résolution de pronom",
+   "benchmark_wino_x_lm_description": "Prédire le bon référent (1 ou 2) d’un pronom dans une phrase en choisissant parmi deux candidats.",
+   "benchmark_wino_x_mt_title": "WiNo-X MT - Résolution de pronom",
+   "benchmark_wino_x_mt_description": "Choisir laquelle de deux traductions françaises utilise le bon pronom (il/elle) selon le référent correct de la phrase anglaise.",
+   "benchmark_xnli_title": "XNLI - Corpus NLI multilingue",
+   "benchmark_xnli_description": "Cette tâche consiste en paires de phrases où l’objectif est de déterminer la relation entre les deux : implication, neutre ou contradiction.",
+   "benchmark_wsd_title": "WSD-Fr : Désambiguïsation lexicale",
+   "benchmark_wsd_description": "WSD-Fr est une tâche de désambiguïsation lexicale dans laquelle le modèle doit identifier le sens correct d’un verbe ambigu en contexte, dans le cadre du benchmark FLUE.",
+   "home_whatIsColleTitle": "Qu’est-ce que COLE ?",
+   "home_paragraph1": "COLE est un benchmark multidisciplinaire de compréhension du langage naturel en français (<1>NLU</1>). Il s’inspire de ses prédécesseurs <3>GLUE</3> et <5>SuperGLUE</5> pour construire un benchmark capable d’évaluer les modèles en langue française sur plusieurs facettes de la compréhension du langage. Consultez <7>notre article</7> pour plus d’informations.",
+   "home_paragraph2": "Le benchmark COLE poursuit plusieurs objectifs : d’abord fournir une alternative solide et complète en français pour évaluer les modèles sur des tâches NLU, puis offrir à l’utilisateur plusieurs jeux de données, tous utilisables via les bibliothèques HuggingFace, pour entraîner ou affiner des modèles sur des tâches spécifiques.",
+   "home_paragraph3": "Nous avons choisi de masquer les étiquettes de test pour décourager la triche ou le sur-apprentissage sur les données de test. Pour obtenir des résultats sur vos données de test, vous pouvez nous envoyer vos résultats comme expliqué dans <1>notre guide</1>.",
+   "guide_title": "Utilisation du benchmark COLE",
+   "guide_section1_title": "Entraînement et tests",
+   "guide_section1_para1": "Le benchmark COLE peut être utilisé pour entraîner et/ou tester des modèles sur plusieurs tâches. Pour entraîner ou affiner un modèle, vous pouvez récupérer les jeux de données train, validation et test depuis notre <0>dépôt public Hugging Face</0>. Nous recommandons d’utiliser les bibliothèques Hugging Face pour simplifier le processus.",
+   "guide_section1_para2": "Pour tester un modèle, vous devez également récupérer les données de la même façon. Une fois fait, votre modèle doit inférer les prédictions pour chaque ligne de la partition de test. Notre dépôt inclut des scripts d’évaluation pour chaque dataset. Il vous suffit de connecter la méthode d’inférence de votre modèle via l’interface HuggingFace. Nos scripts d’inférence sont disponibles sur notre <0>dépôt GitHub</0>.",
+   "guide_section1_para3": "Si vous préférez lancer l’inférence séparément, assurez-vous que les prédictions sont correctement formatées avant de les soumettre pour évaluation (voir notre section « Formatting the Dataset »).",
+   "guide_section2_title": "Formatage du jeu de données",
+   "guide_section2_para1": "Avant de soumettre vos résultats, assurez-vous que votre sortie est correctement formatée afin que nos systèmes puissent la traiter. Le format attendu est un dictionnaire JSON imbriqué comme suit :",
+   "faq_title": "Foire aux questions",
+   "faqs": [
+     {
+       "question": "Comment évaluer mon modèle ?",
+       "answer": "Vous pouvez téléverser les sorties de votre modèle au format JSON sur le site. Le système les évaluera automatiquement, et vous pourrez consulter les résultats dans l’interface d’évaluation."
+     },
+     {
+       "question": "COLE est-il multilingue ?",
+       "answer": "Non, COLE est disponible uniquement en français. Le benchmark est spécifiquement conçu pour évaluer les modèles en compréhension de la langue française (NLU)."
+     }
+   ],
+   "contact_title": "Nous contacter",
+   "contact_paragraph": "Si vous avez des questions, des commentaires ou des suggestions concernant le benchmark COLE, n’hésitez pas à nous contacter. Nous serons ravis de vous aider ; veuillez noter que les délais de réponse peuvent varier.",
+   "contact_email_label": "Envoyez-nous un email à :",
+   "submit_formTitle": "Soumettre vos résultats",
+   "submit_labelEmail": "Votre email",
+   "submit_placeholderEmail": "vous@exemple.com",
+   "submit_labelDisplayName": "Nom affiché",
+   "submit_placeholderDisplayName": "Nom au classement",
+   "submit_labelFile": "Fichier ZIP de prédictions",
+   "submit_labelZip": "Sélectionnez votre fichier de résultats",
+   "submit_requiredError": "⚠️ Email, nom affiché et ZIP sont requis.",
+   "submit_zipAlert": "Veuillez téléverser un fichier ZIP (.zip).",
+   "submit_button": "Soumettre vos résultats",
+   "submit_submitting": "Envoi en cours...",
+   "submit_successTitle": "Succès",
+   "submit_successMessage": "Votre soumission a été envoyée avec succès !",
+   "submit_checkResults": "Voir les résultats",
+   "submit_errorTitle": "Erreur ⚠️",
+   "submit_errorMessage": "Erreur de soumission : {{errorMessage}}",
+   "submit_closeButton": "Fermer",
+   "results_default_title": "Pas encore de résultats",
+   "results_default_message": "Veuillez soumettre un fichier ZIP pour générer les résultats du benchmark.",
+   "results_loading": "⏳ Chargement des résultats...",
+   "results_page_title": "📊 Résultats pour {{displayName}}",
+   "results_download": "Télécharger le JSON",
+   "results_no_results": "⚠️ Aucun résultat de benchmark trouvé.",
+   "results_benchmark_label": "🧪 Benchmark : {{name}}",
+   "leaderboard_title": "Classement",
+   "leaderboard_modelHeader": "Nom du modèle",
+   "leaderboard_overallHeader": "Global",
+   "leaderboard_avgScoreLabel": "(score moyen)",
+   "leaderboard_notSpecified": "NS",
+   "leaderboard_modalTitle": "Résultats pour {{name}}",
+   "leaderboard_closeButton": "Fermer",
+   "nav_home": "COLE",
+   "nav_guide": "Guide",
+   "nav_faq": "FAQ",
+   "nav_contact": "Nous contacter",
+   "nav_submit": "Soumettre vos résultats",
+   "nav_tasks": "Nos tâches",
+   "nav_results": "Résultats",
+   "nav_leaderboard": "Classement COLE",
+   "nav_datasets": "Nos données"
+ }
+
+
+
+
frontend/src/app/globals.css ADDED
@@ -0,0 +1,26 @@
+ @import "tailwindcss";
+
+ :root {
+   --background: #ffffff;
+   --foreground: #6526ae;
+ }
+
+ @theme inline {
+   --color-background: var(--background);
+   --color-foreground: var(--foreground);
+   --font-sans: var(--font-geist-sans);
+   --font-mono: var(--font-geist-mono);
+ }
+
+ @media (prefers-color-scheme: dark) {
+   :root {
+     --background: #ffffff;
+     --foreground: #ededed;
+   }
+ }
+
+ body {
+   background: var(--background);
+   color: var(--foreground);
+   font-family: Arial, Helvetica, sans-serif;
+ }
frontend/src/app/guide/page.js ADDED
@@ -0,0 +1,76 @@
+ 'use client';
+
+ import '../i18n';
+ import {useTranslation, Trans} from 'react-i18next';
+ import CodeBlock from '../components/CodeBlock';
+
+ export default function Guide() {
+   const {t} = useTranslation();
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('guide_title')}
+       </h2>
+
+       <div className="space-y-8">
+         {/* SECTION TRAINING & TESTING */}
+         <div className="p-6 bg-white border border-gray-200 rounded-lg shadow-sm hover:shadow transition">
+           <h3 className="text-2xl font-semibold text-gray-900 mb-4 border-l-4 border-blue-600 pl-4">
+             {t('guide_section1_title')}
+           </h3>
+
+           <p className="text-gray-700">
+             <Trans
+               i18nKey="guide_section1_para1"
+               components={[
+                 <a
+                   key="hf-link"
+                   href="https://huggingface.co/datasets/graalul/COLE-public"
+                   target="_blank"
+                   rel="noopener noreferrer"
+                   className="text-blue-600 underline hover:text-blue-800"
+                 />,
+               ]}
+             />
+           </p>
+
+           <p className="text-gray-700 mt-4">
+             <Trans
+               i18nKey="guide_section1_para2"
+               components={[
+                 <a
+                   key="github-ref"
+                   href="https://github.com/GRAAL-Research/COLE"
+                   target="_blank"
+                   rel="noopener noreferrer"
+                   className="text-blue-600 underline hover:text-blue-800"
+                 >
+                   GitHub Repository
+                 </a>,
+               ]}
+             />
+           </p>
+
+           <p className="text-gray-700 mt-4">
+             <Trans i18nKey="guide_section1_para3" />
+           </p>
+
+           {/* SECTION FORMATTING */}
+           <h3 className="text-2xl font-semibold text-gray-900 mb-4 border-l-4 border-blue-600 pl-4">
+             {t('guide_section2_title')}
+           </h3>
+           <p className="text-gray-700 mb-4">
+             {t('guide_section2_para1')}
+           </p>
+
+           <CodeBlock>{`{
+   "model_name": "a_model_name",
+   "model_url": "a_model_url",
+   "tasks": [
+     {
+       "qfrcola": { "predictions": [1,1,1,1,1] }
+     },
+     {
+       "allocine": { "predictions": [1,1,1,1,1] }
+     }
+   ]
+ }`}</CodeBlock>
+         </div>
+       </div>
+     </div>
+   );
+ }
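The predictions.json format shown in the guide above is plain JSON, so any evaluation script can produce it. A minimal Python sketch that builds a submission in that format and packs it into the ZIP the backend expects; the model name, URL, and prediction values are placeholders:

import json
import zipfile

# Placeholder predictions; in practice these come from your own evaluation run.
submission = {
    "model_name": "a_model_name",
    "model_url": "a_model_url",
    "tasks": [
        {"qfrcola": {"predictions": [1, 0, 1, 1, 0]}},
        {"allocine": {"predictions": [0, 1, 1, 0, 1]}},
    ],
}

# The backend reads a file named exactly "predictions.json" from the ZIP.
with zipfile.ZipFile("submission.zip", "w") as zf:
    zf.writestr("predictions.json", json.dumps(submission, ensure_ascii=False))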
frontend/src/app/i18n.js ADDED
@@ -0,0 +1,28 @@
+ import i18n from 'i18next';
+ import { initReactI18next } from 'react-i18next';
+ import LanguageDetector from 'i18next-browser-languagedetector';
+
+ import en from './en/translation.json';
+ import fr from './fr/translation.json';
+
+ i18n
+   .use(LanguageDetector)
+   .use(initReactI18next)
+   .init({
+     resources: {
+       en: { translation: en },
+       fr: { translation: fr },
+     },
+     lng: 'en',
+     fallbackLng: 'en',
+     interpolation: {
+       escapeValue: false,
+     },
+     detection: {
+       order: ['localStorage', 'navigator'],
+       caches: ['localStorage'],
+     },
+   });
+
+ export default i18n;
frontend/src/app/layout.js ADDED
@@ -0,0 +1,37 @@
+ import { Geist, Geist_Mono } from "next/font/google";
+ import "./globals.css";
+
+ import ClientHeader from "./components/ClientHeader";
+ import ModalManager from "./components/ModalManager";
+ import { Suspense } from "react";
+
+ const geistSans = Geist({
+   variable: "--font-geist-sans",
+   subsets: ["latin"],
+ });
+
+ const geistMono = Geist_Mono({
+   variable: "--font-geist-mono",
+   subsets: ["latin"],
+ });
+
+ export const metadata = {
+   title: "COLE NLU",
+   description: "COLE: An NLU benchmark",
+ };
+
+ export default function RootLayout({ children }) {
+   return (
+     <html lang="en">
+       <body className={`${geistSans.variable} ${geistMono.variable} antialiased`}>
+         <ClientHeader />
+         <main className="w-full flex justify-center px-4 pt-8">
+           <div className="w-full max-w-3xl">{children}</div>
+         </main>
+         <Suspense fallback={null}>
+           <ModalManager />
+         </Suspense>
+       </body>
+     </html>
+   );
+ }
frontend/src/app/leaderboard/page.js ADDED
@@ -0,0 +1,272 @@
+ 'use client';
+
+ import React, { useEffect, useState } from "react";
+ import {
+   normalizeBenchmarkName,
+   computeAverageScore,
+ } from "./util";
+ import { useTranslation } from "react-i18next";
+ import { BACKEND_ADDRESS } from "@/app/resources/ResourcesPaths";
+
+ const allowedMetrics = [
+   'acc',
+   'accuracy',
+   'f1',
+   'pearson',
+   'pearsonr',
+   'spearman',
+   'fquad',
+   'exact_match',
+ ];
+
+ export default function LeaderboardPage() {
+   const { t } = useTranslation();
+   const [entries, setEntries] = useState([]);
+   const [benchmarks, setBenchmarks] = useState([]);
+   const [sortCol, setSortCol] = useState('overall');
+   const [sortOrder, setSortOrder] = useState('desc');
+   const [selectedEntry, setSelectedEntry] = useState(null);
+
+   const headerLabels = {
+     model: t('leaderboard_modelHeader'),
+     overall: t('leaderboard_overallHeader'),
+   };
+
+   useEffect(() => {
+     fetch(`${BACKEND_ADDRESS}/leaderboard`)
+       .then((res) => {
+         if (!res.ok) throw new Error(`HTTP ${res.status}`);
+         return res.json();
+       })
+       .then((data) => {
+         const withOverall = data.map((e) => ({
+           ...e,
+           averageScore: computeAverageScore(e),
+         }));
+         setEntries(withOverall);
+
+         const allBench = new Set();
+         withOverall.forEach((entry) => {
+           Object.keys(entry.results || {}).forEach((raw) => {
+             allBench.add(normalizeBenchmarkName(raw));
+           });
+         });
+         setBenchmarks(Array.from(allBench));
+       })
+       .catch((err) => console.error('Failed to load leaderboard:', err));
+   }, []);
+
+   const getCellValue = (entry, col) => {
+     if (col === 'model') return entry.display_name;
+     if (col === 'overall') return entry.averageScore ?? null;
+
+     const pair = Object.entries(entry.results || {}).find(
+       ([rawName]) => normalizeBenchmarkName(rawName) === col
+     );
+     if (!pair) return null;
+
+     const rawValues = [];
+     Object.values(pair[1]).forEach((metricGroup) => {
+       if (metricGroup && typeof metricGroup === 'object') {
+         Object.entries(metricGroup).forEach(([metricName, metricValue]) => {
+           if (
+             !metricName.includes('_warning') &&
+             typeof metricValue === 'number' &&
+             allowedMetrics.includes(metricName.toLowerCase())
+           ) {
+             rawValues.push(metricValue);
+           }
+         });
+       }
+     });
+
+     if (rawValues.length === 0) return null;
+     const normalized = rawValues.map((v) => (v > 1 ? v / 100 : v));
+     return normalized.reduce((a, b) => a + b, 0) / normalized.length;
+   };
+
+   const sorted = [...entries].sort((a, b) => {
+     const va = getCellValue(a, sortCol);
+     const vb = getCellValue(b, sortCol);
+     if (sortCol === 'model') {
+       if (va == null) return 1;
+       if (vb == null) return -1;
+       return sortOrder === 'asc'
+         ? va.localeCompare(vb)
+         : vb.localeCompare(va);
+     }
+     const na = va ?? -Infinity;
+     const nb = vb ?? -Infinity;
+     return sortOrder === 'asc' ? na - nb : nb - na;
+   });
+
+   const handleSort = (col) => {
+     if (sortCol === col) {
+       setSortOrder(sortOrder === 'asc' ? 'desc' : 'asc');
+     } else {
+       setSortCol(col);
+       setSortOrder('desc');
+     }
+   };
+
+   const renderHeader = (col) => {
+     const baseLabel = headerLabels[col] ?? col;
+     const arrow = sortCol === col ? (sortOrder === 'asc' ? ' ▲' : ' ▼') : '';
+
+     if (col === 'overall') {
+       return (
+         <div>
+           <div onClick={() => handleSort(col)} className="cursor-pointer">
+             {baseLabel}
+             {arrow}
+           </div>
+           <div className="text-xs text-gray-600 text-center">
+             {t('leaderboard_avgScoreLabel')}
+           </div>
+         </div>
+       );
+     }
+
+     if (col === 'model') {
+       return (
+         <div onClick={() => handleSort(col)} className="cursor-pointer">
+           {baseLabel}
+           {arrow}
+         </div>
+       );
+     }
+
+     let metricText = '';
+     const sample = entries[0];
+     if (sample && sample.results) {
+       const p = Object.entries(sample.results).find(
+         ([raw]) => normalizeBenchmarkName(raw) === col
+       );
+       if (p) {
+         const grp = Object.values(p[1])[0];
+         if (grp) {
+           const metrics = Object.keys(grp)
+             .filter((m) => allowedMetrics.includes(m.toLowerCase()));
+           if (metrics.length > 0) {
+             metricText = ` (${metrics.join(', ')})`;
+           }
+         }
+       }
+     }
+
+     return (
+       <div onClick={() => handleSort(col)} className="cursor-pointer">
+         {baseLabel}
+         {arrow}
+         {metricText}
+       </div>
+     );
+   };
+
+   return (
+     <div className="space-y-8">
+       <h3 className="text-2xl font-semibold text-gray-900 mb-4 border-l-4 border-blue-600 pl-4">
+         {t('leaderboard_title')}
+       </h3>
+       <div className="overflow-auto">
+         <table className="min-w-full border-collapse">
+           <thead>
+             <tr>
+               {['model', 'overall', ...benchmarks].map((b) => (
+                 <th
+                   key={b}
+                   className="border border-gray-300 px-2 py-1 bg-blue-100 text-left text-sm font-semibold text-blue-700"
+                 >
+                   {renderHeader(b)}
+                 </th>
+               ))}
+             </tr>
+           </thead>
+           <tbody>
+             {sorted.map((entry) => (
+               <tr
+                 key={entry.submission_id}
+                 className="bg-white hover:bg-gray-50 cursor-pointer"
+                 onClick={() => setSelectedEntry(entry)}
+               >
+                 <td className="border border-gray-300 px-2 py-1 font-medium text-blue-600">
+                   {entry.display_name}
+                 </td>
+                 <td className="border border-gray-300 px-2 py-1 text-center">
+                   {entry.averageScore == null
+                     ? t('leaderboard_notSpecified')
+                     : (entry.averageScore * 100).toFixed(1) + '%'}
+                 </td>
+                 {benchmarks.map((b) => {
+                   const val = getCellValue(entry, b);
+                   return (
+                     <td
+                       key={b}
+                       className="border border-gray-200 px-2 py-1 text-center text-purple-700"
+                     >
+                       {val == null
+                         ? t('leaderboard_notSpecified')
+                         : (val * 100).toFixed(1) + '%'}
+                     </td>
+                   );
+                 })}
+               </tr>
+             ))}
+           </tbody>
+         </table>
+       </div>
+
+       {selectedEntry && (
+         <div className="fixed inset-0 flex items-center justify-center bg-black bg-opacity-50">
+           <div className="bg-white p-6 rounded-2xl shadow-lg max-w-2xl w-full mx-4 max-h-[80vh] overflow-y-auto">
+             <h3 className="text-xl font-semibold text-gray-800 mb-4">
+               {t('leaderboard_modalTitle', {
+                 name: selectedEntry.display_name,
+               })}
+             </h3>
+             {Object.entries(selectedEntry.results || {}).map(
+               ([taskKey, metricsObj]) => {
+                 const prettyName = taskKey.split('|')[1] || taskKey;
+                 const [metricType, values] = Object.entries(metricsObj)[0];
+                 return (
+                   <div key={taskKey} className="mb-4">
+                     <h4 className="font-medium text-blue-700">
+                       {prettyName}
+                     </h4>
+                     <ul className="list-disc list-inside text-gray-700">
+                       {Object.entries(values)
+                         .filter(([k]) => !k.endsWith('_warning'))
+                         .map(([metricKey, value]) => (
+                           <li key={metricKey}>
+                             <strong>{metricKey.replace(/_/g, ' ')}</strong>:{' '}
+                             {typeof value === 'number'
+                               ? (value > 1
+                                   ? value.toFixed(1) + '%'
+                                   : (value * 100).toFixed(1) + '%')
+                               : value}
+                           </li>
+                         ))}
+                     </ul>
+                     {values[`${metricType}_warning`] && (
+                       <p className="text-sm text-yellow-700 mt-2">
+                         ⚠️ {values[`${metricType}_warning`]}
+                       </p>
+                     )}
+                   </div>
+                 );
+               }
+             )}
+             <button
+               className="mt-4 px-4 py-2 bg-gray-200 rounded-full hover:bg-gray-300"
+               onClick={() => setSelectedEntry(null)}
+             >
+               {t('leaderboard_closeButton')}
+             </button>
+           </div>
+         </div>
+       )}
+     </div>
+   );
+ }
frontend/src/app/leaderboard/util.js ADDED
@@ -0,0 +1,47 @@
+ export const normalizeBenchmarkName = (name) => {
+   const parts = name.toLowerCase().split("|");
+   if (parts.length >= 2) return parts[1].replace(/-/g, "_");
+   return name.toLowerCase();
+ };
+
+ export const computeAverageScore = (entry) => {
+   const allowedMetrics = [
+     "acc",
+     "accuracy",
+     "f1",
+     "exact_match",
+     "fquad",
+     "pearson",
+     "pearsonr",
+     "spearman",
+   ];
+
+   const perTaskAverages = [];
+
+   Object.values(entry.results || {}).forEach((taskData) => {
+     if (taskData && typeof taskData === "object") {
+       Object.values(taskData).forEach((metricGroup) => {
+         if (metricGroup && typeof metricGroup === "object") {
+           const taskMetrics = Object.entries(metricGroup)
+             .filter(([metric]) => allowedMetrics.includes(metric.toLowerCase()))
+             .map(([, value]) =>
+               typeof value === "number" ? value : null
+             )
+             .filter((v) => v !== null);
+
+           if (taskMetrics.length > 0) {
+             const normalized = taskMetrics.map((v) => (v > 1 ? v / 100 : v));
+             const taskAvg = normalized.reduce((a, b) => a + b, 0) / normalized.length;
+             perTaskAverages.push(taskAvg);
+           }
+         }
+       });
+     }
+   });
+
+   if (perTaskAverages.length === 0) return null;
+
+   return perTaskAverages.reduce((a, b) => a + b, 0) / perTaskAverages.length;
+ };
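Since computeAverageScore accepts metrics reported either as fractions or as percentages, it can help to re-derive a leaderboard number offline. A minimal Python sketch mirroring the same logic (values above 1 are treated as percentages); the sample entry is hypothetical:

ALLOWED_METRICS = {"acc", "accuracy", "f1", "exact_match", "fquad",
                   "pearson", "pearsonr", "spearman"}


def average_score(entry):
    """Mirror of computeAverageScore from util.js."""
    per_task = []
    for task_data in entry.get("results", {}).values():
        for metric_group in task_data.values():
            values = [v for k, v in metric_group.items()
                      if k.lower() in ALLOWED_METRICS and isinstance(v, (int, float))]
            if values:
                normalized = [v / 100 if v > 1 else v for v in values]  # % -> [0, 1]
                per_task.append(sum(normalized) / len(normalized))
    return sum(per_task) / len(per_task) if per_task else None


# Hypothetical entry shaped like one item of the /leaderboard response.
entry = {"results": {"qfrcola": {"accuracy": {"accuracy": 81.3}},
                     "allocine": {"accuracy": {"accuracy": 0.92}}}}
print(average_score(entry))  # (0.813 + 0.92) / 2 = 0.8665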
frontend/src/app/page.js ADDED
@@ -0,0 +1,74 @@
+ 'use client';
+
+ import Link from "next/link";
+ import { Trans, useTranslation } from 'react-i18next';
+
+ export default function Home() {
+   const { t } = useTranslation();
+
+   return (
+     <div className="max-w-3xl mx-auto px-6 py-3">
+       <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+         {t('home_whatIsColleTitle')}
+       </h2>
+
+       <p className="text-gray-700 mb-4 leading-relaxed space-y-4">
+         <Trans i18nKey="home_paragraph1">
+           COLE is a multidisciplinary French Natural Language Understanding benchmark (
+           <a
+             href="https://en.wikipedia.org/wiki/Natural_language_understanding"
+             target="_blank"
+             rel="noopener noreferrer"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             NLU
+           </a>
+           ). It takes inspiration from its predecessors&nbsp;
+           <a
+             href="https://gluebenchmark.com/"
+             target="_blank"
+             rel="noopener noreferrer"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             GLUE
+           </a>
+           &nbsp;and&nbsp;
+           <a
+             href="https://super.gluebenchmark.com/"
+             target="_blank"
+             rel="noopener noreferrer"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             SuperGLUE
+           </a>
+           &nbsp;to build a benchmark capable of evaluating models in the French language on multiple topics of language understanding. See&nbsp;
+           <Link
+             href="https://arxiv.org/abs/2510.05046"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             our paper
+           </Link>
+           &nbsp;for more information.
+         </Trans>
+       </p>
+
+       <p className="text-gray-700 leading-relaxed">
+         {t('home_paragraph2')}
+       </p>
+
+       <p className="text-gray-700 leading-relaxed mt-4">
+         <Trans i18nKey="home_paragraph3">
+           We have made the choice to hide test labels to discourage cheating or overfitting on test data. To get results on your test data, you may send us your results as explained in&nbsp;
+           <Link
+             href="/guide"
+             className="text-blue-600 underline hover:text-blue-800"
+           >
+             our guide
+           </Link>
+           .
+         </Trans>
+       </p>
+     </div>
+   );
+ }
frontend/src/app/papers/page.js ADDED
@@ -0,0 +1,31 @@
+ 'use client';
+
+ import React, { useState } from 'react';
+
+ export default function PapersPage() {
+   const [loaded, setLoaded] = useState(false);
+
+   return (
+     <div className="relative h-screen">
+       {!loaded && (
+         <div className="absolute inset-0 flex items-center justify-center bg-white z-10">
+           <div className="animate-spin h-12 w-12 border-4 border-blue-600 border-t-transparent rounded-full" />
+         </div>
+       )}
+       <div className="max-w-3xl mx-auto px-6 py-3">
+         <h2 className="text-3xl font-bold text-center text-blue-700 border-b pb-4 mb-10">
+           Our papers
+         </h2>
+       </div>
+
+       <iframe
+         onLoad={() => setLoaded(true)}
+         src="cole.pdf"
+         title="COLE paper"
+         width="100%"
+         height="100%"
+         className="border-none"
+       />
+     </div>
+   );
+ }
frontend/src/app/resources/BenchmarksResource.js ADDED
@@ -0,0 +1,35 @@
+ // Note: the original string had a leading space ("  http://..."), which broke the URL.
+ const BASE_PATH = "http://127.0.0.1:8000";
+
+ const send_results = async (email, labels) => {
+   const path = `${BASE_PATH}/submit`;
+   if (!email || !labels) {
+     alert("email and results must be present");
+     return;
+   }
+   const formData = new FormData();
+   formData.append("email", email);
+   formData.append("labels", labels);
+   console.log(email);
+   console.log(labels);
+   try {
+     const response = await fetch(path, {
+       method: "POST",
+       body: formData,
+     });
+
+     if (!response.ok) {
+       console.log(response);
+       throw new Error("Failed to submit");
+     }
+     const result = await response.json();
+     console.log("Server response:", result);
+     alert("Submission successful!");
+   } catch (err) {
+     console.error("Error submitting results:", err);
+     alert("There was a problem with the submission.");
+   }
+ };
+
+ export { send_results };
frontend/src/app/resources/ResourcesPaths.js ADDED
@@ -0,0 +1,2 @@
+ const BACKEND_ADDRESS = "/api";
+ export { BACKEND_ADDRESS };
frontend/src/app/results/[id]/page.js ADDED
@@ -0,0 +1,128 @@
+ 'use client';
+
+ import '../../i18n';
+ import { useTranslation } from 'react-i18next';
+ import React, { useEffect, useState } from 'react';
+ import { useParams } from 'next/navigation';
+ import { BACKEND_ADDRESS } from '@/app/resources/ResourcesPaths';
+
+ export default function ResultsPage() {
+   const { t } = useTranslation();
+   const { id: submissionId } = useParams();
+   const [data, setData] = useState(null);
+
+   // Metric display names are fixed in English.
+   const metricLabel = {
+     accuracy: 'Accuracy',
+     exact_match: 'Exact Match',
+     f1: 'F1 Score',
+     pearsonr: 'Pearson Correlation',
+   };
+   const getReadableMetricName = (metricKey) =>
+     metricLabel[metricKey] ||
+     metricKey.replace(/_/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
+
+   useEffect(() => {
+     // Use the shared backend address; the original hardcoded localhost here
+     // while handleDownload referenced BACKEND_ADDRESS without importing it.
+     fetch(`${BACKEND_ADDRESS}/results/${submissionId}.json`)
+       .then((res) => {
+         if (!res.ok) throw new Error(`HTTP ${res.status}`);
+         return res.json();
+       })
+       .then(setData)
+       .catch(() => setData({ error: true }));
+   }, [submissionId]);
+
+   const handleDownload = async () => {
+     if (!data) return;
+     try {
+       const res = await fetch(`${BACKEND_ADDRESS}/results/${submissionId}.json`);
+       if (!res.ok) throw new Error(`HTTP ${res.status}`);
+       const blob = await res.blob();
+       const url = URL.createObjectURL(blob);
+       const link = document.createElement('a');
+       link.href = url;
+       link.download = `${submissionId}.json`;
+       document.body.appendChild(link);
+       link.click();
+       document.body.removeChild(link);
+       URL.revokeObjectURL(url);
+     } catch {
+       console.error('Download failed');
+     }
+   };
+
+   if (!data) {
+     return (
+       <main className="max-w-3xl mx-auto px-6 py-6 text-center">
+         <p className="text-gray-600">{t('results_loading')}</p>
+       </main>
+     );
+   }
+
+   const tasksArray = data.tasks || [];
+   const displayName = data.display_name || data.config_general?.display_name;
+
+   return (
+     <main className="max-w-3xl mx-auto px-6 py-6">
+       <h2 className="text-2xl font-bold text-center mb-4">
+         {t('results_page_title', { displayName })}
+       </h2>
+
+       <div className="flex justify-center mb-6">
+         <button
+           onClick={handleDownload}
+           className="px-4 py-2 bg-blue-600 text-white rounded-lg shadow hover:bg-blue-700 transition"
+         >
+           {t('results_download')}
+         </button>
+       </div>
+
+       {tasksArray.length === 0 ? (
+         <p className="text-blue-700 text-center">
+           {t('results_no_results')}
+         </p>
+       ) : (
+         <div className="space-y-6">
+           {tasksArray.map((taskObj) => {
+             const [taskName, metricsObj] = Object.entries(taskObj)[0];
+             const [metricType, metricValues] = Object.entries(metricsObj)[0];
+             const prettyName = taskName.split('|')[1] || taskName;
+             const warningKey = `${metricType}_warning`;
+
+             return (
+               <div
+                 key={taskName}
+                 className="p-5 border border-purple-400 rounded-xl shadow-md bg-white"
+               >
+                 <h3 className="text-xl font-semibold text-blue-700 mb-3">
+                   {t('results_benchmark_label', { name: prettyName })}
+                 </h3>
+                 <ul className="list-disc ml-6 text-gray-700">
+                   {Object.entries(metricValues)
+                     .filter(([k]) => !k.endsWith('_warning'))
+                     .map(([metricKey, value]) => (
+                       <li key={metricKey}>
+                         <strong>{getReadableMetricName(metricKey)}</strong>:{' '}
+                         {typeof value === 'number' ? (
+                           (metricKey === 'exact_match' || metricKey === 'f1'
+                             ? value
+                             : value * 100
+                           ).toFixed(1) + '%'
+                         ) : (
+                           value
+                         )}
+                       </li>
+                     ))}
+                 </ul>
+                 {metricValues[warningKey] && (
+                   <p className="text-sm text-yellow-700 mt-2">
+                     ⚠️ {metricValues[warningKey]}
+                   </p>
+                 )}
+               </div>
+             );
+           })}
+         </div>
+       )}
+     </main>
+   );
+ }
frontend/src/app/results/page.js ADDED
@@ -0,0 +1,31 @@
+ 'use client';
+
+ import '../i18n';
+ import { useEffect } from 'react';
+ import { useRouter } from 'next/navigation';
+ import { useTranslation } from 'react-i18next';
+
+ export default function ResultsDefaultPage() {
+   const router = useRouter();
+   const { t } = useTranslation();
+
+   useEffect(() => {
+     const justSubmitted = localStorage.getItem('just_submitted');
+     const savedFile = localStorage.getItem('last_result_file');
+
+     if (justSubmitted && savedFile) {
+       const id = savedFile.replace('.json', '');
+       localStorage.removeItem('just_submitted');
+       router.push(`/results/${id}`);
+     }
+   }, [router]);
+
+   return (
+     <main className="max-w-2xl mx-auto px-6 py-12 text-center">
+       <h1 className="text-3xl font-bold text-blue-700 mb-4">
+         {t('results_default_title')}
+       </h1>
+       <p className="text-gray-700">{t('results_default_message')}</p>
+     </main>
+   );
+ }
src/__init__.py ADDED
@@ -0,0 +1,3 @@
+ REPO_ID = "COLE-Graal/COLEGraal"
+ WANDB_PROJECT = "COLE-final"
+ NA_VALUE = -1
src/backend/__init__.py ADDED
File without changes
src/backend/evaluation.py ADDED
@@ -0,0 +1,36 @@
+ import copy
+ import operator
+ from functools import reduce
+ from typing import List, Dict
+
+ from src.task.task_factory import Task
+
+
+ def compute_tasks_ratings(tasks: List[Task], submission: Dict) -> Dict:
+     """
+     Compute the ratings of each task in a submission.
+     :param tasks: List of tasks.
+     :param submission: Submission dictionary.
+     """
+
+     # We merge the task dictionaries into one for simpler handling.
+     submission_copy = copy.deepcopy(submission)
+     submission_response = reduce(operator.ior, submission_copy.get("tasks"), {})
+
+     for task in tasks:
+         task_name = task.task_name
+
+         # We remove the predictions since we do not keep them in the response.
+         predictions = submission_response.get(task_name).pop("predictions")
+
+         ratings, warning = task.compute(predictions=predictions)
+         ratings.update({f"{task.metric_name}_warning": warning})
+         submission_response.get(task_name).update({f"{task.metric_name}": ratings})
+
+     # Final submission response, where we unwrap the merged tasks dictionary back into a list of dictionaries.
+     submission_response = {
+         "model_name": submission.get("model_name"),
+         "model_url": submission.get("model_url"),
+         "tasks": [{key: value} for key, value in submission_response.items()],
+     }
+     return submission_response
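The reduce(operator.ior, ...) call above merges the list of single-key task dictionaries into one flat dictionary before scoring, and the last step unwraps it back into a list. A minimal sketch of just that reshaping, with placeholder task data:

import operator
from functools import reduce

tasks = [{"qfrcola": {"predictions": [1, 0]}},
         {"allocine": {"predictions": [0, 1]}}]

# Merge via dict union (the in-place OR operator), as in compute_tasks_ratings.
merged = reduce(operator.ior, tasks, {})
# {'qfrcola': {'predictions': [1, 0]}, 'allocine': {'predictions': [0, 1]}}

# Unwrap back into a list of single-key dicts, as done for the final response.
unwrapped = [{key: value} for key, value in merged.items()]
assert unwrapped == tasks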
src/backend/results/leaderboard.json ADDED
The diff for this file is too large to render. See raw diff
 
src/backend/submission_api.py ADDED
@@ -0,0 +1,224 @@
+ import glob
+ import json
+ import logging
+ import os
+ import sys
+ import uuid
+ from contextlib import asynccontextmanager
+ from datetime import datetime
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Dict, List, Any, Union
+
+ import huggingface_hub
+ from fastapi import FastAPI, UploadFile, Form, File
+ from fastapi.responses import JSONResponse
+ from fastapi.staticfiles import StaticFiles
+
+ from starlette.middleware.cors import CORSMiddleware
+
+ from src.backend.evaluation import compute_tasks_ratings
+ from src.backend.submit_tools import unzip_predictions_from_zip
+ from src.dataset.datasets_data import preload_all_datasets
+ from src.backend.validation_tools import (
+     validate_submission_tasks_name,
+     validate_submission_json,
+     validate_submission_template,
+ )
+ from src.task.task import Task
+ from src.task.task_factory import (
+     tasks_factory,
+ )
+
+
+ BASE_DIR = Path(__file__).resolve().parents[2]
+ SRC_DIR = BASE_DIR / "src"
+ sys.path.insert(0, str(SRC_DIR))
+
+ RESULTS_DIR = BASE_DIR / "src" / "backend" / "results"
+ RESULTS_DIR.mkdir(parents=True, exist_ok=True)
+ FRONTEND_DIR = BASE_DIR / "frontend"
+
+
+ @asynccontextmanager
+ async def lifespan(application: FastAPI = None):  # pylint: disable=unused-argument
+     """Called before the backend comes online; used to load the datasets in memory."""
+     try:
+         token = os.environ.get("HF_TOKEN")
+         huggingface_hub.login(token=token)
+         preload_all_datasets()
+     except Exception as e:
+         error_message = f"The datasets could not be loaded: {e}"
+         logging.critical(error_message)
+
+     yield
+
+
+ app = FastAPI(lifespan=lifespan)
+ app.mount("/results", StaticFiles(directory=str(RESULTS_DIR)), name="results")
+ front_end_info_message = f"The front-end directory is: {FRONTEND_DIR}"
+ logging.info(front_end_info_message)
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_credentials=True,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+
+ @app.post("/submit")
+ async def submit(
+     email: str = Form(...),
+     predictions_zip: UploadFile = File(...),
+     display_name: str = Form(...),
+ ):
+     """Route for making submissions with user-generated results.
+     :param email: The email of the user's submission.
+     :param predictions_zip: The ZIP file of the user's predictions.
+     :param display_name: The display name associated with the user's submission.
+     """
+     logging.info("Starting submission")
+     info_message = f"Submission from {email!r} as {display_name!r}."
+     logging.info(info_message)
+     zip_bytes = await predictions_zip.read()
+     submission_json = unzip_predictions_from_zip(zip_bytes)
+
+     validate_submission_template(submission_json)
+     validate_submission_tasks_name(submission_json)
+     validate_submission_json(submission_json)
+
+     tasks: List[Task] = tasks_factory(submission_json)
+     logging.info("Computation started")
+     start = datetime.now()
+     submission_response = compute_tasks_ratings(tasks=tasks, submission=submission_json)
+     computation_time = datetime.now() - start
+     info_message = f"Computation ended in {computation_time}"
+     logging.info(info_message)
+     submission_id = str(uuid.uuid4())
+     submission_response.update(
+         {
+             "display_name": display_name,
+             "email": email,
+             "submission_id": submission_id,
+         }
+     )
+
+     out_path = RESULTS_DIR / f"{submission_id}.json"
+     with open(out_path, "w", encoding="utf-8") as f:
+         json.dump(submission_response, f, ensure_ascii=False, indent=2)
+
+     get_leaderboard_entries.cache_clear()
+
+     return JSONResponse(content=submission_response)
+
+
+ @lru_cache(maxsize=1)
+ def get_leaderboard_entries() -> List[Dict[str, Any]]:
+     """Returns all entries currently in the leaderboard.
+     Also supports JSON files that contain a LIST of entries,
+     and normalizes 'flat' metrics into nested groups for the frontend.
+     """
+
+     def _wrap_flat_metrics(task_payload: Dict[str, Any]) -> Dict[str, Any]:
+         """
+         If task_payload is 'flat' (e.g. {"accuracy": 94.2}),
+         wrap it as {"<group>": {...}} so the frontend can aggregate it.
+         Group naming rules:
+         - exact_match/f1 present -> "fquad"
+         - else acc/accuracy present -> "accuracy"
+         - else pearson/pearsonr/spearman present -> "correlation"
+         - else -> "metrics"
+         Values > 1 are left as-is (the frontend already normalizes % -> [0, 1]).
+         """
+         if not isinstance(task_payload, dict):
+             return task_payload
+
+         # If the payload is already nested (some value is a dict), leave it untouched.
+         if any(isinstance(v, dict) for v in task_payload.values()):
+             return task_payload
+
+         keys = set(k.lower() for k in task_payload.keys())
+         if {"exact_match", "f1"} & keys:
+             group = "fquad"
+         elif {"accuracy", "acc"} & keys:
+             group = "accuracy"
+         elif {"pearson", "pearsonr", "spearman"} & keys:
+             group = "correlation"
+         else:
+             group = "metrics"
+
+         # Nothing special for warnings here: the frontend treats them as optional
+         # and expects "<group>_warning" in the inner object if one is provided.
+         return {group: task_payload}
+
+     entries: List[Dict[str, Any]] = []
+
+     for filepath in glob.glob(str(RESULTS_DIR / "*.json")):
+         try:
+             with open(filepath, encoding="utf-8") as f:
+                 data = json.load(f)
+
+             # Inner helper that converts ONE entry (dict) to the minimal expected format.
+             def process_entry(entry: Dict[str, Any]) -> Union[Dict[str, Any], None]:
+                 if not isinstance(entry, dict):
+                     return None
+                 if "model_name" not in entry or "tasks" not in entry:
+                     return None
+
+                 # Rebuild "results" in the shape the frontend expects.
+                 results = {}
+                 for task_obj in entry.get("tasks", []):
+                     if not isinstance(task_obj, dict) or len(task_obj) != 1:
+                         continue
+                     task_name, payload = list(task_obj.items())[0]
+                     normalized = _wrap_flat_metrics(payload)
+                     results[task_name] = normalized
+
+                 if not results:
+                     return None
+
+                 return {
+                     "submission_id": entry.get("submission_id") or str(uuid.uuid4()),
+                     "display_name": entry.get("display_name")
+                     or entry.get("model_name")
+                     or "Unnamed Model",
+                     "email": entry.get("email", "N/A"),
+                     "results": results,
+                 }
+
+             # The file may contain a single entry (dict) or several (list).
+             if isinstance(data, list):
+                 for item in data:
+                     processed = process_entry(item)
+                     if processed:
+                         entries.append(processed)
+             else:
+                 processed = process_entry(data)
+                 if processed:
+                     entries.append(processed)
+
+         except Exception as e:
+             logging_message = f"Error processing file '{filepath}': {e}"
+             logging.error(logging_message)
+             continue
+
+     return entries
+
+
+ @app.get("/leaderboard")
+ async def leaderboard() -> List[Dict[str, Any]]:
+     return get_leaderboard_entries()
+
+
+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy", "message": "API is running."}
+
+
+ @app.get("/")
+ async def home():
+     return {"status": "working"}
src/backend/submit_tools.py ADDED
@@ -0,0 +1,19 @@
+ import io
+ import json
+ import zipfile
+
+ from fastapi import HTTPException
+
+
+ def unzip_predictions_from_zip(zip_bytes: bytes) -> dict:
+     """
+     Reads predictions.json directly from the ZIP in memory.
+     """
+     with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
+         if "predictions.json" not in z.namelist():
+             error_message = (
+                 "The uploaded ZIP file does not contain a predictions.json file."
+             )
+             raise HTTPException(400, error_message)
+         with z.open("predictions.json") as f:
+             return json.load(f)
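A small in-memory round trip shows the contract of unzip_predictions_from_zip: the archive must contain a file named exactly predictions.json. This sketch assumes the function is importable from src.backend.submit_tools:

import io
import json
import zipfile

from src.backend.submit_tools import unzip_predictions_from_zip

payload = {"model_name": "demo", "model_url": "n/a", "tasks": []}

buffer = io.BytesIO()
with zipfile.ZipFile(buffer, "w") as zf:
    zf.writestr("predictions.json", json.dumps(payload))

# Feeding the raw bytes back reproduces what the /submit route does.
assert unzip_predictions_from_zip(buffer.getvalue()) == payload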
src/backend/validation_tools.py ADDED
@@ -0,0 +1,93 @@
+ import logging
+ from typing import Dict, List
+
+ from fastapi import HTTPException
+
+ tasks_name = [
+     "allocine",
+     "fquad",
+     "gqnli",
+     "paws_x",
+     "piaf",
+     "qfrblimp",
+     "qfrcola",
+     "sickfr",
+     "sts22",
+     "xnli",
+ ]
+
+
+ def validate_submission_template(dictionary: Dict) -> None:
+     """Ensures the dictionary follows the correct format.
+     :param dictionary: Dictionary to validate."""
+     # 400: a malformed submission is a client error (the original raised these with status 200 OK).
+     if dictionary.get("model_name", None) is None:
+         error = "The submission is missing a model name."
+         logging.error(error)
+         raise HTTPException(400, error)
+     if dictionary.get("model_url", None) is None:
+         error = "The submission is missing a model URL."
+         logging.error(error)
+         raise HTTPException(400, error)
+     if dictionary.get("tasks", None) is None:
+         error = "The submission is missing a tasks keyword."
+         logging.error(error)
+         raise HTTPException(400, error)
+
+     tasks = dictionary.get("tasks")
+     if not isinstance(tasks, List):
+         error = (
+             "The tasks keyword value must be a list of dictionaries where the key is the task name "
+             "and the value is a dictionary of predictions (in a list format). See our documentation "
+             "for a template."
+         )
+         logging.error(error)
+         raise HTTPException(400, error)
+
+     for task in tasks:
+         if len(task.keys()) > 1:
+             error = (
+                 "Each task must be a dictionary of one element where the key is "
+                 "the task name and the value is a list."
+             )
+             logging.error(error)
+             raise HTTPException(400, error)
+
+
+ def validate_submission_tasks_name(dictionary: Dict) -> None:
+     """
+     Validate that the submission JSON keys are valid task names.
+     """
+     for task in dictionary.get("tasks"):
+         key = list(task.keys())[0]
+         if key not in tasks_name:
+             error = f"Unknown key '{key}' in the submission JSON. The expected tasks are: {tasks_name}."
+             logging.error(error)
+             raise HTTPException(400, error)
+
+
+ def validate_submission_json(dictionary: Dict) -> None:
+     """Validates that the submitted JSON is in the correct format.
+     :param dictionary: Dictionary to validate."""
+     task_payload = dictionary.get("tasks")
+
+     for task in task_payload:
+         for task_name, payload in task.items():
+             if not isinstance(payload, dict):
+                 error = (
+                     "The tasks payload must be a dictionary in the format '{'predictions': [<predictions>]}' "
+                     "for each task."
+                 )
+                 logging.error(error)
+                 raise HTTPException(400, error)
+             for key, value in payload.items():
+                 # Only "predictions" is accepted; the evaluation step pops exactly this key.
+                 if key != "predictions":
+                     error = f"The task '{task_name}' payload does not have the expected key: 'predictions'."
+                     logging.error(error)
+                     raise HTTPException(400, error)
+                 if not isinstance(value, list):
+                     error = (
+                         f"The task '{task_name}' predictions payload is not in a list format. "
+                         r"The expected format is: '{'predictions': [<predictions>]}'"
+                     )
+                     logging.error(error)
+                     raise HTTPException(400, error)
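The three validators run in sequence on the parsed submission and raise an HTTPException before any evaluation starts if the shape is wrong. A minimal sketch of a dictionary that passes all three checks; the task names must come from tasks_name above, and the model name and URL are placeholders:

from src.backend.validation_tools import (
    validate_submission_json,
    validate_submission_tasks_name,
    validate_submission_template,
)

submission = {
    "model_name": "demo-model",
    "model_url": "https://example.com/demo",  # placeholder URL
    "tasks": [
        {"allocine": {"predictions": [0, 1, 1]}},
        {"qfrcola": {"predictions": [1, 0]}},
    ],
}

validate_submission_template(submission)    # keys and overall shape
validate_submission_tasks_name(submission)  # task names against tasks_name
validate_submission_json(submission)        # per-task payload format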
src/dataset/__init__.py ADDED
File without changes
src/dataset/dataset.py ADDED
@@ -0,0 +1,96 @@
+ from typing import Callable, Any, Union, List
+
+ from datasets import load_dataset
+
+
+ class Dataset:
+     """Class representing a usable dataset.
+     Allows a dataset to be expressed in multiple forms, including as prompts, data, or answers.
+     :param name: Name of the dataset.
+     :param description: Description of the dataset.
+     :param possible_ground_truths: The forms that the ground truths can take.
+     :param hugging_face_repo: Where to download the dataset on Hugging Face.
+     :param line_to_truth_fn: A function converting a dataset line to its truth value.
+     :param line_to_prompt_fn: A function converting a dataset line to a prompt for LLM inference.
+     :param line_to_data_fn: A function converting a dataset line to its data value for non-LLM inference.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         description: str,
+         possible_ground_truths: Union[List[str], List[int], List[float]],
+         hugging_face_repo: str,
+         line_to_truth_fn: Callable,
+         line_to_prompt_fn: Callable,
+         line_to_data_fn: Callable,
+     ):
+         self._dataset = None
+         self.name = name
+         self.description = description
+         self.hugging_face_repo = hugging_face_repo
+         self.possible_ground_truths = possible_ground_truths
+         self.line_to_prompt_fn = line_to_prompt_fn
+         self.line_to_truth_fn = line_to_truth_fn
+         self.line_to_data_fn = line_to_data_fn
+
+     @property
+     def dataset(self):
+         self.load_data()
+         return self._dataset
+
+     def load_data(self):
+         if self._dataset is None:
+             self._dataset = load_dataset(
+                 self.hugging_face_repo, name=self.name, split="test"
+             )
+
+     @property
+     def ground_truths(self) -> Union[List[str], List[int], List[float]]:
+         """The dataset's ground truths as a list."""
+         return [self.line_to_truth_fn(line) for line in self.dataset]
+
+     @property
+     def prompts(self) -> List[str]:
+         """The dataset's prompts as a list."""
+         return [self.line_to_prompt_fn(line) for line in self.dataset]
+
+     @property
+     def data(self) -> List[str]:
+         """The dataset's data as a list."""
+         return [self.line_to_data_fn(line) for line in self.dataset]
+
+     @property
+     def metadata(self) -> dict[str, Any]:
+         """The dataset's metadata as a dict."""
+         return {
+             "name": self.name,
+             "description": self.description,
+             "possible_ground_truths": str(self.possible_ground_truths),
+             "Prompt template": self.line_to_prompt_fn(self.EchoDict()),
+         }
+
+     @property
+     def metadata_string(self) -> str:
+         """The dataset's metadata as a string."""
+         lines = []
+         for key, value in self.metadata.items():
+             lines.append(f"{key}: {value}")
+         return "\n".join(lines)
+
+     def __len__(self):
+         return len(self.ground_truths)
+
+     def __getitem__(self, index: Union[int, slice]):
+         if isinstance(index, slice):
+             get_item_data = self.ground_truths[index.start : index.stop]
+         else:
+             get_item_data = self.ground_truths[index]
+
+         return get_item_data
+
+     class EchoDict:
+         """Helper class for building prompt templates; always returns the accessed key."""
+
+         def __getitem__(self, key):
+             return key
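The EchoDict helper is what lets metadata render a prompt template without loading any data: every lookup simply echoes its key, so line_to_prompt_fn produces the prompt with field names in place of values. A small standalone sketch with a hypothetical prompt function:

class EchoDict:
    """Same trick as Dataset.EchoDict: return the accessed key itself."""

    def __getitem__(self, key):
        return key


# Hypothetical prompt function in the style used by datasets_data.py.
def line_to_prompt_fn(line):
    return f"Review: {line['review']}\nAnswer 0 or 1:"


print(line_to_prompt_fn(EchoDict()))
# Review: review
# Answer 0 or 1: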
src/dataset/datasets_data.py ADDED
@@ -0,0 +1,602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.dataset.dataset import Dataset
2
+ from src.dataset.prompt_builder import PromptBuilder
3
+ from src.task import COLE_REPOSITORY_NAME
4
+ from src.task.task_names import Tasks
5
+
6
+ datasets = {
7
+ Tasks.ALLOCINE.value: Dataset(
8
+ name=Tasks.ALLOCINE.value,
9
+ description="Binary classification on sentiment analysis"
10
+ " of movie reviews, with reviews being either positive (1) or negative (0).",
11
+ possible_ground_truths=["0", "1"],
12
+ hugging_face_repo=COLE_REPOSITORY_NAME,
13
+ line_to_truth_fn=lambda line: line["label"],
14
+ line_to_prompt_fn=lambda line: PromptBuilder()
15
+ .add_premise("Cette phrase possède-t-elle un sentiment positif ou négatif ?")
16
+ .add_data(line["review"])
17
+ .add_end(
18
+ (
19
+ "Réponds "
20
+ "uniquement par 1 si la phrase est positive, réponds par 0 sinon. La réponse est :"
21
+ )
22
+ )
23
+ .build(),
24
+ line_to_data_fn=lambda line: line["review"],
25
+ ),
26
+ Tasks.QFRCOLA.value: Dataset(
27
+ name=Tasks.QFRCOLA.value,
28
+ description="Binary grammatical judgement : "
29
+ "Predicts whether a sentence is grammatically correct (1) or not. (0).",
30
+ possible_ground_truths=["0", "1"],
31
+ hugging_face_repo=COLE_REPOSITORY_NAME,
32
+ line_to_truth_fn=lambda line: line["label"],
33
+ line_to_prompt_fn=lambda line: PromptBuilder()
34
+ .add_premise("Juge si cette phrase est grammaticalement correcte :")
35
+ .add_data(line["sentence"])
36
+ .add_end(
37
+ (
38
+ "Réponds avec seulement 1 si la phrase est grammaticalement correcte, 0 sinon. La réponse est :"
39
+ )
40
+ )
41
+ .build(),
42
+ line_to_data_fn=lambda line: line["sentence"],
43
+ ),
44
+ Tasks.QFRBLIMP.value: Dataset(
45
+ name=Tasks.QFRBLIMP.value,
46
+ description="Choice task between two sentences : Choose the one which is grammatically correct.",
47
+ possible_ground_truths=["0", "1"],
48
+ hugging_face_repo=COLE_REPOSITORY_NAME,
49
+ line_to_truth_fn=lambda line: str(
50
+ line["label"]
51
+ ), # The label is return as a string.
52
+ line_to_prompt_fn=lambda line: (
53
+ PromptBuilder()
54
+ .add_premise("Laquelle de ces phrases est grammaticalement correcte ?")
55
+ .add_data(f"Phrase 0:{line['sentence_a']}")
56
+ .add_data(f"Phrase 1:{line['sentence_b']}")
57
+ .add_end(
58
+ "Réponds avec seulement 0 si la phrase 0 "
59
+ "est grammaticalement correcte, et uniquement 1 si la phrase 1 est grammaticalement "
60
+ "correcte. La réponse est :"
61
+ )
62
+ .build()
63
+ ),
64
+ line_to_data_fn=lambda line: {line["sentence_a"], line["sentence_b"]},
65
+ ),
66
+ Tasks.GQNLI.value: Dataset(
67
+ name=Tasks.GQNLI.value,
68
+ description="Natural language inference task : "
69
+ "predict the relation between two sentences (implication, neutral, contradiction).",
70
+ possible_ground_truths=["0", "1", "2"],
71
+ hugging_face_repo=COLE_REPOSITORY_NAME,
72
+ line_to_truth_fn=lambda line: line["label"],
73
+ line_to_prompt_fn=lambda line: PromptBuilder()
74
+ .add_premise(
75
+ "Quelle est la relation de la deuxième phrase par rapport à la première ?"
76
+ )
77
+ .add_data(line["premise"])
78
+ .add_data(line["hypothesis"])
79
+ .add_end(
80
+ (
81
+ "Réponds uniquement par :\n"
82
+ "0 - si la deuxième phrase implique la première,\n"
83
+ "1 - si la relation est neutre,\n"
84
+ "2 - s'il y a contradiction.\n"
85
+ "Réponds uniquement par 0, 1 ou 2. La réponse est :"
86
+ )
87
+ )
88
+ .build(),
89
+ line_to_data_fn=lambda line: {
90
+ "premise": line["premise"],
91
+ "hypothesis": line["hypothesis"],
92
+ },
93
+ ),
94
+ Tasks.SICKFR.value: Dataset(
95
+ name=Tasks.SICKFR.value,
96
+ description="Natural language inference task : "
97
+ "predict the relation between two sentences (implication, neutral, contradiction).",
98
+ possible_ground_truths=["0", "1", "2"],
99
+ hugging_face_repo=COLE_REPOSITORY_NAME,
100
+ line_to_truth_fn=lambda line: str(line["label"]),
101
+ line_to_prompt_fn=lambda line: PromptBuilder()
102
+ .add_premise("Détermine la relation entre les deux phrases suivantes :")
103
+ .add_data(f"Phrase A : {line['sentence_A']}\nPhrase B : {line['sentence_B']}")
104
+ .add_end(
105
+ "Réponds uniquement par 0, 1 ou 2 :\n"
106
+ "0 - si la deuxième phrase découle logiquement de la première,\n"
107
+ "1 - si leur relation est neutre,\n"
108
+ "2 - si les phrases se contredisent.\n"
109
+ "La réponse est :"
110
+ )
111
+ .build(),
112
+ line_to_data_fn=lambda line: {
113
+ "sentence_A": line["sentence_A"],
114
+ "sentence_B": line["sentence_B"],
115
+ },
116
+ ),
117
+ Tasks.STS22.value: Dataset(
118
+ name=Tasks.STS22.value,
119
+ description="Semantic textual similarity task : "
120
+ "Predict how similar two sentences are to each other (1 to 4).",
121
+ possible_ground_truths=["1", "2", "3", "4"],
122
+ hugging_face_repo=COLE_REPOSITORY_NAME,
123
+ line_to_truth_fn=lambda line: str(line["score"]),
124
+ line_to_prompt_fn=lambda line: PromptBuilder()
125
+ .add_premise(
126
+ "À quel point les deux phrases suivantes sont-elles similaires ? Donne une note entière de 1 à 4."
127
+ )
128
+ .add_data(f"Phrase 1 : {line['sentence1']}\nPhrase 2 : {line['sentence2']}")
129
+ .add_end(
130
+ "Réponds uniquement avec un nombre entier entre 1 (aucune similarité) et 4 (équivalence parfaite). "
131
+ "La réponse est :"
132
+ )
133
+ .build(),
134
+ line_to_data_fn=lambda line: {
135
+ "sentence1": line["sentence1"],
136
+ "sentence2": line["sentence2"],
137
+ },
138
+ ),
139
+ Tasks.PAWS_X.value: Dataset(
140
+ name=Tasks.PAWS_X.value,
141
+ description="Binary classification task : "
142
+ "Predict if two sentences have the same meaning (1) or not (0).",
143
+ possible_ground_truths=["0", "1"],
144
+ hugging_face_repo=COLE_REPOSITORY_NAME,
145
+ line_to_truth_fn=lambda line: line["label"],
146
+ line_to_prompt_fn=lambda line: PromptBuilder()
147
+ .add_premise(
148
+ "Les deux phrases suivantes veulent-elles dire la même chose, ou ont-elles des significations différentes ?"
149
+ )
150
+ .add_data(line["sentence1"])
151
+ .add_data(line["sentence2"])
152
+ .add_end(
153
+ (
154
+ "Réponds seulement 1 si les deux phrases ont la même signification, 0 sinon. La réponse est :"
155
+ )
156
+ )
157
+ .build(),
158
+ line_to_data_fn=lambda line: {
159
+ "sentence1": line["sentence1"],
160
+ "sentence2": line["sentence2"],
161
+ },
162
+ ),
163
+ Tasks.PIAF.value: Dataset(
164
+ name=Tasks.PIAF.value,
165
+ description="Extractive question answering task : Extract a question's answer from a given context.",
166
+ possible_ground_truths=[],
167
+ hugging_face_repo=COLE_REPOSITORY_NAME,
168
+ line_to_truth_fn=lambda line: line["answers"],
169
+ line_to_prompt_fn=lambda line: PromptBuilder()
170
+ .add_premise(
171
+ "Tu vas recevoir un contexte suivi d'une question.\n"
172
+ "Ta tâche est d'extraire **mot pour mot** le passage du contexte qui répond le mieux à la question.\n"
173
+ "N'invente rien. Ne reformule pas.\n"
174
+ "Réponds **en copiant uniquement** un extrait exact du texte ci-dessus."
175
+ )
176
+ .add_data(f"Contexte : {line['context']}")
177
+ .add_data(f"Question : {line['question']}")
178
+ .add_end(
179
+ "Réponds uniquement par un passage extrait du contexte. La réponse est :"
180
+ )
181
+ .build(),
182
+ line_to_data_fn=lambda line: {
183
+ "context": line["context"],
184
+ "question": line["question"],
185
+ },
186
+ ),
187
+ Tasks.FQUAD.value: Dataset(
188
+ name=Tasks.FQUAD.value,
189
+ description="Extractive question answering task : Extract a question's answer from a given context.",
190
+ possible_ground_truths=[],
191
+ hugging_face_repo=COLE_REPOSITORY_NAME,
192
+ line_to_truth_fn=lambda line: line["answers"],
193
+ line_to_prompt_fn=lambda line: PromptBuilder()
194
+ .add_premise(
195
+ "Tu vas recevoir un contexte suivi d'une question.\n"
196
+ "Ta tâche est d'extraire **mot pour mot** le passage du contexte qui répond le mieux à la question.\n"
197
+ "N'invente rien. Ne reformule pas.\n"
198
+ "Réponds **en copiant uniquement** un extrait exact du texte ci-dessus."
199
+ )
200
+ .add_data(f"Contexte : {line['context']}")
201
+ .add_data(f"Question : {line['question']}")
202
+ .add_end(
203
+ "Réponds uniquement par un passage extrait du contexte. La réponse est :"
204
+ )
205
+ .build(),
206
+ line_to_data_fn=lambda line: {
207
+ "context": line["context"],
208
+ "question": line["question"],
209
+ },
210
+ ),
211
+ Tasks.XNLI.value: Dataset(
212
+ name=Tasks.XNLI.value,
213
+ description="Natural language inference task : "
214
+ "predict the relation between two sentences (implication, neutral, contradiction).",
215
+ possible_ground_truths=["0", "1", "2"],
216
+ hugging_face_repo=COLE_REPOSITORY_NAME,
217
+ line_to_truth_fn=lambda line: str(line["label"]),
218
+ line_to_prompt_fn=lambda line: PromptBuilder()
219
+ .add_premise(
220
+ "Quelle est la relation de la deuxième phrase par rapport à la première ?"
221
+ )
222
+ .add_data(rf"premise : {line['premise']}\n" f"sentence 2: {line['hypothesis']}")
223
+ .add_end(
224
+ (
225
+ "Réponds uniquement par :\n"
226
+ "0 - si la deuxième phrase implique la première,\n"
227
+ "1 - si la relation est neutre,\n"
228
+ "2 - s'il y a contradiction.\n"
229
+ "Réponds uniquement par 0, 1 ou 2. La réponse est :"
230
+ )
231
+ )
232
+ .build(),
233
+ line_to_data_fn=lambda line: {
234
+ "premise": line["premise"],
235
+ "hypothesis": line["hypothesis"],
236
+ },
237
+ ),
238
+ Tasks.QFRCORE.value: Dataset(
239
+ name=Tasks.QFRCORE.value,
240
+ description="Definition matching task : "
241
+ "Match the Quebec expression with its definition from a list.",
242
+ possible_ground_truths=[str(i) for i in range(10)],
243
+ hugging_face_repo=COLE_REPOSITORY_NAME,
244
+ line_to_truth_fn=lambda line: str(line["correct_index"]),
245
+ line_to_prompt_fn=lambda line: PromptBuilder()
246
+ .add_premise(
247
+ f"Que veut dire cette expression québécoise « {line['expression']} » ?"
248
+ )
249
+ .add_data(
250
+ "\n".join(
251
+ f"{idx} - {definition}"
252
+ for idx, definition in enumerate(line["choices"])
253
+ )
254
+ )
255
+ .add_end(
256
+ (
257
+ "Réponds uniquement par l'index, débutant à zéro, "
258
+ "de la bonne définition parmi la liste ci-dessus. Par exemple, si la "
259
+ "troisième phrase correspond à l'expression, la réponse sera 2. La réponse est :"
260
+ )
261
+ )
262
+ .build(),
263
+ line_to_data_fn=lambda line: {
264
+ "expression": line["expression"],
265
+ "choices": line["choices"],
266
+ },
267
+ ),
268
+ Tasks.QFRCORT.value: Dataset(
269
+ name=Tasks.QFRCORT.value,
270
+ description="Definition matching task : "
271
+ "Match the Quebec term with its definition from a list.",
272
+ possible_ground_truths=[str(i) for i in range(10)],
273
+ hugging_face_repo=COLE_REPOSITORY_NAME,
274
+ line_to_truth_fn=lambda line: str(line["correct_index"]),
275
+ line_to_prompt_fn=lambda line: PromptBuilder()
276
+ .add_premise(
277
+ f"Qu'est-ce que ça veut dire ce terme québécois « {line['terme']} » ?"
278
+ )
279
+ .add_data(
280
+ "\n".join(
281
+ f"{idx} - {definition}"
282
+ for idx, definition in enumerate(line["choices"])
283
+ )
284
+ )
285
+ .add_end(
286
+ (
287
+ "Réponds uniquement par l'index, débutant à zéro, "
288
+ "de la bonne définition parmi la liste ci-dessus. La réponse est :"
289
+ )
290
+ )
291
+ .build(),
292
+ line_to_data_fn=lambda line: {
293
+ "terme": line["terme"],
294
+ "choices": line["choices"],
295
+ },
296
+ ),
297
+ Tasks.DACCORD.value: Dataset(
298
+ name=Tasks.DACCORD.value,
299
+ description="Paraphrase detection task :"
300
+ "Predict whether the two sentences are compatible (0) "
301
+ "or contradict each other (1).",
302
+ possible_ground_truths=["0", "1"],
303
+ hugging_face_repo=COLE_REPOSITORY_NAME,
304
+ line_to_truth_fn=lambda line: str(line["label"]),
305
+ line_to_prompt_fn=lambda line: (
306
+ PromptBuilder()
307
+ .add_premise("Détermine la relation entre les deux phrases suivantes :")
308
+ .add_data(f"Première phrase : {line['premise']}")
309
+ .add_data(f"Deuxième phrase : {line['hypothesis']}")
310
+ .add_end(
311
+ "Réponds uniquement par :\n"
312
+ "0 - si les deux phrases sont compatibles (elles expriment la même information ou sont cohérentes),\n"
313
+ "1 - s'il y a contradiction entre les deux phrases.\n"
314
+ "Réponds uniquement par 0 ou 1. La réponse est :"
315
+ )
316
+ .build()
317
+ ),
318
+ line_to_data_fn=lambda line: {
319
+ "premise": line["premise"],
320
+ "hypothesis": line["hypothesis"],
321
+ },
322
+ ),
323
+ Tasks.FRENCH_BOOLQ.value: Dataset(
324
+ name=Tasks.FRENCH_BOOLQ.value,
325
+ description="Binary question answering task : "
326
+ "Answer whether the context allows answering 'yes' to the question (1)"
327
+ "or, if the context only allows answering 'no' "
328
+ "to the question or does not answer the question. (0).",
329
+ possible_ground_truths=["0", "1"],
330
+ hugging_face_repo=COLE_REPOSITORY_NAME,
331
+ line_to_truth_fn=lambda line: str(line["label"]),
332
+ line_to_prompt_fn=lambda line: (
333
+ PromptBuilder()
334
+ .add_premise(
335
+ "Lis le passage suivant et réponds à la question en te basant uniquement sur le texte :\n"
336
+ "- Si le passage permet d'affirmer que la réponse à la question est oui, réponds 1.\n"
337
+ "- Sinon, si la réponse est non ou que le passage ne permet pas de répondre à la question, réponds 0."
338
+ )
339
+ .add_data(f"Passage : {line['passage']}")
340
+ .add_data(f"Question : {line['question']}")
341
+ .add_end("La réponse est :")
342
+ .build()
343
+ ),
344
+ line_to_data_fn=lambda line: {
345
+ "question": line["question"],
346
+ "passage": line["passage"],
347
+ },
348
+ ),
349
+     Tasks.MNLI_NINEELEVEN_FR_MT.value: Dataset(
+         name=Tasks.MNLI_NINEELEVEN_FR_MT.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Quelle est la relation de la deuxième phrase par rapport à la première ?"
+         )
+         .add_data(line["premise"])
+         .add_data(line["hypothesis"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la deuxième phrase implique la première,\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+     Tasks.RTE3_FRENCH.value: Dataset(
+         name=Tasks.RTE3_FRENCH.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(line["label"]),
+         line_to_prompt_fn=lambda line: (
+             PromptBuilder()
+             .add_premise(
+                 "Lis le texte suivant et détermine la relation de l'énoncé par rapport au texte."
+             )
+             .add_data(f"Texte : {line['premise']}")
+             .add_data(f"Énoncé : {line['hypothesis']}")
+             .add_end(
+                 "Réponds uniquement par 0, 1 ou 2 :\n"
+                 "0 - si l'énoncé découle logiquement du texte (entailment),\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "La réponse est :"
+             )
+             .build()
+         ),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+     Tasks.WINO_X_LM.value: Dataset(
+         name=Tasks.WINO_X_LM.value,
+         description=(
+             "Pronoun resolution task: predict the correct referent (1 or 2) "
+             "of a pronoun in a sentence by choosing between two candidates."
+         ),
+         possible_ground_truths=["1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(line["answer"]),
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             'Voici une phrase en anglais contenant le pronom "it" dans un sens ambigu et sa traduction en français.'
+         )
+         .add_data(f"Phrase (originale en anglais) : {line['sentence']}")
+         .add_data(
+             f"Traduction en français (le pronom est caché par '_') : {line['context_fr']}"
+         )
+         .add_data("À quoi renvoie ce pronom ? Voici les choix :")
+         .add_data(f"1 : {line['option1_fr']}")
+         .add_data(f"2 : {line['option2_fr']}")
+         .add_end("Réponds uniquement par 1 ou 2. La réponse est :")
+         .build(),
+         line_to_data_fn=lambda line: {
+             "sentence": line["sentence"],
+             "translation": line["context_fr"],
+             "referent1": line["option1_fr"],
+             "referent2": line["option2_fr"],
+         },
+     ),
+     Tasks.WINO_X_MT.value: Dataset(
+         name=Tasks.WINO_X_MT.value,
+         description=(
+             "Pronoun resolution based on translations: choose between two French translations of an English "
+             "sentence with an ambiguous pronoun. The goal is to identify which of the two translations uses "
+             "the correct pronoun (he or she) based on the correct referent."
+         ),
+         possible_ground_truths=["1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(line["answer"]),
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Voici deux traductions d’une phrase anglaise contenant un pronom ambigu :"
+         )
+         .add_data(f"Phrase originale : {line['sentence']}")
+         .add_data(f"Traduction 1 (avec '{line['pronoun1']}') : {line['translation1']}")
+         .add_data(f"Traduction 2 (avec '{line['pronoun2']}') : {line['translation2']}")
+         .add_end(
+             "Quelle traduction utilise le bon pronom en fonction du référent visé dans la phrase originale ?\n"
+             "Réponds uniquement par 1 si la traduction 1 est correcte, ou 2 si la traduction 2 est correcte.\n"
+             "La réponse est :"
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "sentence": line["sentence"],
+             "translation1": line["translation1"],
+             "translation2": line["translation2"],
+             "pronoun1": line["pronoun1"],
+             "pronoun2": line["pronoun2"],
+         },
+     ),
+     Tasks.MULTIBLIMP.value: Dataset(
+         name=Tasks.MULTIBLIMP.value,
+         description="Choice task between two sentences: choose the one that is grammatically correct.",
+         possible_ground_truths=["0", "1"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: str(
+             line["label"]
+         ),  # The label is returned as a string.
+         line_to_prompt_fn=lambda line: (
+             PromptBuilder()
+             .add_premise("Laquelle de ces phrases est grammaticalement correcte ?")
+             .add_data(f"Phrase 0 : {line['sentence_a']}")
+             .add_data(f"Phrase 1 : {line['sentence_b']}")
+             .add_end(
+                 "Réponds avec seulement 0 si la phrase 0 "
+                 "est grammaticalement correcte, et uniquement 1 si la phrase 1 est grammaticalement "
+                 "correcte. La réponse est :"
+             )
+             .build()
+         ),
+         line_to_data_fn=lambda line: {
+             "sentence_a": line["sentence_a"],
+             "sentence_b": line["sentence_b"],
+         },  # A dict, consistent with the other entries (was a set literal).
+     ),
+     Tasks.FRACAS.value: Dataset(
+         name=Tasks.FRACAS.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Quelle est la relation de la deuxième phrase par rapport à la première ?"
+         )
+         .add_data(line["premise"])
+         .add_data(line["hypothesis"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la deuxième phrase implique la première,\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+     Tasks.MMS.value: Dataset(
+         name=Tasks.MMS.value,
+         description="A sentiment analysis task for classifying text as positive (2), negative (0), or neutral (1).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise("Quel est le sentiment de cette phrase ?")
+         .add_data(line["text"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la phrase est négative,\n"
+                 "1 - si la phrase est neutre,\n"
+                 "2 - si la phrase est positive.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "text": line["text"],
+         },
+     ),
+     Tasks.WSD.value: Dataset(
+         name=Tasks.WSD.value,
+         description="Extractive word sense disambiguation: extract the ambiguous word from a sentence.",
+         possible_ground_truths=[],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Tu vas recevoir une phrase contenant un mot ambigu ainsi que les étiquettes du 'part-of-speech tagging "
+             "(PoS)' pour chaque mot de la phrase. Le mot ambigu peut être un verbe ou un adjectif.\n"
+             "Ta tâche est d’indiquer **exactement** ce mot ambigu dans la phrase, sans rien ajouter ni reformuler.\n"
+             "Réponds uniquement avec le mot ambigu identifié."
+         )
+         .add_data(f"Phrase : {line['sentence']}")
+         .add_data(f"Part-of-speech tagging : {line['pos_tag_labels']}")
+         .add_end("La réponse est :")
+         .build(),
+         line_to_data_fn=lambda line: {
+             "sentence": line["sentence"],
+             "pos_tag_labels": line["pos_tag_labels"],
+         },
+     ),
+     Tasks.LINGNLI.value: Dataset(
+         name=Tasks.LINGNLI.value,
+         description="Natural language inference task: "
+         "predict the relation between two sentences (entailment, neutral, contradiction).",
+         possible_ground_truths=["0", "1", "2"],
+         hugging_face_repo=COLE_REPOSITORY_NAME,
+         line_to_truth_fn=lambda line: line["label"],
+         line_to_prompt_fn=lambda line: PromptBuilder()
+         .add_premise(
+             "Quelle est la relation de la deuxième phrase par rapport à la première ?"
+         )
+         .add_data(line["premise"])
+         .add_data(line["hypothesis"])
+         .add_end(
+             (
+                 "Réponds uniquement par :\n"
+                 "0 - si la deuxième phrase implique la première,\n"
+                 "1 - si la relation est neutre,\n"
+                 "2 - s'il y a contradiction.\n"
+                 "Réponds uniquement par 0, 1 ou 2. La réponse est :"
+             )
+         )
+         .build(),
+         line_to_data_fn=lambda line: {
+             "premise": line["premise"],
+             "hypothesis": line["hypothesis"],
+         },
+     ),
+ }
+
+
+ def preload_all_datasets():
+     """Loads all datasets into the cache for later usage."""
+     for dataset in datasets.values():
+         dataset.load_data()
+
+
+ def generate_metadata_dict():
+     """Generates a dictionary with all the datasets' metadata."""
+     metadata_dict = {}
+     for dataset in datasets.values():
+         metadata_dict[dataset.name] = dataset.metadata
+     return metadata_dict
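Editor's note: a minimal sketch of how these registry entries are meant to be consumed. The sample line and its field values are hypothetical, and the import path assumes this file is importable as src.dataset.datasets_data:

    from src.dataset.datasets_data import datasets
    from src.task.task_names import Tasks

    # A hypothetical DACCORD-style line, mirroring the fields the entry above reads.
    sample_line = {"premise": "Il pleut.", "hypothesis": "Il fait beau.", "label": 1}

    daccord = datasets[Tasks.DACCORD.value]
    print(daccord.line_to_prompt_fn(sample_line))  # The full French prompt string.
    print(daccord.line_to_truth_fn(sample_line))   # "1"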
src/dataset/prompt_builder.py ADDED
@@ -0,0 +1,43 @@
+ import logging
+ from typing import List
+
+
+ class PromptBuilder:
+     """Builder class for creating prompt strings with dynamic data."""
+
+     def __init__(self):
+         self.premise: List[str] = []
+         self.end: List[str] = []
+         self.data: List[str] = []
+         self.data_only = False
+
+     def add_data(self, data):
+         self.data.append(data)
+         return self
+
+     def add_end(self, end):
+         self.end.append(end)
+         return self
+
+     def set_data_only(self, data_only):
+         self.data_only = data_only
+         return self
+
+     def add_premise(self, premise):
+         self.premise.append(premise)
+         return self
+
+     def build(self):
+         """Builds and returns the prompt string from the premise, data, and end added to the builder."""
+         if len(self.data) == 0:
+             logging.warning(
+                 "This prompt did not contain any data; was that intentional?"
+             )
+
+         data = "\n".join(self.data)
+         if self.data_only:
+             return data
+
+         end = "".join(self.end)
+         premise = "".join(self.premise)
+         return f"{premise}\n{data}\n{end}"
src/docker_requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # Core
+ python-dotenv
+ numpy
+ python-multipart
+ scikit-learn
+ # Web backend (if you use FastAPI/Flask; otherwise, ignore)
+ fastapi
+ uvicorn
+ # Optional: pretty printing, progress bars, etc.
+ tqdm
+ aenum
+
+ evaluate
+ wheel
+
+ # For backward compatibility
+ protobuf<=3.20.3
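Editor's note: outside the Docker image, these pinned requirements can be installed the usual way, e.g. `pip install -r src/docker_requirements.txt`. Note that torch and wandb, which the evaluation pipeline below imports, are not pinned here.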
src/evaluation/__init__.py ADDED
File without changes
src/evaluation/evaluation_pipeline.py ADDED
@@ -0,0 +1,138 @@
+ import argparse
+ import gc
+ import logging
+ from datetime import datetime
+
+ import torch
+ import wandb
+ from tqdm import tqdm
+
+ from predictions.all_llms import llms
+ from src import WANDB_PROJECT
+ from src.evaluation.llm_evaluator import ModelEvaluator
+ from src.evaluation.llm_factory import model_factory
+ from src.evaluation.tools import split_llm_list
+ from src.task.task_factory import tasks_factory
+ from src.task.task_names import Tasks
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+     "--test",
+     help="If set, the system will default to testing only a small model with a few examples.",
+     # store_true avoids the argparse type=bool pitfall, where any non-empty
+     # string (including "False") is parsed as True.
+     action="store_true",
+ )
+ parser.add_argument(
+     "--max_examples",
+     "-m",
+     help="The maximum number of examples to use; defaults to None.",
+     type=int,
+     default=None,
+ )
+ parser.add_argument(
+     "--models_name",
+     "-mn",
+     help="The name of the model(s) to load: a key of the predefined LLM groups, or a comma-separated list.",
+     type=str,
+     default=None,
+ )
+
+ parser.add_argument(
+     "--batch_size",
+     help="The batch size to use during the evaluation.",
+     type=int,
+     default=32,
+ )
+
+ parser.add_argument(
+     "--llm_split",
+     help="The split of the LLMs list to use. It can be 1, 2 or 3.",
+     type=int,
+     default=None,
+     choices=[1, 2, 3],
+ )
+
+ parser.add_argument(
+     "--skip_first_n",
+     help="The number of LLMs to skip at the start of the split list.",
+     type=int,
+     default=None,
+ )
+
+ args = parser.parse_args()
+
+ tasks_names = list(Tasks)
+
+ tasks = tasks_factory(tasks_names)
+
+ models = []
+ if args.models_name is not None:
+     if args.models_name in llms:
+         models = llms[args.models_name]
+     else:
+         models = args.models_name.split(",")
+ else:
+     models = llms["all"]
+
+ models = split_llm_list(models=models, llm_split=args.llm_split)
+
+ if args.skip_first_n is not None:
+     models = models[args.skip_first_n :]
+
+ logging.info("Starting Evaluation")
+
+ time_start = datetime.now()
+
+ for model_name in tqdm(
+     models, total=len(models), desc="Processing LLM inference on tasks."
+ ):
+     try:
+         logging.info("Creating model")
+         model = model_factory(model_name, batch_size=args.batch_size)
+         evaluator = ModelEvaluator()
+         logging.info("Evaluating model")
+
+         exp_name = model_name
+         wandb.init(
+             project=WANDB_PROJECT,
+             entity="doctorate",
+             config={
+                 "model_name": model_name,
+                 "tasks": "; ".join(task.value for task in tasks_names),
+                 "batch_size": args.batch_size,
+             },
+             name=exp_name,
+         )
+
+         predictions_payload = evaluator.evaluate_subset(model, tasks, args.max_examples)
+         wandb.log(predictions_payload)
+
+         logging.info("Saving results")
+         evaluator.save_results("./results")
+
+         metrics_payload = evaluator.compute_metrics()
+         evaluator.save_metrics("./results")
+         wandb.log(metrics_payload)
+
+     except Exception as e:
+         error_message = f"Evaluation failed for model {model_name}: {e}"
+         logging.error(error_message)
+         wandb.finish(exit_code=1)
+         continue
+     finally:
+         # Memory cleanup
+         if "model" in locals():
+             del model
+         if "evaluator" in locals():
+             del evaluator
+         gc.collect()
+         torch.cuda.empty_cache()
+         # No-op if the run was already finished with a failure exit code above.
+         wandb.finish(exit_code=0)
+
+ time_end = datetime.now()
+ info_message = f"End time: {time_end}"
+ logging.info(info_message)
+ elapsed_time = time_end - time_start
+ info_message = f"Elapsed time: {elapsed_time}"
+ logging.info(info_message)
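Editor's note: reusing the same factories, a single model can also be evaluated programmatically without wandb. A minimal sketch, with the call signatures taken from the pipeline above and a hypothetical model name:

    from src.evaluation.llm_evaluator import ModelEvaluator
    from src.evaluation.llm_factory import model_factory
    from src.task.task_factory import tasks_factory
    from src.task.task_names import Tasks

    tasks = tasks_factory([Tasks.DACCORD])                 # A single task instead of list(Tasks).
    model = model_factory("my-small-model", batch_size=8)  # Hypothetical model name.
    evaluator = ModelEvaluator()
    evaluator.evaluate_subset(model, tasks, 10)            # Cap the run at 10 examples.
    evaluator.save_results("./results")
    metrics = evaluator.compute_metrics()
    evaluator.save_metrics("./results")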