Spaces:
Running
Running
Update index.html
Browse files- index.html +197 -17
index.html
CHANGED
|
@@ -1,19 +1,199 @@
|
|
| 1 |
<!doctype html>
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
</html>
|
|
|
|
| 1 |
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 6 |
+
<title>MedInjection-FR • French Biomedical Instruction Dataset</title>
|
| 7 |
+
<link rel="stylesheet" href="style.css" />
|
| 8 |
+
<meta name="description" content="MedInjection-FR: a French biomedical instruction dataset with native, synthetic, and translated components, plus fine-tuned models." />
|
| 9 |
+
</head>
|
| 10 |
+
<body>
|
| 11 |
+
<header class="site-header">
|
| 12 |
+
<div class="wrap">
|
| 13 |
+
<h1>MedInjection-FR</h1>
|
| 14 |
+
<p class="subtitle">A French biomedical instruction dataset and model suite</p>
|
| 15 |
+
<p class="meta">Native • Synthetic • Translated | 577,577 instruction–response pairs</p>
|
| 16 |
+
<div class="cta-row">
|
| 17 |
+
<a class="btn primary" href="#download">Download</a>
|
| 18 |
+
<a class="btn" href="#models">Models</a>
|
| 19 |
+
<a class="btn" href="#citation">Cite</a>
|
| 20 |
+
<a class="btn" href="#contact">Contact</a>
|
| 21 |
+
</div>
|
| 22 |
+
</div>
|
| 23 |
+
</header>
|
| 24 |
+
|
| 25 |
+
<main class="wrap">
|
| 26 |
+
<!-- Overview -->
|
| 27 |
+
<section id="overview" class="card">
|
| 28 |
+
<h2>Overview</h2>
|
| 29 |
+
<p>
|
| 30 |
+
MedInjection-FR is a large-scale French biomedical instruction dataset designed to study
|
| 31 |
+
how the <strong>provenance of supervision</strong> (native, synthetic, translated) affects instruction-tuning of LLMs.
|
| 32 |
+
The corpus supports multiple-choice QA (single and multi-answer) and open-ended QA,
|
| 33 |
+
and is released together with a family of fine-tuned baseline models.
|
| 34 |
+
</p>
|
| 35 |
+
<ul class="pill-list">
|
| 36 |
+
<li>Native: <strong>77,247</strong></li>
|
| 37 |
+
<li>Synthetic: <strong>82,269</strong></li>
|
| 38 |
+
<li>Translated: <strong>418,061</strong></li>
|
| 39 |
+
<li>Total: <strong>577,577</strong></li>
|
| 40 |
+
</ul>
|
| 41 |
+
</section>
|
| 42 |
+
|
| 43 |
+
<!-- What’s inside -->
|
| 44 |
+
<section id="composition" class="card">
|
| 45 |
+
<h2>Composition &amp; Tasks</h2>
|
| 46 |
+
<div class="grid-2">
|
| 47 |
+
<div>
|
| 48 |
+
<h3>Task types</h3>
|
| 49 |
+
<ul>
|
| 50 |
+
<li>MCQ (single-answer)</li>
|
| 51 |
+
<li>MCQU (multiple-answer)</li>
|
| 52 |
+
<li>OEQ (open-ended QA)</li>
|
| 53 |
+
</ul>
|
| 54 |
+
<p class="muted">Counts (all components): OEQ <strong>63,267</strong>, MCQ <strong>59,597</strong>, MCQU <strong>454,713</strong>.</p>
|
| 55 |
+
|
| 56 |
+
<h3>Splits</h3>
|
| 57 |
+
<table class="clean">
|
| 58 |
+
<thead>
|
| 59 |
+
<tr><th>Component</th><th>Train</th><th>Validation</th><th>Test</th><th>Total</th></tr>
|
| 60 |
+
</thead>
|
| 61 |
+
<tbody>
|
| 62 |
+
<tr><td>Native</td><td>57,563</td><td>5,055</td><td>14,629</td><td>77,247</td></tr>
|
| 63 |
+
<tr><td>Synthetic</td><td>82,269</td><td>—</td><td>—</td><td>82,269</td></tr>
|
| 64 |
+
<tr><td>Translated</td><td>367,704</td><td>38,337</td><td>12,020</td><td>418,061</td></tr>
|
| 65 |
+
<tr class="total"><td>Total</td><td>507,536</td><td>43,392</td><td>26,649</td><td>577,577</td></tr>
|
| 66 |
+
</tbody>
|
| 67 |
+
</table>
|
| 68 |
+
</div>
|
| 69 |
+
<div>
|
| 70 |
+
<h3>Translated quality (WMT24 biomedical parallel)</h3>
|
| 71 |
+
<table class="clean">
|
| 72 |
+
<thead><tr><th>Model</th><th>BLEU</th><th>COMET</th></tr></thead>
|
| 73 |
+
<tbody>
|
| 74 |
+
<tr><td>GPT-4o-mini</td><td>51.01</td><td>0.8751</td></tr>
|
| 75 |
+
<tr><td>Gemini 2.0 Flash</td><td>53.72</td><td>0.8783</td></tr>
|
| 76 |
+
<tr class="muted"><td>WMT24 best (ref.)</td><td>53.54</td><td>0.8760</td></tr>
|
| 77 |
+
</tbody>
|
| 78 |
+
</table>
|
| 79 |
+
<p class="muted">Higher is better. These scores indicate strong translation fidelity for the translated subset.</p>
|
| 80 |
+
</div>
|
| 81 |
+
</div>
|
| 82 |
+
</section>
|
| 83 |
+
|
| 84 |
+
<!-- Downloads -->
|
| 85 |
+
<section id="download" class="card">
|
| 86 |
+
<h2>Download</h2>
|
| 87 |
+
<p>Each component is published separately. Use the links below or load via the 🤗 Datasets library.</p>
|
| 88 |
+
<div class="grid-3 tight">
|
| 89 |
+
<a class="tile" href="./nat/" target="_blank">
|
| 90 |
+
<h3>Native</h3>
|
| 91 |
+
<p>French medical exams, resources, curated QA.</p>
|
| 92 |
+
<code>dataset/medinjection-fr-nat</code>
|
| 93 |
+
</a>
|
| 94 |
+
<a class="tile" href="./syn/" target="_blank">
|
| 95 |
+
<h3>Synthetic</h3>
|
| 96 |
+
<p>GPT-4o generated from clinical cases and abstracts.</p>
|
| 97 |
+
<code>dataset/medinjection-fr-syn</code>
|
| 98 |
+
</a>
|
| 99 |
+
<a class="tile" href="./trad/" target="_blank">
|
| 100 |
+
<h3>Translated</h3>
|
| 101 |
+
<p>FR translations of established EN biomedical sets.</p>
|
| 102 |
+
<code>dataset/medinjection-fr-trad</code>
|
| 103 |
+
</a>
|
| 104 |
+
</div>
|
| 105 |
+
|
| 106 |
+
<h3>Python (🤗 Datasets)</h3>
|
| 107 |
+
<pre><code class="code">
|
| 108 |
+
from datasets import load_dataset
|
| 109 |
+
|
| 110 |
+
# choose one: "nat", "syn", or "trad"
|
| 111 |
+
ds = load_dataset("your-org/medinjection-fr", "nat") # or "syn", "trad"
|
| 112 |
+
print(ds)
|
| 113 |
+
</code></pre>
|
| 114 |
+
</section>
|
| 115 |
+
|
| 116 |
+
<!-- Models -->
|
| 117 |
+
<section id="models" class="card">
|
| 118 |
+
<h2>Fine-tuned Models</h2>
|
| 119 |
+
<p>We release seven instruction-tuned baselines (Qwen-4B-Instruct backbone, DoRA adapters), trained on 30k samples per configuration:</p>
|
| 120 |
+
<ul class="pill-list">
|
| 121 |
+
<li>QWEN-4B-NAT</li>
|
| 122 |
+
<li>QWEN-4B-TRAD</li>
|
| 123 |
+
<li>QWEN-4B-SYN</li>
|
| 124 |
+
<li>QWEN-4B-NAT-TRAD</li>
|
| 125 |
+
<li>QWEN-4B-NAT-SYN</li>
|
| 126 |
+
<li>QWEN-4B-TRAD-SYN</li>
|
| 127 |
+
<li>QWEN-4B-COMBO</li>
|
| 128 |
+
</ul>
|
| 129 |
+
|
| 130 |
+
<div class="grid-3 tight">
|
| 131 |
+
<a class="tile" href="./models/qwen-4b-nat/" target="_blank"><h3>NAT</h3><p>Best single-source (MCQ/MCQU).</p></a>
|
| 132 |
+
<a class="tile" href="./models/qwen-4b-nat-trad/" target="_blank"><h3>NAT-TRAD</h3><p>Top mixed configuration.</p></a>
|
| 133 |
+
<a class="tile" href="./models/qwen-4b-combo/" target="_blank"><h3>COMBO</h3><p>All sources combined.</p></a>
|
| 134 |
+
</div>
|
| 135 |
+
|
| 136 |
+
<h3>Quick inference (🤗 Transformers)</h3>
|
| 137 |
+
<pre><code class="code">
|
| 138 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 139 |
+
import torch
|
| 140 |
+
|
| 141 |
+
model_id = "your-org/qwen-4b-nat-trad" # pick one of the released models
|
| 142 |
+
tok = AutoTokenizer.from_pretrained(model_id)
|
| 143 |
+
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
|
| 144 |
+
|
| 145 |
+
prompt = "Question: Quelle est la prise en charge initiale d'un OAP ?\nChoix: A) ... B) ... C) ... D) ...\nRépondez par la lettre."
|
| 146 |
+
inputs = tok(prompt, return_tensors="pt").to(model.device)
|
| 147 |
+
out = model.generate(**inputs, max_new_tokens=64)
|
| 148 |
+
print(tok.decode(out[0], skip_special_tokens=True))
|
| 149 |
+
</code></pre>
|
| 150 |
+
</section>
|
| 151 |
+
|
| 152 |
+
<!-- Evaluation note -->
|
| 153 |
+
<section class="card">
|
| 154 |
+
<h2>Evaluation at a glance</h2>
|
| 155 |
+
<ul>
|
| 156 |
+
<li>MCQ and MCQU are reported with Exact Match; MCQU is additionally scored with the Hamming score.</li>
|
| 157 |
+
<li>OEQ uses BLEU/ROUGE/METEOR/BERTScore and an LLM-as-a-judge calibrated on <em>human annotations</em> (100 samples).</li>
|
| 158 |
+
<li>Mixed training (especially <strong>NAT-TRAD</strong>) provides complementary gains over single-source setups.</li>
|
| 159 |
+
</ul>
|
| 160 |
+
</section>
|
| 161 |
+
|
| 162 |
+
<!-- Ethics & License -->
|
| 163 |
+
<section id="ethics" class="card">
|
| 164 |
+
<h2>Ethics, Intended Use &amp; License</h2>
|
| 165 |
+
<p class="warning">This dataset and the released models are for <strong>research use only</strong>. They are <strong>not</strong> a substitute for professional medical advice, diagnosis, or treatment.</p>
|
| 166 |
+
<ul>
|
| 167 |
+
<li>No protected health information (PHI) is included; sources are compiled from public datasets and teaching material.</li>
|
| 168 |
+
<li>Evaluation includes human expert checks for a small sample; outputs may still contain errors.</li>
|
| 169 |
+
<li>Please review the <a href="./LICENSE" target="_blank">LICENSE</a> before use. If unsure, contact the maintainers.</li>
|
| 170 |
+
</ul>
|
| 171 |
+
</section>
|
| 172 |
+
|
| 173 |
+
<!-- Citation -->
|
| 174 |
+
<section id="citation" class="card">
|
| 175 |
+
<h2>Citation</h2>
|
| 176 |
+
<p>If you use MedInjection-FR or the models, please cite:</p>
|
| 177 |
+
<pre><code class="code">
|
| 178 |
+
@inproceedings{medinjection-fr-2025,
|
| 179 |
+
title = {MedInjection-FR: Investigating Data Provenance for French Biomedical Instruction Tuning},
|
| 180 |
+
author = {Your Name and Coauthors},
|
| 181 |
+
booktitle = {Proceedings of ...},
|
| 182 |
+
year = {2025},
|
| 183 |
+
note = {Dataset and models available on Hugging Face}
|
| 184 |
+
}
|
| 185 |
+
</code></pre>
|
| 186 |
+
</section>
|
| 187 |
+
|
| 188 |
+
<!-- Contact -->
|
| 189 |
+
<section id="contact" class="card">
|
| 190 |
+
<h2>Contact</h2>
|
| 191 |
+
<p>Questions, feedback, or requests: open an issue on the repo or email <a href="mailto:you@example.com">you@example.com</a>.</p>
|
| 192 |
+
</section>
|
| 193 |
+
|
| 194 |
+
<footer class="site-footer">
|
| 195 |
+
<p>© 2025 MedInjection-FR. Built for research and reproducibility.</p>
|
| 196 |
+
</footer>
|
| 197 |
+
</main>
|
| 198 |
+
</body>
|
| 199 |
</html>
|