Spaces:
Running
Running
Update index.html
Browse files - index.html +56 -24
index.html
CHANGED
|
@@ -12,7 +12,7 @@
|
|
| 12 |
<div class="wrap">
|
| 13 |
<h1>MedInjection-FR</h1>
|
| 14 |
<p class="subtitle">A French biomedical instruction dataset and model suite</p>
|
| 15 |
-
<p class="meta">Native • Synthetic • Translated |
|
| 16 |
<div class="cta-row">
|
| 17 |
<a class="btn primary" href="#download">Download</a>
|
| 18 |
<a class="btn" href="#models">Models</a>
|
|
@@ -34,9 +34,9 @@
|
|
| 34 |
</p>
|
| 35 |
<ul class="pill-list">
|
| 36 |
<li>Native: <strong>77,247</strong></li>
|
| 37 |
-
<li>Synthetic: <strong>
|
| 38 |
-
<li>Translated: <strong>
|
| 39 |
-
<li>Total: <strong>
|
| 40 |
</ul>
|
| 41 |
</section>
|
| 42 |
|
|
@@ -60,9 +60,9 @@
|
|
| 60 |
</thead>
|
| 61 |
<tbody>
|
| 62 |
<tr><td>Native</td><td>57,563</td><td>5,055</td><td>14,629</td><td>77,247</td></tr>
|
| 63 |
-
<tr><td>Synthetic</td><td>
|
| 64 |
-
<tr><td>Translated</td><td>
|
| 65 |
-
<tr class="total"><td>Total</td><td>
|
| 66 |
</tbody>
|
| 67 |
</table>
|
| 68 |
</div>
|
|
@@ -124,28 +124,66 @@ print(ds)
|
|
| 124 |
<li>QWEN-4B-NAT-TRAD</li>
|
| 125 |
<li>QWEN-4B-NAT-SYN</li>
|
| 126 |
<li>QWEN-4B-TRAD-SYN</li>
|
| 127 |
-
<li>QWEN-4B-
|
| 128 |
</ul>
|
| 129 |
|
| 130 |
<div class="grid-3 tight">
|
| 131 |
<a class="tile" href="./models/qwen-4b-nat/" target="_blank"><h3>NAT</h3><p>Best single-source (MCQ/MCQU).</p></a>
|
| 132 |
<a class="tile" href="./models/qwen-4b-nat-trad/" target="_blank"><h3>NAT-TRAD</h3><p>Top mixed configuration.</p></a>
|
| 133 |
-
<a class="tile" href="./models/
|
| 134 |
</div>
|
| 135 |
|
| 136 |
<h3>Quick inference (🤗 Transformers)</h3>
|
| 137 |
<pre><code class="code">
|
| 138 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 139 |
-
import torch
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
-
prompt = "Question: Quelle est la prise en charge initiale d'un OAP ?\nChoix: A) ... B) ... C) ... D) ...\nRépondez par la lettre."
|
| 146 |
-
inputs = tok(prompt, return_tensors="pt").to(model.device)
|
| 147 |
-
out = model.generate(**inputs, max_new_tokens=64)
|
| 148 |
-
print(tok.decode(out[0], skip_special_tokens=True))
|
| 149 |
</code></pre>
|
| 150 |
</section>
|
| 151 |
|
|
@@ -175,13 +213,7 @@ print(tok.decode(out[0], skip_special_tokens=True))
|
|
| 175 |
<h2>Citation</h2>
|
| 176 |
<p>If you use MedInjection-FR or the models, please cite:</p>
|
| 177 |
<pre><code class="code">
|
| 178 |
-
|
| 179 |
-
title = {MedInjection-FR: Investigating Data Provenance for French Biomedical Instruction Tuning},
|
| 180 |
-
author = {Your Name and Coauthors},
|
| 181 |
-
booktitle = {Proceedings of ...},
|
| 182 |
-
year = {2025},
|
| 183 |
-
note = {Dataset and models available on Hugging Face}
|
| 184 |
-
}
|
| 185 |
</code></pre>
|
| 186 |
</section>
|
| 187 |
|
|
|
|
| 12 |
<div class="wrap">
|
| 13 |
<h1>MedInjection-FR</h1>
|
| 14 |
<p class="subtitle">A French biomedical instruction dataset and model suite</p>
|
| 15 |
+
<p class="meta">Native • Synthetic • Translated | 570,154 instruction–response pairs</p>
|
| 16 |
<div class="cta-row">
|
| 17 |
<a class="btn primary" href="#download">Download</a>
|
| 18 |
<a class="btn" href="#models">Models</a>
|
|
|
|
| 34 |
</p>
|
| 35 |
<ul class="pill-list">
|
| 36 |
<li>Native: <strong>77,247</strong></li>
|
| 37 |
+
<li>Synthetic: <strong>76,506</strong></li>
|
| 38 |
+
<li>Translated: <strong>416,401</strong></li>
|
| 39 |
+
<li>Total: <strong>570,154</strong></li>
|
| 40 |
</ul>
|
| 41 |
</section>
|
| 42 |
|
|
|
|
| 60 |
</thead>
|
| 61 |
<tbody>
|
| 62 |
<tr><td>Native</td><td>57,563</td><td>5,055</td><td>14,629</td><td>77,247</td></tr>
|
| 63 |
+
<tr><td>Synthetic</td><td>76,506</td><td>—</td><td>—</td><td>76,506</td></tr>
|
| 64 |
+
<tr><td>Translated</td><td>366,370</td><td>38,011</td><td>12,020</td><td>416,401</td></tr>
|
| 65 |
+
<tr class="total"><td>Total</td><td>500,439</td><td>43,066</td><td>26,649</td><td>570,154</td></tr>
|
| 66 |
</tbody>
|
| 67 |
</table>
|
| 68 |
</div>
|
|
|
|
| 124 |
<li>QWEN-4B-NAT-TRAD</li>
|
| 125 |
<li>QWEN-4B-NAT-SYN</li>
|
| 126 |
<li>QWEN-4B-TRAD-SYN</li>
|
| 127 |
+
<li>QWEN-4B-ALL</li>
|
| 128 |
</ul>
|
| 129 |
|
| 130 |
<div class="grid-3 tight">
|
| 131 |
<a class="tile" href="./models/qwen-4b-nat/" target="_blank"><h3>NAT</h3><p>Best single-source (MCQ/MCQU).</p></a>
|
| 132 |
<a class="tile" href="./models/qwen-4b-nat-trad/" target="_blank"><h3>NAT-TRAD</h3><p>Top mixed configuration.</p></a>
|
| 133 |
+
<a class="tile" href="./models/QWEN-4B-ALL/" target="_blank"><h3>ALL</h3><p>All sources combined.</p></a>
|
| 134 |
</div>
|
| 135 |
|
| 136 |
<h3>Quick inference (🤗 Transformers)</h3>
|
| 137 |
<pre><code class="code">
|
| 138 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
| 139 |
|
| 140 |
+
model_name = "MedInjection-FR/QWEN-4B-NAT-TRAD"
|
| 141 |
+
|
| 142 |
+
# load the tokenizer and the model
|
| 143 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 144 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 145 |
+
model_name,
|
| 146 |
+
torch_dtype="auto",
|
| 147 |
+
device_map="auto"
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
# prepare the model input
|
| 151 |
+
prompt = """Un professionnel de santé de 54 ans consulte un spécialiste des maladies infectieuses pour un suivi concernant un diagnostic récent d'hépatite C chronique.
|
| 152 |
+
Il s'est initialement présenté avec des symptômes tels que fatigue, malaise et enzymes hépatiques élevées et soupçonne d'avoir contracté l'infection à la suite
|
| 153 |
+
d'une piqûre d'aiguille il y a des années. Malgré le début du traitement, son titre viral reste élevé, ce qui incite le médecin à ajouter un nouveau médicament
|
| 154 |
+
qui inhibe la maturation virale en bloquant la synthèse des protéines. Quel est l'effet indésirable le plus probable de ce médicament ?
|
| 155 |
+
Choix de réponses :
|
| 156 |
+
(A) Uropathie cristalline obstructive
|
| 157 |
+
(B) Suppression de la moelle osseuse
|
| 158 |
+
(C) Insomnie et irritabilité
|
| 159 |
+
(D) Céphalées et photosensibilité
|
| 160 |
+
(E) Rêves lucides
|
| 161 |
+
(F) Hyperbilirubinémie
|
| 162 |
+
(G) Pancréatite
|
| 163 |
+
(H) Neuropathie périphérique
|
| 164 |
+
(I) Augmentation de la créatine kinase
|
| 165 |
+
(J) Alopécie"""
|
| 166 |
+
messages = [
|
| 167 |
+
{"role": "user", "content": prompt}
|
| 168 |
+
]
|
| 169 |
+
text = tokenizer.apply_chat_template(
|
| 170 |
+
messages,
|
| 171 |
+
tokenize=False,
|
| 172 |
+
add_generation_prompt=True,
|
| 173 |
+
)
|
| 174 |
+
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
|
| 175 |
+
|
| 176 |
+
# generate the completion
|
| 177 |
+
generated_ids = model.generate(
|
| 178 |
+
**model_inputs,
|
| 179 |
+
max_new_tokens=1
|
| 180 |
+
)
|
| 181 |
+
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
|
| 182 |
+
|
| 183 |
+
content = tokenizer.decode(output_ids, skip_special_tokens=True)
|
| 184 |
+
|
| 185 |
+
print("content:", content)
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
</code></pre>
|
| 188 |
</section>
|
| 189 |
|
|
|
|
| 213 |
<h2>Citation</h2>
|
| 214 |
<p>If you use MedInjection-FR or the models, please cite:</p>
|
| 215 |
<pre><code class="code">
|
| 216 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
</code></pre>
|
| 218 |
</section>
|
| 219 |
|