Spaces:

MedInjection-FR
/

README

Running

App Files Community

MedInjection-FR committed on Oct 22, 2025

Commit

e12b63f

verified ·

1 Parent(s): 9ad7c66

Update index.html

Browse files

Files changed (1) hide show

index.html +197 -17

index.html CHANGED Viewed

@@ -1,19 +1,199 @@
 <!doctype html>
-<html>
-	<head>
-		<meta charset="utf-8" />
-		<meta name="viewport" content="width=device-width" />
-		<title>My static Space</title>
-		<link rel="stylesheet" href="style.css" />
-	</head>
-	<body>
-		<div class="card">
-			<h1>Welcome to your static Space!</h1>
-			<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-			<p>
-				Also don't forget to check the
-				<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-			</p>
-		</div>
-	</body>
 </html>

 <!doctype html>
+<html lang="en">
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1" />
+    <title>MedInjection-FR • French Biomedical Instruction Dataset</title>
+    <link rel="stylesheet" href="style.css" />
+    <meta name="description" content="MedInjection-FR: a French biomedical instruction dataset with native, synthetic, and translated components, plus fine-tuned models." />
+  </head>
+  <body>
+    <header class="site-header">
+      <div class="wrap">
+        <h1>MedInjection-FR</h1>
+        <p class="subtitle">A French biomedical instruction dataset and model suite</p>
+        <p class="meta">Native • Synthetic • Translated | 577,577 instruction–response pairs</p>
+        <div class="cta-row">
+          <a class="btn primary" href="#download">Download</a>
+          <a class="btn" href="#models">Models</a>
+          <a class="btn" href="#citation">Cite</a>
+          <a class="btn" href="#contact">Contact</a>
+        </div>
+      </div>
+    </header>
+    <main class="wrap">
+      <!-- Overview -->
+      <section id="overview" class="card">
+        <h2>Overview</h2>
+        <p>
+          MedInjection-FR is a large-scale French biomedical instruction dataset designed to study
+          how the <strong>provenance of supervision</strong> (native, synthetic, translated) affects instruction-tuning of LLMs.
+          The corpus supports multiple-choice QA (single and multi-answer) and open-ended QA,
+          and is released together with a family of fine-tuned baseline models.
+        </p>
+        <ul class="pill-list">
+          <li>Native: <strong>77,247</strong></li>
+          <li>Synthetic: <strong>82,269</strong></li>
+          <li>Translated: <strong>418,061</strong></li>
+          <li>Total: <strong>577,577</strong></li>
+        </ul>
+      </section>
+      <!-- What’s inside -->
+      <section id="composition" class="card">
+        <h2>Composition &amp; Tasks</h2>
+        <div class="grid-2">
+          <div>
+            <h3>Task types</h3>
+            <ul>
+              <li>MCQ (single-answer)</li>
+              <li>MCQU (multiple-answer)</li>
+              <li>OEQ (open-ended QA)</li>
+            </ul>
+            <p class="muted">Counts (all components): OEQ <strong>63,267</strong>, MCQ <strong>59,597</strong>, MCQU <strong>454,713</strong>.</p>
+            <h3>Splits</h3>
+            <table class="clean">
+              <thead>
+                <tr><th>Component</th><th>Train</th><th>Validation</th><th>Test</th><th>Total</th></tr>
+              </thead>
+              <tbody>
+                <tr><td>Native</td><td>57,563</td><td>5,055</td><td>14,629</td><td>77,247</td></tr>
+                <tr><td>Synthetic</td><td>82,269</td><td>—</td><td>—</td><td>82,269</td></tr>
+                <tr><td>Translated</td><td>367,704</td><td>38,337</td><td>12,020</td><td>418,061</td></tr>
+                <tr class="total"><td>Total</td><td>507,536</td><td>43,392</td><td>26,649</td><td>577,577</td></tr>
+              </tbody>
+            </table>
+          </div>
+          <div>
+            <h3>Translation quality (WMT24 biomedical parallel data)</h3>
+            <table class="clean">
+              <thead><tr><th>Model</th><th>BLEU</th><th>COMET</th></tr></thead>
+              <tbody>
+                <tr><td>GPT-4o-mini</td><td>51.01</td><td>0.8751</td></tr>
+                <tr><td>Gemini 2.0 Flash</td><td>53.72</td><td>0.8783</td></tr>
+                <tr class="muted"><td>WMT24 best (ref.)</td><td>53.54</td><td>0.8760</td></tr>
+              </tbody>
+            </table>
+            <p class="muted">Higher is better. These scores indicate strong translation fidelity for the translated subset.</p>
+          </div>
+        </div>
+      </section>
+      <!-- Downloads -->
+      <section id="download" class="card">
+        <h2>Download</h2>
+        <p>Each component is published separately. Use the links below or load via the 🤗 Datasets library.</p>
+        <div class="grid-3 tight">
+          <a class="tile" href="./nat/" target="_blank">
+            <h3>Native</h3>
+            <p>French medical exams, resources, curated QA.</p>
+            <code>dataset/medinjection-fr-nat</code>
+          </a>
+          <a class="tile" href="./syn/" target="_blank">
+            <h3>Synthetic</h3>
+            <p>Generated by GPT-4o from clinical cases and abstracts.</p>
+            <code>dataset/medinjection-fr-syn</code>
+          </a>
+          <a class="tile" href="./trad/" target="_blank">
+            <h3>Translated</h3>
+            <p>FR translations of established EN biomedical sets.</p>
+            <code>dataset/medinjection-fr-trad</code>
+          </a>
+        </div>
+        <h3>Python (🤗 Datasets)</h3>
+        <pre><code class="code">
+from datasets import load_dataset
+# choose one: "nat", "syn", or "trad"
+ds = load_dataset("your-org/medinjection-fr", "nat")  # or "syn", "trad"
+print(ds)
+        </code></pre>
+      </section>
+      <!-- Models -->
+      <section id="models" class="card">
+        <h2>Fine-tuned Models</h2>
+        <p>We release seven instruction-tuned baselines (Qwen-4B-Instruct backbone, DoRA adapters), trained on 30k samples per configuration:</p>
+        <ul class="pill-list">
+          <li>QWEN-4B-NAT</li>
+          <li>QWEN-4B-TRAD</li>
+          <li>QWEN-4B-SYN</li>
+          <li>QWEN-4B-NAT-TRAD</li>
+          <li>QWEN-4B-NAT-SYN</li>
+          <li>QWEN-4B-TRAD-SYN</li>
+          <li>QWEN-4B-COMBO</li>
+        </ul>
+        <div class="grid-3 tight">
+          <a class="tile" href="./models/qwen-4b-nat/" target="_blank"><h3>NAT</h3><p>Best single-source (MCQ/MCQU).</p></a>
+          <a class="tile" href="./models/qwen-4b-nat-trad/" target="_blank"><h3>NAT-TRAD</h3><p>Top mixed configuration.</p></a>
+          <a class="tile" href="./models/qwen-4b-combo/" target="_blank"><h3>COMBO</h3><p>All sources combined.</p></a>
+        </div>
+        <h3>Quick inference (🤗 Transformers)</h3>
+        <pre><code class="code">
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+model_id = "your-org/qwen-4b-nat-trad"  # pick one of the released models
+tok = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
+prompt = "Question: Quelle est la prise en charge initiale d'un OAP ?\nChoix: A) ... B) ... C) ... D) ...\nRépondez par la lettre."
+inputs = tok(prompt, return_tensors="pt").to(model.device)
+out = model.generate(**inputs, max_new_tokens=64)
+print(tok.decode(out[0], skip_special_tokens=True))
+        </code></pre>
+      </section>
+      <!-- Evaluation note -->
+      <section class="card">
+        <h2>Evaluation at a glance</h2>
+        <ul>
+          <li>MCQ/MCQU reported with Exact-Match; MCQU also uses Hamming score.</li>
+          <li>OEQ uses BLEU/ROUGE/METEOR/BERTScore and an LLM-as-a-judge calibrated on <em>human annotations</em> (100 samples).</li>
+          <li>Mixed training (especially <strong>NAT-TRAD</strong>) provides complementary gains over single-source setups.</li>
+        </ul>
+      </section>
+      <!-- Ethics & License -->
+      <section id="ethics" class="card">
+        <h2>Ethics, Intended Use &amp; License</h2>
+        <p class="warning">This dataset and the released models are for <strong>research use only</strong>. They are <strong>not</strong> a substitute for professional medical advice, diagnosis, or treatment.</p>
+        <ul>
+          <li>No protected health information (PHI) is included; sources are compiled from public datasets and teaching material.</li>
+          <li>Evaluation includes human expert checks for a small sample; outputs may still contain errors.</li>
+          <li>Please review the <a href="./LICENSE" target="_blank">LICENSE</a> before use. If unsure, contact the maintainers.</li>
+        </ul>
+      </section>
+      <!-- Citation -->
+      <section id="citation" class="card">
+        <h2>Citation</h2>
+        <p>If you use MedInjection-FR or the models, please cite:</p>
+        <pre><code class="code">
+@inproceedings{medinjection-fr-2025,
+  title   = {MedInjection-FR: Investigating Data Provenance for French Biomedical Instruction Tuning},
+  author  = {Your Name and Coauthors},
+  booktitle = {Proceedings of ...},
+  year    = {2025},
+  note    = {Dataset and models available on Hugging Face}
+}
+        </code></pre>
+      </section>
+      <!-- Contact -->
+      <section id="contact" class="card">
+        <h2>Contact</h2>
+        <p>Questions, feedback, or requests: open an issue on the repo or email <a href="mailto:you@example.com">you@example.com</a>.</p>
+      </section>
+      <footer class="site-footer">
+        <p>© 2025 MedInjection-FR. Built for research and reproducibility.</p>
+      </footer>
+    </main>
+  </body>
 </html>