Spaces:

Junaidi-AI
/

med-vllm-train

Sleeping

App Files Community

Consolidate: resolve app.py conflicts; finalize training Space

by SHA888 - opened Sep 23, 2025

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+55

-12

Files changed (1) hide show

app.py +55 -12

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any
 import gradio as gr
-from huggingface_hub import HfApi
 DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
@@ -55,13 +55,15 @@ def _train_ner_lora(
     if "train" not in ds:
         raise RuntimeError("Dataset must have a train split")
-    # Expecting CoNLL-style fields
     features = ds["train"].features
-    token_col = "tokens" if "tokens" in features else None
-    tag_col = "ner_tags" if "ner_tags" in features else None
     if not token_col or not tag_col:
         raise RuntimeError(
-            "Dataset does not expose 'tokens' and 'ner_tags'. For medical datasets, add custom preprocessing."
         )
     label_list = ds["train"].features[tag_col].feature.names
@@ -88,11 +90,7 @@ def _train_ner_lora(
         tokenized = tokenizer(
             batch[token_col], is_split_into_words=True, truncation=True, padding=False
         )
-        labels = []
-        for i, word_ids in enumerate(tokenized.word_ids(batch_index=None)):
-            # The Transformers tokenizer returns word_ids per example only if batch=False; we do per-example loop
-            pass
-        # Re-run per example to ensure correct mapping
         new_input_ids = []
         new_labels = []
         for tokens, tags in zip(batch[token_col], batch[tag_col]):
@@ -124,6 +122,8 @@ def _train_ner_lora(
     data_collator = DataCollatorForTokenClassification(tokenizer)
     def compute_metrics(p):
         preds, labels = p
         preds = preds.argmax(-1)
@@ -138,12 +138,14 @@ def _train_ner_lora(
                     curr_lab.append(id2label[int(l_i)])
             true_predictions.append(curr_pred)
             true_labels.append(curr_lab)
-        return {
             "f1": f1_score(true_labels, true_predictions),
             "precision": precision_score(true_labels, true_predictions),
             "recall": recall_score(true_labels, true_predictions),
             "accuracy": accuracy_score(true_labels, true_predictions),
         }
     training_args = TrainingArguments(
         output_dir=output_dir,
@@ -175,6 +177,22 @@ def _train_ner_lora(
     model.save_pretrained(output_dir)
     tokenizer.save_pretrained(output_dir)
     # Push to the umbrella repo under checkpoints/
     api = HfApi()
     run_name = os.path.basename(output_dir.rstrip("/"))
@@ -186,10 +204,35 @@ def _train_ner_lora(
         folder_path=output_dir,
         path_in_repo=path_in_repo,
         commit_message=f"Add NER LoRA checkpoint ({run_name})",
         create_pr=True,
     )
     log(f"Pushed: {commit}")
-    return {"commit": str(commit), "path_in_repo": path_in_repo}
 class TrainerThread:

 import gradio as gr
+from huggingface_hub import HfApi, create_repo
 DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
     if "train" not in ds:
         raise RuntimeError("Dataset must have a train split")
+    # Detect token and label columns across common schemas
     features = ds["train"].features
+    token_candidates = ["tokens", "words"]
+    tag_candidates = ["ner_tags", "tags", "labels", "ner_tags_general"]
+    token_col = next((c for c in token_candidates if c in features), None)
+    tag_col = next((c for c in tag_candidates if c in features), None)
     if not token_col or not tag_col:
         raise RuntimeError(
+            "Dataset must provide token and tag columns. Looked for tokens/words and ner_tags/tags/labels."
         )
     label_list = ds["train"].features[tag_col].feature.names
         tokenized = tokenizer(
             batch[token_col], is_split_into_words=True, truncation=True, padding=False
         )
+        # Build aligned labels per example
         new_input_ids = []
         new_labels = []
         for tokens, tags in zip(batch[token_col], batch[tag_col]):
     data_collator = DataCollatorForTokenClassification(tokenizer)
+    metrics_holder: Dict[str, float] = {}
     def compute_metrics(p):
         preds, labels = p
         preds = preds.argmax(-1)
                     curr_lab.append(id2label[int(l_i)])
             true_predictions.append(curr_pred)
             true_labels.append(curr_lab)
+        out = {
             "f1": f1_score(true_labels, true_predictions),
             "precision": precision_score(true_labels, true_predictions),
             "recall": recall_score(true_labels, true_predictions),
             "accuracy": accuracy_score(true_labels, true_predictions),
         }
+        metrics_holder.update(out)
+        return out
     training_args = TrainingArguments(
         output_dir=output_dir,
     model.save_pretrained(output_dir)
     tokenizer.save_pretrained(output_dir)
+    # Compose commit description with metrics
+    desc_lines = [
+        f"base_model: {base_model}",
+        f"dataset: {dataset_name}",
+        f"epochs: {num_train_epochs}",
+        f"batch_size: {per_device_train_batch_size}",
+        f"learning_rate: {learning_rate}",
+        f"lora_r: {lora_r}",
+        f"lora_alpha: {lora_alpha}",
+        f"lora_dropout: {lora_dropout}",
+        "",
+        "metrics:",
+        *(f"- {k}: {v:.4f}" for k, v in metrics_holder.items()),
+    ]
+    commit_description = "\n".join(desc_lines)
     # Push to the umbrella repo under checkpoints/
     api = HfApi()
     run_name = os.path.basename(output_dir.rstrip("/"))
         folder_path=output_dir,
         path_in_repo=path_in_repo,
         commit_message=f"Add NER LoRA checkpoint ({run_name})",
+        commit_description=commit_description,
         create_pr=True,
     )
     log(f"Pushed: {commit}")
+    # Also publish to a dedicated med-vllm-* variant repo
+    try:
+        base_short = base_model.split("/")[-1].replace(" ", "-").lower()
+        ds_short = dataset_name.split("/")[-1].replace(" ", "-").lower()
+        variant_name = f"Junaidi-AI/med-vllm-ner-{ds_short}-{base_short}-lora-v1"
+        log(f"Ensuring repo exists: {variant_name}")
+        try:
+            create_repo(repo_id=variant_name, repo_type="model", exist_ok=True, private=False)
+        except Exception:
+            pass
+        commit2 = api.upload_folder(
+            repo_id=variant_name,
+            repo_type="model",
+            folder_path=output_dir,
+            path_in_repo=".",
+            commit_message=f"Initial LoRA checkpoint from {base_model} on {dataset_name}",
+            commit_description=commit_description,
+            create_pr=False,
+        )
+        log(f"Variant published: {commit2}")
+    except Exception as e:
+        log(f"Warning: failed to publish variant repo: {e}")
+    return {"commit": str(commit), "path_in_repo": path_in_repo, "metrics": metrics_holder}
 class TrainerThread: