Spaces: Sleeping
Consolidate: resolve app.py conflicts; finalize training Space (#3), opened by SHA888
app.py (changed)
|
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
|
| 8 |
-
from huggingface_hub import HfApi
|
| 9 |
|
| 10 |
|
| 11 |
DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
|
|
@@ -55,13 +55,15 @@ def _train_ner_lora(
|
|
| 55 |
if "train" not in ds:
|
| 56 |
raise RuntimeError("Dataset must have a train split")
|
| 57 |
|
| 58 |
-
#
|
| 59 |
features = ds["train"].features
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
if not token_col or not tag_col:
|
| 63 |
raise RuntimeError(
|
| 64 |
-
"Dataset
|
| 65 |
)
|
| 66 |
|
| 67 |
label_list = ds["train"].features[tag_col].feature.names
|
|
@@ -88,11 +90,7 @@ def _train_ner_lora(
|
|
| 88 |
tokenized = tokenizer(
|
| 89 |
batch[token_col], is_split_into_words=True, truncation=True, padding=False
|
| 90 |
)
|
| 91 |
-
labels
|
| 92 |
-
for i, word_ids in enumerate(tokenized.word_ids(batch_index=None)):
|
| 93 |
-
# The Transformers tokenizer returns word_ids per example only if batch=False; we do per-example loop
|
| 94 |
-
pass
|
| 95 |
-
# Re-run per example to ensure correct mapping
|
| 96 |
new_input_ids = []
|
| 97 |
new_labels = []
|
| 98 |
for tokens, tags in zip(batch[token_col], batch[tag_col]):
|
|
@@ -124,6 +122,8 @@ def _train_ner_lora(
|
|
| 124 |
|
| 125 |
data_collator = DataCollatorForTokenClassification(tokenizer)
|
| 126 |
|
|
|
|
|
|
|
| 127 |
def compute_metrics(p):
|
| 128 |
preds, labels = p
|
| 129 |
preds = preds.argmax(-1)
|
|
@@ -138,12 +138,14 @@ def _train_ner_lora(
|
|
| 138 |
curr_lab.append(id2label[int(l_i)])
|
| 139 |
true_predictions.append(curr_pred)
|
| 140 |
true_labels.append(curr_lab)
|
| 141 |
-
|
| 142 |
"f1": f1_score(true_labels, true_predictions),
|
| 143 |
"precision": precision_score(true_labels, true_predictions),
|
| 144 |
"recall": recall_score(true_labels, true_predictions),
|
| 145 |
"accuracy": accuracy_score(true_labels, true_predictions),
|
| 146 |
}
|
|
|
|
|
|
|
| 147 |
|
| 148 |
training_args = TrainingArguments(
|
| 149 |
output_dir=output_dir,
|
|
@@ -175,6 +177,22 @@ def _train_ner_lora(
|
|
| 175 |
model.save_pretrained(output_dir)
|
| 176 |
tokenizer.save_pretrained(output_dir)
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
# Push to the umbrella repo under checkpoints/
|
| 179 |
api = HfApi()
|
| 180 |
run_name = os.path.basename(output_dir.rstrip("/"))
|
|
@@ -186,10 +204,35 @@ def _train_ner_lora(
|
|
| 186 |
folder_path=output_dir,
|
| 187 |
path_in_repo=path_in_repo,
|
| 188 |
commit_message=f"Add NER LoRA checkpoint ({run_name})",
|
|
|
|
| 189 |
create_pr=True,
|
| 190 |
)
|
| 191 |
log(f"Pushed: {commit}")
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
class TrainerThread:
|
|
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
|
| 8 |
+
from huggingface_hub import HfApi, create_repo
|
| 9 |
|
| 10 |
|
| 11 |
DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
|
|
|
|
| 55 |
if "train" not in ds:
|
| 56 |
raise RuntimeError("Dataset must have a train split")
|
| 57 |
|
| 58 |
+
# Detect token and label columns across common schemas
|
| 59 |
features = ds["train"].features
|
| 60 |
+
token_candidates = ["tokens", "words"]
|
| 61 |
+
tag_candidates = ["ner_tags", "tags", "labels", "ner_tags_general"]
|
| 62 |
+
token_col = next((c for c in token_candidates if c in features), None)
|
| 63 |
+
tag_col = next((c for c in tag_candidates if c in features), None)
|
| 64 |
if not token_col or not tag_col:
|
| 65 |
raise RuntimeError(
|
| 66 |
+
"Dataset must provide token and tag columns. Looked for tokens/words and ner_tags/tags/labels."
|
| 67 |
)
|
| 68 |
|
| 69 |
label_list = ds["train"].features[tag_col].feature.names
|
|
|
|
| 90 |
tokenized = tokenizer(
|
| 91 |
batch[token_col], is_split_into_words=True, truncation=True, padding=False
|
| 92 |
)
|
| 93 |
+
# Build aligned labels per example
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
new_input_ids = []
|
| 95 |
new_labels = []
|
| 96 |
for tokens, tags in zip(batch[token_col], batch[tag_col]):
|
|
|
|
| 122 |
|
| 123 |
data_collator = DataCollatorForTokenClassification(tokenizer)
|
| 124 |
|
| 125 |
+
metrics_holder: Dict[str, float] = {}
|
| 126 |
+
|
| 127 |
def compute_metrics(p):
|
| 128 |
preds, labels = p
|
| 129 |
preds = preds.argmax(-1)
|
|
|
|
| 138 |
curr_lab.append(id2label[int(l_i)])
|
| 139 |
true_predictions.append(curr_pred)
|
| 140 |
true_labels.append(curr_lab)
|
| 141 |
+
out = {
|
| 142 |
"f1": f1_score(true_labels, true_predictions),
|
| 143 |
"precision": precision_score(true_labels, true_predictions),
|
| 144 |
"recall": recall_score(true_labels, true_predictions),
|
| 145 |
"accuracy": accuracy_score(true_labels, true_predictions),
|
| 146 |
}
|
| 147 |
+
metrics_holder.update(out)
|
| 148 |
+
return out
|
| 149 |
|
| 150 |
training_args = TrainingArguments(
|
| 151 |
output_dir=output_dir,
|
|
|
|
| 177 |
model.save_pretrained(output_dir)
|
| 178 |
tokenizer.save_pretrained(output_dir)
|
| 179 |
|
| 180 |
+
# Compose commit description with metrics
|
| 181 |
+
desc_lines = [
|
| 182 |
+
f"base_model: {base_model}",
|
| 183 |
+
f"dataset: {dataset_name}",
|
| 184 |
+
f"epochs: {num_train_epochs}",
|
| 185 |
+
f"batch_size: {per_device_train_batch_size}",
|
| 186 |
+
f"learning_rate: {learning_rate}",
|
| 187 |
+
f"lora_r: {lora_r}",
|
| 188 |
+
f"lora_alpha: {lora_alpha}",
|
| 189 |
+
f"lora_dropout: {lora_dropout}",
|
| 190 |
+
"",
|
| 191 |
+
"metrics:",
|
| 192 |
+
*(f"- {k}: {v:.4f}" for k, v in metrics_holder.items()),
|
| 193 |
+
]
|
| 194 |
+
commit_description = "\n".join(desc_lines)
|
| 195 |
+
|
| 196 |
# Push to the umbrella repo under checkpoints/
|
| 197 |
api = HfApi()
|
| 198 |
run_name = os.path.basename(output_dir.rstrip("/"))
|
|
|
|
| 204 |
folder_path=output_dir,
|
| 205 |
path_in_repo=path_in_repo,
|
| 206 |
commit_message=f"Add NER LoRA checkpoint ({run_name})",
|
| 207 |
+
commit_description=commit_description,
|
| 208 |
create_pr=True,
|
| 209 |
)
|
| 210 |
log(f"Pushed: {commit}")
|
| 211 |
+
|
| 212 |
+
# Also publish to a dedicated med-vllm-* variant repo
|
| 213 |
+
try:
|
| 214 |
+
base_short = base_model.split("/")[-1].replace(" ", "-").lower()
|
| 215 |
+
ds_short = dataset_name.split("/")[-1].replace(" ", "-").lower()
|
| 216 |
+
variant_name = f"Junaidi-AI/med-vllm-ner-{ds_short}-{base_short}-lora-v1"
|
| 217 |
+
log(f"Ensuring repo exists: {variant_name}")
|
| 218 |
+
try:
|
| 219 |
+
create_repo(repo_id=variant_name, repo_type="model", exist_ok=True, private=False)
|
| 220 |
+
except Exception:
|
| 221 |
+
pass
|
| 222 |
+
commit2 = api.upload_folder(
|
| 223 |
+
repo_id=variant_name,
|
| 224 |
+
repo_type="model",
|
| 225 |
+
folder_path=output_dir,
|
| 226 |
+
path_in_repo=".",
|
| 227 |
+
commit_message=f"Initial LoRA checkpoint from {base_model} on {dataset_name}",
|
| 228 |
+
commit_description=commit_description,
|
| 229 |
+
create_pr=False,
|
| 230 |
+
)
|
| 231 |
+
log(f"Variant published: {commit2}")
|
| 232 |
+
except Exception as e:
|
| 233 |
+
log(f"Warning: failed to publish variant repo: {e}")
|
| 234 |
+
|
| 235 |
+
return {"commit": str(commit), "path_in_repo": path_in_repo, "metrics": metrics_holder}
|
| 236 |
|
| 237 |
|
| 238 |
class TrainerThread:
|