Consolidate: resolve app.py conflicts; finalize training Space

#3
by SHA888 - opened
Files changed (1) hide show
  1. app.py +55 -12
app.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any
5
 
6
  import gradio as gr
7
 
8
- from huggingface_hub import HfApi
9
 
10
 
11
  DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
@@ -55,13 +55,15 @@ def _train_ner_lora(
55
  if "train" not in ds:
56
  raise RuntimeError("Dataset must have a train split")
57
 
58
- # Expecting CoNLL-style fields
59
  features = ds["train"].features
60
- token_col = "tokens" if "tokens" in features else None
61
- tag_col = "ner_tags" if "ner_tags" in features else None
 
 
62
  if not token_col or not tag_col:
63
  raise RuntimeError(
64
- "Dataset does not expose 'tokens' and 'ner_tags'. For medical datasets, add custom preprocessing."
65
  )
66
 
67
  label_list = ds["train"].features[tag_col].feature.names
@@ -88,11 +90,7 @@ def _train_ner_lora(
88
  tokenized = tokenizer(
89
  batch[token_col], is_split_into_words=True, truncation=True, padding=False
90
  )
91
- labels = []
92
- for i, word_ids in enumerate(tokenized.word_ids(batch_index=None)):
93
- # The Transformers tokenizer returns word_ids per example only if batch=False; we do per-example loop
94
- pass
95
- # Re-run per example to ensure correct mapping
96
  new_input_ids = []
97
  new_labels = []
98
  for tokens, tags in zip(batch[token_col], batch[tag_col]):
@@ -124,6 +122,8 @@ def _train_ner_lora(
124
 
125
  data_collator = DataCollatorForTokenClassification(tokenizer)
126
 
 
 
127
  def compute_metrics(p):
128
  preds, labels = p
129
  preds = preds.argmax(-1)
@@ -138,12 +138,14 @@ def _train_ner_lora(
138
  curr_lab.append(id2label[int(l_i)])
139
  true_predictions.append(curr_pred)
140
  true_labels.append(curr_lab)
141
- return {
142
  "f1": f1_score(true_labels, true_predictions),
143
  "precision": precision_score(true_labels, true_predictions),
144
  "recall": recall_score(true_labels, true_predictions),
145
  "accuracy": accuracy_score(true_labels, true_predictions),
146
  }
 
 
147
 
148
  training_args = TrainingArguments(
149
  output_dir=output_dir,
@@ -175,6 +177,22 @@ def _train_ner_lora(
175
  model.save_pretrained(output_dir)
176
  tokenizer.save_pretrained(output_dir)
177
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  # Push to the umbrella repo under checkpoints/
179
  api = HfApi()
180
  run_name = os.path.basename(output_dir.rstrip("/"))
@@ -186,10 +204,35 @@ def _train_ner_lora(
186
  folder_path=output_dir,
187
  path_in_repo=path_in_repo,
188
  commit_message=f"Add NER LoRA checkpoint ({run_name})",
 
189
  create_pr=True,
190
  )
191
  log(f"Pushed: {commit}")
192
- return {"commit": str(commit), "path_in_repo": path_in_repo}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
 
195
  class TrainerThread:
 
5
 
6
  import gradio as gr
7
 
8
+ from huggingface_hub import HfApi, create_repo
9
 
10
 
11
  DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
 
55
  if "train" not in ds:
56
  raise RuntimeError("Dataset must have a train split")
57
 
58
+ # Detect token and label columns across common schemas
59
  features = ds["train"].features
60
+ token_candidates = ["tokens", "words"]
61
+ tag_candidates = ["ner_tags", "tags", "labels", "ner_tags_general"]
62
+ token_col = next((c for c in token_candidates if c in features), None)
63
+ tag_col = next((c for c in tag_candidates if c in features), None)
64
  if not token_col or not tag_col:
65
  raise RuntimeError(
66
+ "Dataset must provide token and tag columns. Looked for tokens/words and ner_tags/tags/labels."
67
  )
68
 
69
  label_list = ds["train"].features[tag_col].feature.names
 
90
  tokenized = tokenizer(
91
  batch[token_col], is_split_into_words=True, truncation=True, padding=False
92
  )
93
+ # Build aligned labels per example
 
 
 
 
94
  new_input_ids = []
95
  new_labels = []
96
  for tokens, tags in zip(batch[token_col], batch[tag_col]):
 
122
 
123
  data_collator = DataCollatorForTokenClassification(tokenizer)
124
 
125
+ metrics_holder: Dict[str, float] = {}
126
+
127
  def compute_metrics(p):
128
  preds, labels = p
129
  preds = preds.argmax(-1)
 
138
  curr_lab.append(id2label[int(l_i)])
139
  true_predictions.append(curr_pred)
140
  true_labels.append(curr_lab)
141
+ out = {
142
  "f1": f1_score(true_labels, true_predictions),
143
  "precision": precision_score(true_labels, true_predictions),
144
  "recall": recall_score(true_labels, true_predictions),
145
  "accuracy": accuracy_score(true_labels, true_predictions),
146
  }
147
+ metrics_holder.update(out)
148
+ return out
149
 
150
  training_args = TrainingArguments(
151
  output_dir=output_dir,
 
177
  model.save_pretrained(output_dir)
178
  tokenizer.save_pretrained(output_dir)
179
 
180
+ # Compose commit description with metrics
181
+ desc_lines = [
182
+ f"base_model: {base_model}",
183
+ f"dataset: {dataset_name}",
184
+ f"epochs: {num_train_epochs}",
185
+ f"batch_size: {per_device_train_batch_size}",
186
+ f"learning_rate: {learning_rate}",
187
+ f"lora_r: {lora_r}",
188
+ f"lora_alpha: {lora_alpha}",
189
+ f"lora_dropout: {lora_dropout}",
190
+ "",
191
+ "metrics:",
192
+ *(f"- {k}: {v:.4f}" for k, v in metrics_holder.items()),
193
+ ]
194
+ commit_description = "\n".join(desc_lines)
195
+
196
  # Push to the umbrella repo under checkpoints/
197
  api = HfApi()
198
  run_name = os.path.basename(output_dir.rstrip("/"))
 
204
  folder_path=output_dir,
205
  path_in_repo=path_in_repo,
206
  commit_message=f"Add NER LoRA checkpoint ({run_name})",
207
+ commit_description=commit_description,
208
  create_pr=True,
209
  )
210
  log(f"Pushed: {commit}")
211
+
212
+ # Also publish to a dedicated med-vllm-* variant repo
213
+ try:
214
+ base_short = base_model.split("/")[-1].replace(" ", "-").lower()
215
+ ds_short = dataset_name.split("/")[-1].replace(" ", "-").lower()
216
+ variant_name = f"Junaidi-AI/med-vllm-ner-{ds_short}-{base_short}-lora-v1"
217
+ log(f"Ensuring repo exists: {variant_name}")
218
+ try:
219
+ create_repo(repo_id=variant_name, repo_type="model", exist_ok=True, private=False)
220
+ except Exception:
221
+ pass
222
+ commit2 = api.upload_folder(
223
+ repo_id=variant_name,
224
+ repo_type="model",
225
+ folder_path=output_dir,
226
+ path_in_repo=".",
227
+ commit_message=f"Initial LoRA checkpoint from {base_model} on {dataset_name}",
228
+ commit_description=commit_description,
229
+ create_pr=False,
230
+ )
231
+ log(f"Variant published: {commit2}")
232
+ except Exception as e:
233
+ log(f"Warning: failed to publish variant repo: {e}")
234
+
235
+ return {"commit": str(commit), "path_in_repo": path_in_repo, "metrics": metrics_holder}
236
 
237
 
238
  class TrainerThread: