feat(dataset): use Parquet hf:// for BigBio (bc5cdr, ncbi_disease) and add dataset dropdown

PR #6, opened by SHA888
Files changed (1)
  1. app.py +22 -6
app.py CHANGED
@@ -57,14 +57,20 @@ def _train_ner_lora(
57
  try:
58
  # Medical aliases -> BigBio NER configs
59
  alias_map = {
60
- "bc5cdr": ("bigbio/bc5cdr", "bigbio_ner"),
61
- "ncbi_disease": ("bigbio/ncbi_disease", "bigbio_ner"),
 
62
  }
63
  lower_spec = ds_spec.lower()
64
  if lower_spec in alias_map:
65
- ds_name, ds_config = alias_map[lower_spec]
66
- log(f"Using alias mapping: {ds_spec} -> {ds_name}:{ds_config}")
67
- ds = load_dataset(ds_name, ds_config, trust_remote_code=True)
 
 
 
 
 
68
  elif ":" in ds_spec:
69
  ds_name, ds_config = [s.strip() for s in ds_spec.split(":", 1)]
70
  # If loading from community repo, allow remote code
@@ -324,7 +330,17 @@ def build_ui():
324
  )
325
  with gr.Row():
326
  base_model = gr.Textbox(value=DEFAULT_BASE_MODEL, label="Base model")
327
- dataset_name = gr.Textbox(value=DEFAULT_DATASET, label="Dataset (token classification)")
 
 
 
 
 
 
 
 
 
 
328
  with gr.Row():
329
  epochs = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="Epochs")
330
  batch = gr.Slider(minimum=4, maximum=16, step=2, value=8, label="Batch size")
 
57
  try:
58
  # Medical aliases -> BigBio NER configs
59
  alias_map = {
60
+ # Use Parquet conversion branch via hf:// scheme
61
+ "bc5cdr": "hf://datasets/bigbio/bc5cdr@refs/convert/parquet/bigbio_ner",
62
+ "ncbi_disease": "hf://datasets/bigbio/ncbi_disease@refs/convert/parquet/bigbio_ner",
63
  }
64
  lower_spec = ds_spec.lower()
65
  if lower_spec in alias_map:
66
+ base = alias_map[lower_spec]
67
+ log(f"Using alias mapping (parquet): {ds_spec} -> {base}")
68
+ data_files = {
69
+ "train": f"{base}/train-*.parquet",
70
+ "validation": f"{base}/validation-*.parquet",
71
+ "test": f"{base}/test-*.parquet",
72
+ }
73
+ ds = load_dataset("parquet", data_files=data_files)
74
  elif ":" in ds_spec:
75
  ds_name, ds_config = [s.strip() for s in ds_spec.split(":", 1)]
76
  # If loading from community repo, allow remote code
 
330
  )
331
  with gr.Row():
332
  base_model = gr.Textbox(value=DEFAULT_BASE_MODEL, label="Base model")
333
+ dataset_name = gr.Dropdown(
334
+ choices=[
335
+ "bc5cdr",
336
+ "ncbi_disease",
337
+ "wikiann:en",
338
+ "conll2003",
339
+ ],
340
+ value=DEFAULT_DATASET,
341
+ allow_custom_value=True,
342
+ label="Dataset (token classification)",
343
+ )
344
  with gr.Row():
345
  epochs = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="Epochs")
346
  batch = gr.Slider(minimum=4, maximum=16, step=2, value=8, label="Batch size")