Spaces:
Sleeping
Sleeping
feat(dataset): use Parquet hf:// for BigBio (bc5cdr, ncbi_disease) and add dataset dropdown
#6
by
SHA888
- opened
app.py
CHANGED
|
@@ -57,14 +57,20 @@ def _train_ner_lora(
|
|
| 57 |
try:
|
| 58 |
# Medical aliases -> BigBio NER configs
|
| 59 |
alias_map = {
|
| 60 |
-
|
| 61 |
-
"
|
|
|
|
| 62 |
}
|
| 63 |
lower_spec = ds_spec.lower()
|
| 64 |
if lower_spec in alias_map:
|
| 65 |
-
|
| 66 |
-
log(f"Using alias mapping: {ds_spec} -> {
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
elif ":" in ds_spec:
|
| 69 |
ds_name, ds_config = [s.strip() for s in ds_spec.split(":", 1)]
|
| 70 |
# If loading from community repo, allow remote code
|
|
@@ -324,7 +330,17 @@ def build_ui():
|
|
| 324 |
)
|
| 325 |
with gr.Row():
|
| 326 |
base_model = gr.Textbox(value=DEFAULT_BASE_MODEL, label="Base model")
|
| 327 |
-
dataset_name = gr.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
with gr.Row():
|
| 329 |
epochs = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="Epochs")
|
| 330 |
batch = gr.Slider(minimum=4, maximum=16, step=2, value=8, label="Batch size")
|
|
|
|
| 57 |
try:
|
| 58 |
# Medical aliases -> BigBio NER configs
|
| 59 |
alias_map = {
|
| 60 |
+
# Use Parquet conversion branch via hf:// scheme
|
| 61 |
+
"bc5cdr": "hf://datasets/bigbio/bc5cdr@refs/convert/parquet/bigbio_ner",
|
| 62 |
+
"ncbi_disease": "hf://datasets/bigbio/ncbi_disease@refs/convert/parquet/bigbio_ner",
|
| 63 |
}
|
| 64 |
lower_spec = ds_spec.lower()
|
| 65 |
if lower_spec in alias_map:
|
| 66 |
+
base = alias_map[lower_spec]
|
| 67 |
+
log(f"Using alias mapping (parquet): {ds_spec} -> {base}")
|
| 68 |
+
data_files = {
|
| 69 |
+
"train": f"{base}/train-*.parquet",
|
| 70 |
+
"validation": f"{base}/validation-*.parquet",
|
| 71 |
+
"test": f"{base}/test-*.parquet",
|
| 72 |
+
}
|
| 73 |
+
ds = load_dataset("parquet", data_files=data_files)
|
| 74 |
elif ":" in ds_spec:
|
| 75 |
ds_name, ds_config = [s.strip() for s in ds_spec.split(":", 1)]
|
| 76 |
# If loading from community repo, allow remote code
|
|
|
|
| 330 |
)
|
| 331 |
with gr.Row():
|
| 332 |
base_model = gr.Textbox(value=DEFAULT_BASE_MODEL, label="Base model")
|
| 333 |
+
dataset_name = gr.Dropdown(
|
| 334 |
+
choices=[
|
| 335 |
+
"bc5cdr",
|
| 336 |
+
"ncbi_disease",
|
| 337 |
+
"wikiann:en",
|
| 338 |
+
"conll2003",
|
| 339 |
+
],
|
| 340 |
+
value=DEFAULT_DATASET,
|
| 341 |
+
allow_custom_value=True,
|
| 342 |
+
label="Dataset (token classification)",
|
| 343 |
+
)
|
| 344 |
with gr.Row():
|
| 345 |
epochs = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="Epochs")
|
| 346 |
batch = gr.Slider(minimum=4, maximum=16, step=2, value=8, label="Batch size")
|