fix(dataset): list parquet via HfApi and load via HTTPS data_files; add dataset dropdown

#7
by SHA888 - opened
Files changed (1):
  1. app.py (+27 −9)
app.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any
5
 
6
  import gradio as gr
7
 
8
- from huggingface_hub import HfApi, create_repo
9
 
10
 
11
  DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
@@ -57,19 +57,37 @@ def _train_ner_lora(
57
  try:
58
  # Medical aliases -> BigBio NER configs
59
  alias_map = {
60
- # Use Parquet conversion branch via hf:// scheme
61
- "bc5cdr": "hf://datasets/bigbio/bc5cdr@refs/convert/parquet/bigbio_ner",
62
- "ncbi_disease": "hf://datasets/bigbio/ncbi_disease@refs/convert/parquet/bigbio_ner",
63
  }
64
  lower_spec = ds_spec.lower()
65
  if lower_spec in alias_map:
66
- base = alias_map[lower_spec]
67
- log(f"Using alias mapping (parquet): {ds_spec} -> {base}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  data_files = {
69
- "train": f"{base}/train-*.parquet",
70
- "validation": f"{base}/validation-*.parquet",
71
- "test": f"{base}/test-*.parquet",
72
  }
 
 
 
 
73
  ds = load_dataset("parquet", data_files=data_files)
74
  elif ":" in ds_spec:
75
  ds_name, ds_config = [s.strip() for s in ds_spec.split(":", 1)]
 
5
 
6
  import gradio as gr
7
 
8
+ from huggingface_hub import HfApi, create_repo, hf_hub_url
9
 
10
 
11
  DEFAULT_BASE_MODEL = "dmis-lab/biobert-base-cased-v1.2"
 
57
  try:
58
  # Medical aliases -> BigBio NER configs
59
  alias_map = {
60
+ # Use Parquet conversion branch via HTTPS URLs, discovered dynamically
61
+ "bc5cdr": ("bigbio/bc5cdr", "bigbio_ner"),
62
+ "ncbi_disease": ("bigbio/ncbi_disease", "bigbio_ner"),
63
  }
64
  lower_spec = ds_spec.lower()
65
  if lower_spec in alias_map:
66
+ repo_id, subset = alias_map[lower_spec]
67
+ log(f"Using alias mapping (parquet HTTPS): {ds_spec} -> {repo_id} [{subset}]")
68
+ api = HfApi()
69
+ # List files at the parquet conversion revision
70
+ files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", revision="refs/convert/parquet")
71
+ # Build HTTPS URLs for each split
72
+ def split_files(split: str):
73
+ prefix = f"{subset}/{split}-"
74
+ return [
75
+ hf_hub_url(repo_id=repo_id, filename=path, repo_type="dataset", revision="refs/convert/parquet")
76
+ for path in files
77
+ if path.startswith(prefix) and path.endswith(".parquet")
78
+ ]
79
+ train_files = split_files("train")
80
+ val_files = split_files("validation") or split_files("valid") or split_files("dev")
81
+ test_files = split_files("test")
82
+ if not train_files:
83
+ raise RuntimeError("No train parquet files found for BigBio subset")
84
  data_files = {
85
+ "train": train_files,
 
 
86
  }
87
+ if val_files:
88
+ data_files["validation"] = val_files
89
+ if test_files:
90
+ data_files["test"] = test_files
91
  ds = load_dataset("parquet", data_files=data_files)
92
  elif ":" in ds_spec:
93
  ds_name, ds_config = [s.strip() for s in ds_spec.split(":", 1)]