feat(loader): prefer BigBio script loader; fallback to parquet; clearer errors

#9
by SHA888 - opened
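
This PR makes the BigBio alias path try the script-based loader first and fall back to Parquet discovery on the refs/convert/parquet revision only when the script loader raises; the no-train-shards error now says how to recover. The script path works only with datasets<3.0, since datasets 3.x dropped support for script-based dataset repos. Below is a minimal sketch of the companion pin, assuming this Space installs from a plain requirements.txt (the file name and exact pins are assumptions, not part of this diff):

# requirements.txt (sketch; pins are an assumption)
datasets<3.0        # keeps script-based BigBio loaders working
huggingface_hub     # HfApi / hf_hub_url for the Parquet fallback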
Files changed (1)
  1. app.py +34 -26
app.py CHANGED
@@ -57,38 +57,46 @@ def _train_ner_lora(
     try:
         # Medical aliases -> BigBio NER configs
         alias_map = {
-            # Use Parquet conversion branch via HTTPS URLs, discovered dynamically
+            # BigBio script-based configs (preferred with datasets<3.0)
             "bc5cdr": ("bigbio/bc5cdr", "bigbio_ner"),
             "ncbi_disease": ("bigbio/ncbi_disease", "bigbio_ner"),
         }
         lower_spec = ds_spec.lower()
         if lower_spec in alias_map:
             repo_id, subset = alias_map[lower_spec]
-            log(f"Using alias mapping (parquet HTTPS): {ds_spec} -> {repo_id} [{subset}]")
-            api = HfApi()
-            # List files at the parquet conversion revision
-            files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", revision="refs/convert/parquet")
-            # Build HTTPS URLs for each split
-            def split_files(split: str):
-                prefix = f"{subset}/{split}-"
-                return [
-                    hf_hub_url(repo_id=repo_id, filename=path, repo_type="dataset", revision="refs/convert/parquet")
-                    for path in files
-                    if path.startswith(prefix) and path.endswith(".parquet")
-                ]
-            train_files = split_files("train")
-            val_files = split_files("validation") or split_files("valid") or split_files("dev")
-            test_files = split_files("test")
-            if not train_files:
-                raise RuntimeError("No train parquet files found for BigBio subset")
-            data_files = {
-                "train": train_files,
-            }
-            if val_files:
-                data_files["validation"] = val_files
-            if test_files:
-                data_files["test"] = test_files
-            ds = load_dataset("parquet", data_files=data_files)
+            # 1) Try script loader first (requires datasets<3.0)
+            try:
+                log(f"Trying BigBio script loader: load_dataset('{repo_id}', '{subset}')")
+                ds = load_dataset(repo_id, subset, trust_remote_code=True)
+            except Exception as e_script:
+                log(f"Script loader failed: {e_script}")
+                # 2) Fall back to Parquet discovery via HTTPS
+                log("Falling back to Parquet discovery via refs/convert/parquet")
+                api = HfApi()
+                files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", revision="refs/convert/parquet")
+                def split_files(split: str):
+                    shard_prefix = f"{subset}/{split}-"
+                    dir_prefix = f"{subset}/{split}/"
+                    out = []
+                    for path in files:
+                        if not path.endswith(".parquet"):
+                            continue
+                        if path.startswith(shard_prefix) or path.startswith(dir_prefix):
+                            out.append(
+                                hf_hub_url(repo_id=repo_id, filename=path, repo_type="dataset", revision="refs/convert/parquet")
+                            )
+                    return sorted(out)
+                train_files = split_files("train")
+                val_files = split_files("validation") or split_files("valid") or split_files("dev")
+                test_files = split_files("test")
+                if not train_files:
+                    raise RuntimeError("No train parquet files found for BigBio subset; merge PR to pin datasets<3.0 or choose another dataset")
+                data_files = {"train": train_files}
+                if val_files:
+                    data_files["validation"] = val_files
+                if test_files:
+                    data_files["test"] = test_files
+                ds = load_dataset("parquet", data_files=data_files)
         elif ":" in ds_spec:
             ds_name, ds_config = [s.strip() for s in ds_spec.split(":", 1)]
             # If loading from community repo, allow remote code
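
For reviewers who want to exercise both paths outside the Space, here is a minimal, self-contained sketch of the same prefer-script/fallback-parquet pattern. REPO_ID, SUBSET, REV, and the print-based logging are illustrative stand-ins, and only the train split is discovered here; HfApi.list_repo_files, hf_hub_url, and load_dataset are the same APIs the diff uses:

from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_url

REPO_ID, SUBSET = "bigbio/ncbi_disease", "bigbio_ner"  # one of the alias_map targets
REV = "refs/convert/parquet"  # the Hub's auto-generated Parquet branch

try:
    # Script loader: needs datasets<3.0, which still supports dataset scripts
    ds = load_dataset(REPO_ID, SUBSET, trust_remote_code=True)
except Exception as err:
    print(f"script loader failed ({err}); falling back to {REV}")
    api = HfApi()
    files = api.list_repo_files(repo_id=REPO_ID, repo_type="dataset", revision=REV)
    # Accept both shard layouts: <subset>/train-*.parquet and <subset>/train/*.parquet
    train = sorted(
        hf_hub_url(repo_id=REPO_ID, filename=p, repo_type="dataset", revision=REV)
        for p in files
        if p.endswith(".parquet")
        and (p.startswith(f"{SUBSET}/train-") or p.startswith(f"{SUBSET}/train/"))
    )
    if not train:
        raise RuntimeError("no train parquet shards found on refs/convert/parquet")
    ds = load_dataset("parquet", data_files={"train": train})

print(ds)

On datasets<3.0 this should print the script-built dataset; on datasets>=3.0 the except branch runs and the train split is read straight from the converted Parquet shards.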