Sina1138 committed on
Commit
d2ab04b
·
1 Parent(s): b9432ba

Fix model loading utility and improve error handling in scoring pipeline

Browse files
dependencies/scoring_utils.py CHANGED
@@ -31,42 +31,50 @@ def find_available_years(data_dir: Path) -> list:
31
  return sorted(years)
32
 
33
 
34
- def load_model_and_tokenizer(model_dir: Path, device: str = "cuda"):
 
 
 
 
 
 
 
 
 
35
  """
36
- Load a model and tokenizer from a local directory.
37
-
38
  Args:
39
- model_dir: Path to directory containing model (config.json, pytorch_model.bin, etc.)
40
  device: Device to load model onto ("cuda" or "cpu")
41
-
 
42
  Returns:
43
- Tuple of (tokenizer, model)
44
-
45
- Raises:
46
- FileNotFoundError: If model directory doesn't exist or is missing model files
47
  """
48
- if not model_dir.exists():
49
- raise FileNotFoundError(f"Model directory not found: {model_dir}")
50
-
51
- # Check for required files
52
- required_files = ["config.json", "pytorch_model.bin"]
53
- for required_file in required_files:
54
- if not (model_dir / required_file).exists():
55
- raise FileNotFoundError(f"Missing {required_file} in {model_dir}")
56
-
 
57
  try:
58
- tokenizer = AutoTokenizer.from_pretrained(str(model_dir))
59
- model = AutoModelForSequenceClassification.from_pretrained(str(model_dir))
60
  model.eval()
61
-
62
  # Move to device
63
  device_obj = torch.device(device if torch.cuda.is_available() else "cpu")
64
  model.to(device_obj)
65
-
66
  return tokenizer, model, device_obj
67
-
68
  except Exception as e:
69
- raise RuntimeError(f"Failed to load model from {model_dir}: {e}")
70
 
71
 
72
  def predict_batch(sentences: list, tokenizer, model, device, max_length: int = 512) -> list:
@@ -199,15 +207,18 @@ def load_polarity_model(model_variant: str, base_dir: Path, device: str = "cuda"
199
  "deberta": base_dir / "alternative_polarity" / "deberta" / "deberta_v3_base_polarity_final_model",
200
  "scideberta": base_dir / "alternative_polarity" / "scideberta" / "scideberta_full_polarity_final_model",
201
  }
202
-
 
 
 
203
  if model_variant not in variant_map:
204
  raise ValueError(
205
  f"Unknown polarity model variant: {model_variant}. "
206
  f"Supported: {list(variant_map.keys())}"
207
  )
208
-
209
  model_dir = variant_map[model_variant]
210
- return load_model_and_tokenizer(model_dir, device)
211
 
212
 
213
  def load_topic_model(model_variant: str, base_dir: Path, device: str = "cuda"):
@@ -236,15 +247,18 @@ def load_topic_model(model_variant: str, base_dir: Path, device: str = "cuda"):
236
  "deberta": base_dir / "alternative_topic" / "deberta" / "final_model",
237
  "scideberta": base_dir / "alternative_topic" / "scideberta" / "final_model",
238
  }
239
-
 
 
 
240
  if model_variant not in variant_map:
241
  raise ValueError(
242
  f"Unknown topic model variant: {model_variant}. "
243
  f"Supported: {list(variant_map.keys())}"
244
  )
245
-
246
  model_dir = variant_map[model_variant]
247
- return load_model_and_tokenizer(model_dir, device)
248
 
249
 
250
  # Topic label mapping
 
31
  return sorted(years)
32
 
33
 
34
+ def _local_model_available(model_dir: Path) -> bool:
35
+ """Check if a local model directory has the required files."""
36
+ if not model_dir.exists():
37
+ return False
38
+ # Accept either pytorch_model.bin or safetensors
39
+ has_weights = (model_dir / "pytorch_model.bin").exists() or (model_dir / "model.safetensors").exists()
40
+ return has_weights and (model_dir / "config.json").exists()
41
+
42
+
43
+ def load_model_and_tokenizer(model_dir: Path, device: str = "cuda", hub_fallback: str = None):
44
  """
45
+ Load a model and tokenizer from a local directory, or fall back to HuggingFace Hub.
46
+
47
  Args:
48
+ model_dir: Path to local model directory
49
  device: Device to load model onto ("cuda" or "cpu")
50
+ hub_fallback: HuggingFace Hub model ID to use if local files are missing
51
+
52
  Returns:
53
+ Tuple of (tokenizer, model, device_obj)
 
 
 
54
  """
55
+ model_source = str(model_dir)
56
+
57
+ if not _local_model_available(model_dir):
58
+ if hub_fallback:
59
+ print(f" Local model not found at {model_dir}")
60
+ print(f" Falling back to HuggingFace Hub: {hub_fallback}")
61
+ model_source = hub_fallback
62
+ else:
63
+ raise FileNotFoundError(f"Model not found at {model_dir} and no hub fallback configured")
64
+
65
  try:
66
+ tokenizer = AutoTokenizer.from_pretrained(model_source)
67
+ model = AutoModelForSequenceClassification.from_pretrained(model_source)
68
  model.eval()
69
+
70
  # Move to device
71
  device_obj = torch.device(device if torch.cuda.is_available() else "cpu")
72
  model.to(device_obj)
73
+
74
  return tokenizer, model, device_obj
75
+
76
  except Exception as e:
77
+ raise RuntimeError(f"Failed to load model from {model_source}: {e}")
78
 
79
 
80
  def predict_batch(sentences: list, tokenizer, model, device, max_length: int = 512) -> list:
 
207
  "deberta": base_dir / "alternative_polarity" / "deberta" / "deberta_v3_base_polarity_final_model",
208
  "scideberta": base_dir / "alternative_polarity" / "scideberta" / "scideberta_full_polarity_final_model",
209
  }
210
+ hub_fallback_map = {
211
+ "scibert": "Sina1138/Scibert_polarity_Review",
212
+ }
213
+
214
  if model_variant not in variant_map:
215
  raise ValueError(
216
  f"Unknown polarity model variant: {model_variant}. "
217
  f"Supported: {list(variant_map.keys())}"
218
  )
219
+
220
  model_dir = variant_map[model_variant]
221
+ return load_model_and_tokenizer(model_dir, device, hub_fallback=hub_fallback_map.get(model_variant))
222
 
223
 
224
  def load_topic_model(model_variant: str, base_dir: Path, device: str = "cuda"):
 
247
  "deberta": base_dir / "alternative_topic" / "deberta" / "final_model",
248
  "scideberta": base_dir / "alternative_topic" / "scideberta" / "final_model",
249
  }
250
+ hub_fallback_map = {
251
+ "scibert": "Sina1138/SciDeberta_Review",
252
+ }
253
+
254
  if model_variant not in variant_map:
255
  raise ValueError(
256
  f"Unknown topic model variant: {model_variant}. "
257
  f"Supported: {list(variant_map.keys())}"
258
  )
259
+
260
  model_dir = variant_map[model_variant]
261
+ return load_model_and_tokenizer(model_dir, device, hub_fallback=hub_fallback_map.get(model_variant))
262
 
263
 
264
  # Topic label mapping
pipeline/run_scoring.py CHANGED
@@ -124,8 +124,10 @@ def run_full_pipeline(
124
  return True
125
 
126
  except Exception as e:
 
127
  print(f"\n{'='*60}")
128
  print(f"✗ Pipeline failed for {year}: {e}")
 
129
  print(f"{'='*60}")
130
  return False
131
 
 
124
  return True
125
 
126
  except Exception as e:
127
+ import traceback
128
  print(f"\n{'='*60}")
129
  print(f"✗ Pipeline failed for {year}: {e}")
130
+ traceback.print_exc()
131
  print(f"{'='*60}")
132
  return False
133