shelfgot committed on
Commit
d5a48ff
·
verified ·
1 Parent(s): be1bc6c

Update train.py

Browse files
Files changed (1) hide show
  1. train.py +32 -4
train.py CHANGED
@@ -117,10 +117,33 @@ def train_model(training_data_text: str):
117
  if len(all_texts) == 0:
118
  raise ValueError("No training data provided")
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  # Stratify ensures the split has a similar distribution of labels
121
- train_texts, test_texts, train_labels, test_labels = train_test_split(
122
- all_texts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
123
- )
 
 
 
 
 
 
 
124
 
125
  print(f"\nTotal samples: {len(all_texts)}")
126
  print(f"Training set size: {len(train_texts)} (80%)")
@@ -185,7 +208,8 @@ def train_model(training_data_text: str):
185
  print(f"Validation Accuracy for Fold {fold+1}: {accuracy:.2f}%")
186
 
187
  # Save the best model found across all folds
188
- if accuracy > best_val_accuracy:
 
189
  best_val_accuracy = accuracy
190
  best_model_state = copy.deepcopy(model.state_dict())
191
 
@@ -193,6 +217,10 @@ def train_model(training_data_text: str):
193
  print(f"Fold Accuracies: {[f'{acc:.2f}%' for acc in fold_results]}")
194
  print(f"Average CV Accuracy: {np.mean(fold_results):.2f}%")
195
 
 
 
 
 
196
  # Final Evaluation on the Held-Out Test Set
197
  print("\n----- Final Evaluation on Test Set -----")
198
  final_model = TalmudClassifierLSTM(len(word_to_idx), EMBEDDING_DIM, HIDDEN_DIM, num_classes)
 
117
  if len(all_texts) == 0:
118
  raise ValueError("No training data provided")
119
 
120
+ # Check for sufficient data and multiple classes
121
+ unique_labels = set(all_labels)
122
+ num_classes = len(unique_labels)
123
+
124
+ if num_classes < 2:
125
+ raise ValueError(f"Training data must contain at least 2 different classes. Found {num_classes} class(es).")
126
+
127
+ if len(all_texts) < 10:
128
+ raise ValueError(f"Training data must contain at least 10 samples. Found {len(all_texts)} samples.")
129
+
130
+ # Check if we have enough samples per class for stratification
131
+ # Stratification requires at least 2 samples per class for a 80/20 split
132
+ min_samples_per_class = min(all_labels.count(label) for label in unique_labels)
133
+ if min_samples_per_class < 2:
134
+ raise ValueError(f"Each class must have at least 2 samples for train/test split. Minimum samples per class: {min_samples_per_class}")
135
+
136
  # Stratify ensures the split has a similar distribution of labels
137
+ # Only use stratify if we have multiple classes and sufficient samples
138
+ try:
139
+ train_texts, test_texts, train_labels, test_labels = train_test_split(
140
+ all_texts, all_labels, test_size=0.2, random_state=42, stratify=all_labels
141
+ )
142
+ except ValueError as e:
143
+ # If stratification fails (e.g., insufficient samples per class), fall back to non-stratified split
144
+ if "least 2 samples" in str(e) or "class" in str(e).lower():
145
+ raise ValueError(f"Stratification failed: {str(e)}. Ensure each class has at least 2 samples.")
146
+ raise
147
 
148
  print(f"\nTotal samples: {len(all_texts)}")
149
  print(f"Training set size: {len(train_texts)} (80%)")
 
208
  print(f"Validation Accuracy for Fold {fold+1}: {accuracy:.2f}%")
209
 
210
  # Save the best model found across all folds
211
+ # Always save at least the first fold's model, or if this fold is better
212
+ if best_model_state is None or accuracy >= best_val_accuracy:
213
  best_val_accuracy = accuracy
214
  best_model_state = copy.deepcopy(model.state_dict())
215
 
 
217
  print(f"Fold Accuracies: {[f'{acc:.2f}%' for acc in fold_results]}")
218
  print(f"Average CV Accuracy: {np.mean(fold_results):.2f}%")
219
 
220
+ # Verify that we have a model state to load
221
+ if best_model_state is None:
222
+ raise RuntimeError("No model state was saved during cross-validation. This should not happen.")
223
+
224
  # Final Evaluation on the Held-Out Test Set
225
  print("\n----- Final Evaluation on Test Set -----")
226
  final_model = TalmudClassifierLSTM(len(word_to_idx), EMBEDDING_DIM, HIDDEN_DIM, num_classes)