msmaje committed
Commit e532a61 · verified · 1 parent: 715c9e3

Update app.py

Files changed (1):
  1. app.py +12 -12
app.py CHANGED
```diff
@@ -73,13 +73,13 @@ status_manager = StatusManager()
 # --- Model Loading ---
 def initialize_model_background():
     """
-    Loads the base pre-trained language model (DialoGPT-medium) and its tokenizer
+    Loads the base pre-trained language model (distilgpt2) and its tokenizer
     in a background thread to keep the Gradio UI responsive.
     """
     global model, tokenizer
 
     try:
-        status_manager.update_status("🔄 Loading base DialoGPT-medium model...", 10)
+        status_manager.update_status("🔄 Loading base distilgpt2 model...", 10)
 
         # Clear CUDA cache if a GPU is available to free up memory before loading a new model
         if torch.cuda.is_available():
```
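The context line closing this hunk shows the pre-load cleanup step: the app frees cached GPU memory before pulling in a new model. A minimal sketch of that pattern, assuming only what the context lines show (the standalone helper name and the `synchronize()` call are illustrative additions, not part of app.py):

```python
import torch

def clear_gpu_cache() -> None:
    """Hypothetical helper mirroring the cleanup in initialize_model_background()."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()   # hand cached, unused blocks back to the allocator
        torch.cuda.synchronize()   # let in-flight kernels finish before reloading
```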
```diff
@@ -87,8 +87,8 @@ def initialize_model_background():
 
         status_manager.update_status("🔄 Downloading model weights (this might take a while)...", 30)
 
-        # Model name for Microsoft DialoGPT-medium, a good general-purpose conversational model
-        model_name = "microsoft/DialoGPT-medium"
+        # Changed model to distilgpt2 for lighter computation
+        model_name = "distilgpt2"
 
         # Load the tokenizer associated with the model
         tokenizer = AutoTokenizer.from_pretrained(
```
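For reference, here is the model swap in context, as a minimal sketch. Only `AutoTokenizer.from_pretrained` is visible in the hunk; the `AutoModelForCausalLM` call and the pad-token fallback are assumptions (GPT-2-family tokenizers, distilgpt2's included, ship without a pad token, so reusing EOS is a common workaround):

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "distilgpt2"  # previously "microsoft/DialoGPT-medium"

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # assumed fallback, not shown in the diff

model = AutoModelForCausalLM.from_pretrained(model_name)
```

distilgpt2 (6 transformer layers, roughly 82M parameters) is a fraction of DialoGPT-medium's ~345M, which is the "lighter computation" the new comment refers to.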
```diff
@@ -164,14 +164,15 @@ def prepare_model_for_training():
         status_manager.update_status("✅ Model already prepared for training", 100)
         return "✅ Model already prepared for training"
 
-    # Define LoRA configuration. Target modules are specific to DialoGPT's architecture.
+    # Define LoRA configuration. Target modules are specific to distilgpt2's architecture.
     lora_config = LoraConfig(
         task_type=TaskType.CAUSAL_LM,
         r=8,  # LoRA attention dimension (e.g., 8, 16, 32)
         lora_alpha=16,  # Alpha parameter for LoRA scaling
         lora_dropout=0.1,  # Dropout probability for LoRA layers
         bias="none",  # Bias type (none, all, lora_only)
-        target_modules=["c_attn", "c_proj"],  # Key attention and projection layers in DialoGPT
+        # Adjusted target modules for distilgpt2
+        target_modules=["c_attn", "c_proj", "c_fc"],
     )
 
     # Apply LoRA to the base model, making only a small portion trainable
```
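As a standalone sketch, the updated adapter setup looks like this (assuming `model` is the distilgpt2 instance loaded above). distilgpt2 keeps GPT-2's Conv1D module names, and peft matches `target_modules` entries as name suffixes, so `"c_attn"`/`"c_proj"` cover the attention block while the newly added `"c_fc"` (plus the MLP's own `c_proj`) extends LoRA to the feed-forward path:

```python
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                # LoRA rank
    lora_alpha=16,      # scaling factor (effective scale = alpha / r)
    lora_dropout=0.1,
    bias="none",
    target_modules=["c_attn", "c_proj", "c_fc"],  # GPT-2-style Conv1D modules
)

peft_model = get_peft_model(model, lora_config)  # `model` assumed from the loading step
peft_model.print_trainable_parameters()  # reports trainable vs. total parameter counts
```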
```diff
@@ -362,11 +363,11 @@ def train_model_background(batch_size, grad_accum, epochs, lr):
     """
     global model, tokenizer, trainer, training_stats, train_dataset, eval_dataset
 
-    # Enable PyTorch anomaly detection for debugging in-place operation errors
-    # WARNING: This can significantly slow down training, use only for debugging.
-    # It will provide a detailed traceback to pinpoint the exact problematic operation.
-    torch.autograd.set_detect_anomaly(True)
-    print("PyTorch anomaly detection is ENABLED. Training may be slower but will provide detailed error traces.")
+    # Disable PyTorch anomaly detection for faster training.
+    # Re-enable if in-place modification errors persist with the new model.
+    # torch.autograd.set_detect_anomaly(True)
+    # print("PyTorch anomaly detection is ENABLED. Training may be slower but will provide detailed error traces.")
+    print("PyTorch anomaly detection is DISABLED for faster training.")
 
     try:
         # Step 1: Ensure dataset is loaded and ready
```
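The removed lines flipped PyTorch's global debugging switch on for every run; the commit keeps them only as comments. When an in-place error does need hunting, the scoped context manager confines the slowdown to one block instead of the whole training job. A sketch with toy tensors (illustrative only, not from app.py):

```python
import torch

# Scoped form: anomaly detection is active only inside this block, so normal
# training runs keep full speed.
with torch.autograd.detect_anomaly():
    x = torch.randn(4, requires_grad=True)
    loss = (x * 2.0).sum()
    loss.backward()  # on failure, autograd names the forward op behind the bad gradient
```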
```diff
@@ -689,4 +690,3 @@ def main():
 
 if __name__ == "__main__":
     main()
-
```
 