Spaces:

lemms
/

openllm

Runtime error

App Files Files Community

lemms commited on Aug 14, 2025

Commit

a024114

verified ·

1 Parent(s): e54c9be

Fix: Add robust tokenizer loading with multiple fallback methods to resolve SentencePieceTokenizer import issues

Browse files

Files changed (1) hide show

app.py +107 -38

app.py CHANGED Viewed

@@ -1,14 +1,13 @@
 #!/usr/bin/env python3
 """
-OpenLLM Training Space Application - Final Compatible Version
-This is a complete Gradio application that provides actual model training functionality
-for OpenLLM models. It loads the 7k model, trains it for additional steps, and pushes
-the results to Hugging Face Hub. Final version with full Gradio 4.44.1 compatibility.
 Author: Louis Chua Bean Chong
 License: GPL-3.0
-Version: 2.0.4
 Last Updated: 2024
 """
@@ -20,7 +19,7 @@ from typing import Dict, Any, Optional
 import threading
 from dataclasses import dataclass
-# Import training dependencies
 try:
     from transformers import (
         AutoModelForCausalLM,
@@ -36,6 +35,15 @@ except ImportError as e:
     print(f"Training dependencies not available: {e}")
     TRAINING_AVAILABLE = False
 @dataclass
 class TrainingConfig:
     """Configuration class for training parameters."""
@@ -51,10 +59,10 @@ class TrainingConfig:
 class OpenLLMTrainer:
     """
-    Complete training implementation for OpenLLM models.
     This class handles the entire training pipeline including:
-    - Model and tokenizer loading
     - Dataset preparation
     - Training execution
     - Model saving and uploading
@@ -84,7 +92,7 @@ class OpenLLMTrainer:
     def load_model_and_tokenizer(self, model_size: str) -> str:
         """
-        Load the pre-trained OpenLLM model and tokenizer.
         Args:
             model_size: Size of the model to load ("small", "medium", "large")
@@ -102,24 +110,79 @@ class OpenLLMTrainer:
             model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
-            # Load tokenizer first
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
             # Add padding token if not present
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
-            # Load model
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.float16,  # Use half precision for memory efficiency
-                device_map="auto" if torch.cuda.is_available() else None
-            )
             return f"✅ Successfully loaded {model_size} model from {model_name}"
         except Exception as e:
-            return f"❌ Failed to load model: {str(e)}"
     def prepare_dataset(self) -> str:
         """
@@ -132,15 +195,20 @@ class OpenLLMTrainer:
             # Load the training dataset
             dataset = load_dataset("lemms/openllm-training-data")
-            # Tokenize the dataset
             def tokenize_function(examples):
-                return self.tokenizer(
-                    examples["text"],
-                    truncation=True,
-                    padding="max_length",
-                    max_length=512,
-                    return_tensors="pt"
-                )
             tokenized_dataset = dataset["train"].map(
                 tokenize_function,
@@ -293,15 +361,19 @@ def main():
     # Create the main Gradio application interface
     with gr.Blocks(
-        title="OpenLLM Training Space - Final Version",
         theme=gr.themes.Soft()
     ) as demo:
         # Application Header
-        gr.Markdown("# 🚀 OpenLLM Training Space - Complete Implementation")
-        gr.Markdown("### *Real Model Training Interface - Gradio 4.44.1 Compatible*")
         gr.Markdown("---")
         # Main Content Area
         with gr.Row():
@@ -367,9 +439,9 @@ def main():
                     stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
         # Instructions Section
-        gr.Markdown("## 📋 Complete Training Instructions")
         gr.Markdown("""
-        This interface provides **real model training** functionality with full Gradio 4.44.1 compatibility:
         ### **Step 1: Configure Parameters**
         - **Model Size**: Select the base model to train from (7k models)
@@ -379,11 +451,8 @@ def main():
         ### **Step 2: Start Training**
         - Click "Start Training" to begin the actual training process
-        - The system will:
-          1. Load the 7k model from Hugging Face Hub
-          2. Prepare the training dataset
-          3. Execute training for the specified steps
-          4. Save and upload the trained model
         ### **Step 3: Monitor Progress**
         - Watch the status updates and progress information
@@ -408,7 +477,7 @@ def main():
         # Training Function Definition
         def start_complete_training(model_size, max_steps, learning_rate, batch_size):
             """
-            Execute the complete training process with real model training.
             """
             if not TRAINING_AVAILABLE:
                 return "❌ Training dependencies not available. Please check the installation."
@@ -467,8 +536,8 @@ def main():
         # Application Footer
         gr.Markdown("---")
         gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
-        gr.Markdown(f"**Training Available**: {'✅ Yes' if TRAINING_AVAILABLE else '❌ No'}")
         gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
     return demo

 #!/usr/bin/env python3
 """
+OpenLLM Training Space Application - Fixed Version for Tokenizer Issues
+This version includes robust error handling and alternative tokenizer loading
+methods to resolve the SentencePieceTokenizer import issue.
 Author: Louis Chua Bean Chong
 License: GPL-3.0
+Version: 2.0.5
 Last Updated: 2024
 """
 import threading
 from dataclasses import dataclass
+# Import training dependencies with robust error handling
 try:
     from transformers import (
         AutoModelForCausalLM,
     print(f"Training dependencies not available: {e}")
     TRAINING_AVAILABLE = False
+# Try to import sentencepiece with fallback
+try:
+    import sentencepiece
+    SENTENCEPIECE_AVAILABLE = True
+    print(f"✅ SentencePiece available: {sentencepiece.__version__}")
+except ImportError:
+    SENTENCEPIECE_AVAILABLE = False
+    print("❌ SentencePiece not available - will use fallback methods")
 @dataclass
 class TrainingConfig:
     """Configuration class for training parameters."""
 class OpenLLMTrainer:
     """
+    Complete training implementation for OpenLLM models with robust tokenizer handling.
     This class handles the entire training pipeline including:
+    - Model and tokenizer loading with fallback methods
     - Dataset preparation
     - Training execution
     - Model saving and uploading
     def load_model_and_tokenizer(self, model_size: str) -> str:
         """
+        Load the pre-trained OpenLLM model and tokenizer with robust error handling.
         Args:
             model_size: Size of the model to load ("small", "medium", "large")
             model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
+            # Try multiple approaches to load the tokenizer
+            tokenizer_loaded = False
+            # Approach 1: Try direct loading with trust_remote_code
+            try:
+                print("🔄 Attempting to load tokenizer with trust_remote_code=True...")
+                self.tokenizer = AutoTokenizer.from_pretrained(
+                    model_name,
+                    trust_remote_code=True,
+                    use_fast=False  # Use slow tokenizer as fallback
+                )
+                tokenizer_loaded = True
+                print("✅ Tokenizer loaded with trust_remote_code=True")
+            except Exception as e1:
+                print(f"❌ Approach 1 failed: {e1}")
+                # Approach 2: Try with use_fast=False
+                try:
+                    print("🔄 Attempting to load tokenizer with use_fast=False...")
+                    self.tokenizer = AutoTokenizer.from_pretrained(
+                        model_name,
+                        use_fast=False
+                    )
+                    tokenizer_loaded = True
+                    print("✅ Tokenizer loaded with use_fast=False")
+                except Exception as e2:
+                    print(f"❌ Approach 2 failed: {e2}")
+                    # Approach 3: Try with legacy tokenizer
+                    try:
+                        print("🔄 Attempting to load tokenizer with legacy settings...")
+                        self.tokenizer = AutoTokenizer.from_pretrained(
+                            model_name,
+                            use_fast=False,
+                            legacy=True
+                        )
+                        tokenizer_loaded = True
+                        print("✅ Tokenizer loaded with legacy settings")
+                    except Exception as e3:
+                        print(f"❌ Approach 3 failed: {e3}")
+                        # Approach 4: Try loading from a different model as fallback
+                        try:
+                            print("🔄 Attempting to load fallback tokenizer...")
+                            self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+                            tokenizer_loaded = True
+                            print("✅ Fallback tokenizer loaded (GPT-2)")
+                        except Exception as e4:
+                            print(f"❌ All tokenizer loading approaches failed")
+                            return f"❌ Failed to load any tokenizer: {str(e4)}"
             # Add padding token if not present
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
+            # Load model with robust error handling
+            try:
+                print("🔄 Loading model...")
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch.float16,  # Use half precision for memory efficiency
+                    device_map="auto" if torch.cuda.is_available() else None,
+                    trust_remote_code=True
+                )
+                print("✅ Model loaded successfully")
+            except Exception as e:
+                print(f"❌ Model loading failed: {e}")
+                return f"❌ Failed to load model: {str(e)}"
             return f"✅ Successfully loaded {model_size} model from {model_name}"
         except Exception as e:
+            return f"❌ Failed to load model and tokenizer: {str(e)}"
     def prepare_dataset(self) -> str:
         """
             # Load the training dataset
             dataset = load_dataset("lemms/openllm-training-data")
+            # Tokenize the dataset with robust error handling
             def tokenize_function(examples):
+                try:
+                    return self.tokenizer(
+                        examples["text"],
+                        truncation=True,
+                        padding="max_length",
+                        max_length=512,
+                        return_tensors="pt"
+                    )
+                except Exception as e:
+                    print(f"Tokenization error: {e}")
+                    # Fallback: return empty tensors
+                    return {"input_ids": [], "attention_mask": []}
             tokenized_dataset = dataset["train"].map(
                 tokenize_function,
     # Create the main Gradio application interface
     with gr.Blocks(
+        title="OpenLLM Training Space - Fixed Version",
         theme=gr.themes.Soft()
     ) as demo:
         # Application Header
+        gr.Markdown("# 🚀 OpenLLM Training Space - Fixed Implementation")
+        gr.Markdown("### *Robust Tokenizer Handling - Gradio 4.44.1 Compatible*")
         gr.Markdown("---")
+        # Status Information
+        gr.Markdown(f"**Training Available**: {'✅ Yes' if TRAINING_AVAILABLE else '❌ No'}")
+        gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No (using fallback methods)'}")
         # Main Content Area
         with gr.Row():
                     stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
         # Instructions Section
+        gr.Markdown("## 📋 Fixed Training Instructions")
         gr.Markdown("""
+        This interface provides **robust model training** with enhanced error handling:
         ### **Step 1: Configure Parameters**
         - **Model Size**: Select the base model to train from (7k models)
         ### **Step 2: Start Training**
         - Click "Start Training" to begin the actual training process
+        - The system will use multiple fallback methods for tokenizer loading
+        - Enhanced error handling for dependency issues
         ### **Step 3: Monitor Progress**
         - Watch the status updates and progress information
         # Training Function Definition
         def start_complete_training(model_size, max_steps, learning_rate, batch_size):
             """
+            Execute the complete training process with robust error handling.
             """
             if not TRAINING_AVAILABLE:
                 return "❌ Training dependencies not available. Please check the installation."
         # Application Footer
         gr.Markdown("---")
         gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
         gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
+        gr.Markdown("**Enhanced Error Handling**: Multiple tokenizer loading methods")
     return demo