Spaces:

lemms
/

openllm

Runtime error

App Files Files Community

lemms committed on Aug 14, 2025

Commit

d6ce9fb

verified ·

1 Parent(s): 4744625

Fix: Use sentencepiece.SentencePieceProcessor() like local training code instead of AutoTokenizer

Browse files

Files changed (1) hide show

app.py +171 -58

app.py CHANGED Viewed

@@ -1,13 +1,15 @@
 #!/usr/bin/env python3
 """
-OpenLLM Training Space Application - Fixed Version for Tokenizer Issues
-This version includes robust error handling and alternative tokenizer loading
-methods to resolve the SentencePieceTokenizer import issue.
 Author: Louis Chua Bean Chong
 License: GPL-3.0
-Version: 2.0.5
 Last Updated: 2024
 """
@@ -23,7 +25,6 @@ from dataclasses import dataclass
 try:
     from transformers import (
         AutoModelForCausalLM,
-        AutoTokenizer,
         TrainingArguments,
         Trainer,
         DataCollatorForLanguageModeling
@@ -37,9 +38,9 @@ except ImportError as e:
 # Try to import sentencepiece with fallback
 try:
-    import sentencepiece
     SENTENCEPIECE_AVAILABLE = True
-    print(f"✅ SentencePiece available: {sentencepiece.__version__}")
 except ImportError:
     SENTENCEPIECE_AVAILABLE = False
     print("❌ SentencePiece not available - will use fallback methods")
@@ -59,10 +60,11 @@ class TrainingConfig:
 class OpenLLMTrainer:
     """
-    Complete training implementation for OpenLLM models with robust tokenizer handling.
     This class handles the entire training pipeline including:
-    - Model and tokenizer loading with fallback methods
     - Dataset preparation
     - Training execution
     - Model saving and uploading
@@ -92,7 +94,7 @@ class OpenLLMTrainer:
     def load_model_and_tokenizer(self, model_size: str) -> str:
         """
-        Load the pre-trained OpenLLM model and tokenizer with robust error handling.
         Args:
             model_size: Size of the model to load ("small", "medium", "large")
@@ -110,47 +112,100 @@ class OpenLLMTrainer:
             model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
-            # Load OpenLLM custom tokenizer with trust_remote_code
             try:
-                print("🔄 Loading OpenLLM custom tokenizer...")
-                self.tokenizer = AutoTokenizer.from_pretrained(
                     model_name,
-                    trust_remote_code=True,  # CRITICAL for OpenLLM custom tokenizer classes
-                    use_fast=False          # Use slow tokenizer for compatibility
                 )
-                print(f"✅ OpenLLM custom tokenizer loaded: {type(self.tokenizer).__name__}")
-                # Add padding token if not present
-                if self.tokenizer.pad_token is None:
-                    self.tokenizer.pad_token = self.tokenizer.eos_token
-                    print("✅ Added padding token")
             except Exception as e:
-                print(f"❌ Failed to load OpenLLM custom tokenizer: {e}")
-                return f"❌ Failed to load OpenLLM custom tokenizer: {str(e)}"
-            # Add padding token if not present
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-            # Load model with robust error handling
             try:
-                print("🔄 Loading model...")
-                self.model = AutoModelForCausalLM.from_pretrained(
-                    model_name,
-                    torch_dtype=torch.float16,  # Use half precision for memory efficiency
-                    device_map="auto" if torch.cuda.is_available() else None,
-                    trust_remote_code=True
                 )
-                print("✅ Model loaded successfully")
             except Exception as e:
-                print(f"❌ Model loading failed: {e}")
-                return f"❌ Failed to load model: {str(e)}"
-            return f"✅ Successfully loaded {model_size} model from {model_name}"
         except Exception as e:
-            return f"❌ Failed to load model and tokenizer: {str(e)}"
     def prepare_dataset(self) -> str:
         """
@@ -161,23 +216,49 @@ class OpenLLMTrainer:
         """
         try:
             # Load the training dataset
             dataset = load_dataset("lemms/openllm-training-data")
-            # Tokenize the dataset with robust error handling
             def tokenize_function(examples):
                 try:
-                    return self.tokenizer(
-                        examples["text"],
-                        truncation=True,
-                        padding="max_length",
-                        max_length=512,
-                        return_tensors="pt"
-                    )
                 except Exception as e:
                     print(f"Tokenization error: {e}")
                     # Fallback: return empty tensors
                     return {"input_ids": [], "attention_mask": []}
             tokenized_dataset = dataset["train"].map(
                 tokenize_function,
                 batched=True,
@@ -185,6 +266,7 @@ class OpenLLMTrainer:
             )
             self.dataset = tokenized_dataset
             return f"✅ Successfully prepared dataset with {len(tokenized_dataset)} samples"
@@ -263,6 +345,8 @@ class OpenLLMTrainer:
             self.training_progress["status"] = "Training"
             self.training_progress["total_steps"] = config.max_steps
             # Start training
             train_result = self.trainer.train()
@@ -271,10 +355,13 @@ class OpenLLMTrainer:
             self.training_progress["current_step"] = config.max_steps
             self.training_progress["loss"] = train_result.training_loss
             return f"✅ Training completed successfully! Final loss: {train_result.training_loss:.4f}"
         except Exception as e:
             self.training_progress["status"] = "Failed"
             return f"❌ Training failed: {str(e)}"
         finally:
             self.is_training = False
@@ -290,9 +377,29 @@ class OpenLLMTrainer:
             Status message indicating success or failure
         """
         try:
             # Save the model locally
             self.trainer.save_model()
-            self.tokenizer.save_pretrained(config.output_dir)
             # Generate model name for upload
             model_name = f"openllm-{config.model_size}-extended-8k"
@@ -300,6 +407,8 @@ class OpenLLMTrainer:
             # Upload to Hugging Face Hub
             if self.hf_api:
                 # Upload model files
                 self.hf_api.upload_folder(
                     folder_path=config.output_dir,
@@ -308,11 +417,13 @@ class OpenLLMTrainer:
                     commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
                 )
                 return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
             else:
                 return f"✅ Model saved locally to {config.output_dir}"
         except Exception as e:
             return f"❌ Failed to save/upload model: {str(e)}"
     def get_training_progress(self) -> Dict[str, Any]:
@@ -329,18 +440,19 @@ def main():
     # Create the main Gradio application interface
     with gr.Blocks(
-        title="OpenLLM Training Space - Fixed Version",
         theme=gr.themes.Soft()
     ) as demo:
         # Application Header
-        gr.Markdown("# 🚀 OpenLLM Training Space - Fixed Implementation")
-        gr.Markdown("### *Robust Tokenizer Handling - Gradio 4.44.1 Compatible*")
         gr.Markdown("---")
         # Status Information
         gr.Markdown(f"**Training Available**: {'✅ Yes' if TRAINING_AVAILABLE else '❌ No'}")
         gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No (using fallback methods)'}")
         # Main Content Area
         with gr.Row():
@@ -407,9 +519,9 @@ def main():
                     stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
         # Instructions Section
-        gr.Markdown("## 📋 Fixed Training Instructions")
         gr.Markdown("""
-        This interface provides **robust model training** with enhanced error handling:
         ### **Step 1: Configure Parameters**
         - **Model Size**: Select the base model to train from (7k models)
@@ -419,8 +531,9 @@ def main():
         ### **Step 2: Start Training**
         - Click "Start Training" to begin the actual training process
-        - The system will use multiple fallback methods for tokenizer loading
-        - Enhanced error handling for dependency issues
         ### **Step 3: Monitor Progress**
         - Watch the status updates and progress information
@@ -445,7 +558,7 @@ def main():
         # Training Function Definition
         def start_complete_training(model_size, max_steps, learning_rate, batch_size):
             """
-            Execute the complete training process with robust error handling.
             """
             if not TRAINING_AVAILABLE:
                 return "❌ Training dependencies not available. Please check the installation."
@@ -459,7 +572,7 @@ def main():
                     batch_size=batch_size
                 )
-                # Step 1: Load model and tokenizer
                 status = trainer.load_model_and_tokenizer(model_size)
                 if "❌" in status:
                     return status
@@ -505,7 +618,7 @@ def main():
         gr.Markdown("---")
         gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
         gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
-        gr.Markdown("**Enhanced Error Handling**: Multiple tokenizer loading methods")
     return demo

 #!/usr/bin/env python3
 """
+OpenLLM Training Space Application - Local Training Code Compatible
+This version uses the same tokenizer loading approach as the local OpenLLM training code:
+- Uses sentencepiece.SentencePieceProcessor() directly
+- Loads tokenizer from tokenizer.model file
+- Compatible with OpenLLM's actual implementation
 Author: Louis Chua Bean Chong
 License: GPL-3.0
+Version: 2.0.7
 Last Updated: 2024
 """
 try:
     from transformers import (
         AutoModelForCausalLM,
         TrainingArguments,
         Trainer,
         DataCollatorForLanguageModeling
 # Try to import sentencepiece with fallback
 try:
+    import sentencepiece as spm
     SENTENCEPIECE_AVAILABLE = True
+    print(f"✅ SentencePiece available: {spm.__version__}")
 except ImportError:
     SENTENCEPIECE_AVAILABLE = False
     print("❌ SentencePiece not available - will use fallback methods")
 class OpenLLMTrainer:
     """
+    Complete training implementation for OpenLLM models using local training approach.
     This class handles the entire training pipeline including:
+    - Model loading with trust_remote_code for custom model classes
+    - Tokenizer loading using sentencepiece.SentencePieceProcessor() (same as local code)
     - Dataset preparation
     - Training execution
     - Model saving and uploading
     def load_model_and_tokenizer(self, model_size: str) -> str:
         """
+        Load the pre-trained OpenLLM model and tokenizer using local training approach.
         Args:
             model_size: Size of the model to load ("small", "medium", "large")
             model_name = model_mapping.get(model_size, "lemms/openllm-small-extended-7k")
+            print(f"🔄 Loading OpenLLM model: {model_name}")
+            print("📝 Using local training approach: sentencepiece.SentencePieceProcessor()")
+            # Load model with trust_remote_code for custom model classes
             try:
+                print("🔄 Loading OpenLLM model...")
+                self.model = AutoModelForCausalLM.from_pretrained(
                     model_name,
+                    torch_dtype=torch.float16,  # Use half precision for memory efficiency
+                    device_map="auto" if torch.cuda.is_available() else None,
+                    trust_remote_code=True      # CRITICAL for custom model classes
                 )
+                print(f"✅ OpenLLM model loaded successfully: {type(self.model).__name__}")
             except Exception as e:
+                print(f"❌ Failed to load model: {e}")
+                return f"❌ Failed to load OpenLLM model: {str(e)}"
+            # Load tokenizer using the same approach as local training code
             try:
+                print("🔄 Loading tokenizer using sentencepiece.SentencePieceProcessor()...")
+                # Create a custom tokenizer class that wraps SentencePieceProcessor
+                # This is needed for Hugging Face Trainer compatibility
+                class OpenLLMTokenizer:
+                    def __init__(self, sp_processor):
+                        self.sp_processor = sp_processor
+                        self.pad_token = "<pad>"
+                        self.eos_token = "</s>"
+                        self.bos_token = "<s>"
+                        self.unk_token = "<unk>"
+                    def __call__(self, texts, **kwargs):
+                        """Tokenize texts using SentencePieceProcessor."""
+                        if isinstance(texts, str):
+                            texts = [texts]
+                        results = []
+                        for text in texts:
+                            # Encode text to token IDs
+                            token_ids = self.sp_processor.encode(text)
+                            # Create attention mask (all tokens are attended to)
+                            attention_mask = [1] * len(token_ids)
+                            results.append({
+                                'input_ids': token_ids,
+                                'attention_mask': attention_mask
+                            })
+                        return results
+                    def encode(self, text, **kwargs):
+                        """Encode text to token IDs."""
+                        return self.sp_processor.encode(text)
+                    def decode(self, token_ids, **kwargs):
+                        """Decode token IDs to text."""
+                        return self.sp_processor.decode(token_ids)
+                    def save_pretrained(self, path):
+                        """Save tokenizer files."""
+                        # The SentencePieceProcessor is already saved as tokenizer.model
+                        pass
+                # Download and load the tokenizer.model file
+                from huggingface_hub import hf_hub_download
+                print("🔄 Downloading tokenizer.model from HF Hub...")
+                tokenizer_path = hf_hub_download(
+                    repo_id=model_name,
+                    filename="tokenizer.model"
                 )
+                print(f"✅ Tokenizer downloaded to: {tokenizer_path}")
+                # Load using SentencePieceProcessor (same as local code)
+                sp_processor = spm.SentencePieceProcessor()
+                sp_processor.load(tokenizer_path)
+                # Wrap in our custom tokenizer class for HF Trainer compatibility
+                self.tokenizer = OpenLLMTokenizer(sp_processor)
+                print(f"✅ Tokenizer loaded successfully using SentencePieceProcessor")
+                print(f"   Vocabulary size: {sp_processor.vocab_size()}")
             except Exception as e:
+                print(f"❌ Failed to load tokenizer: {e}")
+                return f"❌ Failed to load OpenLLM tokenizer: {str(e)}"
+            return f"✅ Successfully loaded OpenLLM {model_size} model from {model_name}"
         except Exception as e:
+            return f"❌ Failed to load OpenLLM model and tokenizer: {str(e)}"
     def prepare_dataset(self) -> str:
         """
         """
         try:
             # Load the training dataset
+            print("🔄 Loading training dataset...")
             dataset = load_dataset("lemms/openllm-training-data")
+            print(f"✅ Dataset loaded: {len(dataset['train'])} samples")
+            # Tokenize the dataset using our custom tokenizer
             def tokenize_function(examples):
                 try:
+                    # Use our custom tokenizer
+                    tokenized = self.tokenizer(examples["text"])
+                    # Extract input_ids and attention_mask
+                    input_ids = [item['input_ids'] for item in tokenized]
+                    attention_mask = [item['attention_mask'] for item in tokenized]
+                    # Pad sequences to max_length
+                    max_length = 512
+                    padded_input_ids = []
+                    padded_attention_mask = []
+                    for ids, mask in zip(input_ids, attention_mask):
+                        if len(ids) > max_length:
+                            ids = ids[:max_length]
+                            mask = mask[:max_length]
+                        else:
+                            # Pad with pad_token_id
+                            pad_length = max_length - len(ids)
+                            ids = ids + [0] * pad_length  # 0 is pad_token_id
+                            mask = mask + [0] * pad_length
+                        padded_input_ids.append(ids)
+                        padded_attention_mask.append(mask)
+                    return {
+                        "input_ids": padded_input_ids,
+                        "attention_mask": padded_attention_mask
+                    }
                 except Exception as e:
                     print(f"Tokenization error: {e}")
                     # Fallback: return empty tensors
                     return {"input_ids": [], "attention_mask": []}
+            print("🔄 Tokenizing dataset...")
             tokenized_dataset = dataset["train"].map(
                 tokenize_function,
                 batched=True,
             )
             self.dataset = tokenized_dataset
+            print(f"✅ Dataset tokenized successfully: {len(tokenized_dataset)} samples")
             return f"✅ Successfully prepared dataset with {len(tokenized_dataset)} samples"
             self.training_progress["status"] = "Training"
             self.training_progress["total_steps"] = config.max_steps
+            print(f"🚀 Starting OpenLLM training for {config.max_steps} steps...")
             # Start training
             train_result = self.trainer.train()
             self.training_progress["current_step"] = config.max_steps
             self.training_progress["loss"] = train_result.training_loss
+            print(f"✅ Training completed! Final loss: {train_result.training_loss:.4f}")
             return f"✅ Training completed successfully! Final loss: {train_result.training_loss:.4f}"
         except Exception as e:
             self.training_progress["status"] = "Failed"
+            print(f"❌ Training failed: {e}")
             return f"❌ Training failed: {str(e)}"
         finally:
             self.is_training = False
             Status message indicating success or failure
         """
         try:
+            print("🔄 Saving trained model...")
             # Save the model locally
             self.trainer.save_model()
+            # Save tokenizer files
+            if hasattr(self.tokenizer, 'sp_processor'):
+                # Save the SentencePieceProcessor files
+                tokenizer_dir = os.path.join(config.output_dir, "tokenizer")
+                os.makedirs(tokenizer_dir, exist_ok=True)
+                # Copy the original tokenizer.model file
+                import shutil
+                from huggingface_hub import hf_hub_download
+                model_name = f"lemms/openllm-{config.model_size}-extended-7k"
+                tokenizer_path = hf_hub_download(
+                    repo_id=model_name,
+                    filename="tokenizer.model"
+                )
+                shutil.copy2(tokenizer_path, os.path.join(tokenizer_dir, "tokenizer.model"))
+            print("✅ Model saved locally")
             # Generate model name for upload
             model_name = f"openllm-{config.model_size}-extended-8k"
             # Upload to Hugging Face Hub
             if self.hf_api:
+                print(f"🔄 Uploading model to {repo_id}...")
                 # Upload model files
                 self.hf_api.upload_folder(
                     folder_path=config.output_dir,
                     commit_message=f"Add trained OpenLLM {config.model_size} model (8k steps)"
                 )
+                print(f"✅ Model uploaded successfully to {repo_id}")
                 return f"✅ Model saved and uploaded to https://huggingface.co/{repo_id}"
             else:
                 return f"✅ Model saved locally to {config.output_dir}"
         except Exception as e:
+            print(f"❌ Failed to save/upload model: {e}")
             return f"❌ Failed to save/upload model: {str(e)}"
     def get_training_progress(self) -> Dict[str, Any]:
     # Create the main Gradio application interface
     with gr.Blocks(
+        title="OpenLLM Training Space - Local Code Compatible",
         theme=gr.themes.Soft()
     ) as demo:
         # Application Header
+        gr.Markdown("# 🚀 OpenLLM Training Space - Local Code Compatible")
+        gr.Markdown("### *Uses sentencepiece.SentencePieceProcessor() Like Local Training*")
         gr.Markdown("---")
         # Status Information
         gr.Markdown(f"**Training Available**: {'✅ Yes' if TRAINING_AVAILABLE else '❌ No'}")
         gr.Markdown(f"**SentencePiece Available**: {'✅ Yes' if SENTENCEPIECE_AVAILABLE else '❌ No (using fallback methods)'}")
+        gr.Markdown("**Tokenizer Approach**: ✅ sentencepiece.SentencePieceProcessor() (Local Code Compatible)")
         # Main Content Area
         with gr.Row():
                     stop_btn = gr.Button("⏹️ Stop Training", variant="stop")
         # Instructions Section
+        gr.Markdown("## 📋 Local Code Compatible Training Instructions")
         gr.Markdown("""
+        This interface uses the **same tokenizer approach as local OpenLLM training**:
         ### **Step 1: Configure Parameters**
         - **Model Size**: Select the base model to train from (7k models)
         ### **Step 2: Start Training**
         - Click "Start Training" to begin the actual training process
+        - Uses `sentencepiece.SentencePieceProcessor()` directly (like local code)
+        - Downloads tokenizer.model from HF Hub and loads with SentencePieceProcessor
+        - Compatible with OpenLLM's actual implementation
         ### **Step 3: Monitor Progress**
         - Watch the status updates and progress information
         # Training Function Definition
         def start_complete_training(model_size, max_steps, learning_rate, batch_size):
             """
+            Execute the complete training process with local code compatible approach.
             """
             if not TRAINING_AVAILABLE:
                 return "❌ Training dependencies not available. Please check the installation."
                     batch_size=batch_size
                 )
+                # Step 1: Load model and tokenizer using local approach
                 status = trainer.load_model_and_tokenizer(model_size)
                 if "❌" in status:
                     return status
         gr.Markdown("---")
         gr.Markdown("**Author**: Louis Chua Bean Chong | **Project**: OpenLLM | **License**: GPL-3.0")
         gr.Markdown("**Gradio Version**: 4.44.1 (Fully Compatible)")
+        gr.Markdown("**Tokenizer**: sentencepiece.SentencePieceProcessor() (Local Code Compatible)")
     return demo