|
|
| """
|
| Simple test script to check sentencepiece installation and import.
|
|
|
| This script specifically tests the sentencepiece library which is critical
|
| for OpenLLM model tokenization.
|
|
|
| Author: Louis Chua Bean Chong
|
| License: GPL-3.0
|
| """
|
|
|
| import sys
|
| import subprocess
|
|
|
def test_sentencepiece():
    """Check that sentencepiece is installed and usable for tokenization.

    Runs three checks in order:
      1. pip-level check (installs sentencepiece if missing, best-effort),
      2. Python import check (authoritative),
      3. transformers AutoTokenizer smoke test.

    Returns:
        bool: True if the import and tokenizer checks pass, False otherwise.
    """
    print("[TEST] Testing SentencePiece Installation")
    print("=" * 40)

    # --- 1. pip-level check ------------------------------------------------
    print("\n[STEP] Checking pip installation...")
    try:
        # Invoke pip through the current interpreter so we inspect/modify the
        # same environment this script runs in; a bare "pip" on PATH may
        # belong to a different Python installation.
        result = subprocess.run(
            [sys.executable, "-m", "pip", "show", "sentencepiece"],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            print("[OK] sentencepiece is installed via pip")
            print(f"Info:\n{result.stdout}")
        else:
            print("[FAIL] sentencepiece is NOT installed via pip")
            print("Installing sentencepiece...")
            install_result = subprocess.run(
                [sys.executable, "-m", "pip", "install", "sentencepiece>=0.1.99"],
                capture_output=True,
                text=True,
            )
            if install_result.returncode == 0:
                print("[OK] sentencepiece installed successfully")
            else:
                print(f"[FAIL] Failed to install sentencepiece: {install_result.stderr}")
    except Exception as e:
        # Best-effort: a pip failure is reported but does not abort the test,
        # because the import check below is the authoritative signal.
        print(f"[FAIL] Error checking pip: {e}")

    # --- 2. import check ---------------------------------------------------
    print("\n[STEP] Testing Python import...")
    try:
        import sentencepiece
        print("[OK] sentencepiece import successful")
        print(f"Version: {sentencepiece.__version__}")
    except ImportError as e:
        print(f"[FAIL] sentencepiece import failed: {e}")
        return False

    # --- 3. tokenizer smoke test -------------------------------------------
    print("\n[STEP] Testing SentencePieceTokenizer...")
    try:
        from transformers import AutoTokenizer
        print("[OK] AutoTokenizer import successful")

        # gpt2 is a small, always-available tokenizer; this only verifies
        # that transformers tokenizer loading works end-to-end.
        print("Testing tokenizer loading...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        print("[OK] Basic tokenizer loading successful")

    except Exception as e:
        print(f"[FAIL] Tokenizer test failed: {e}")
        return False

    print("\n" + "=" * 40)
    print("[DONE] SentencePiece Test Complete!")
    return True
|
|
|
def test_openllm_model():
    """Try loading the OpenLLM tokenizer and model from the Hugging Face Hub.

    Downloads (or reads from cache) the "lemms/openllm-small-extended-7k"
    checkpoint, so this requires network access on first run.

    Returns:
        bool: True if both tokenizer and model load successfully,
        False otherwise.
    """
    print("\n[TEST] Testing OpenLLM Model Loading")
    print("=" * 40)

    try:
        from transformers import AutoTokenizer, AutoModelForCausalLM

        print("Loading OpenLLM small model...")
        model_name = "lemms/openllm-small-extended-7k"

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        print("[OK] Tokenizer loaded successfully")

        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(model_name)
        print("[OK] Model loaded successfully")

        # Report the resolved classes; useful when debugging tokenizer
        # class mismatches (e.g. slow vs. fast tokenizer backends).
        print("\n[DONE] OpenLLM model test successful!")
        print(f"Model: {model_name}")
        print(f"Tokenizer type: {type(tokenizer).__name__}")
        print(f"Model type: {type(model).__name__}")

        return True

    except Exception as e:
        print(f"[FAIL] OpenLLM model test failed: {e}")
        return False
|
|
|
if __name__ == "__main__":
    # Entry point: run the sentencepiece checks first, and only attempt the
    # (slow, network-bound) model download when the prerequisites pass.
    print("[SUITE] SentencePiece and OpenLLM Model Test")
    print("=" * 50)

    sp_success = test_sentencepiece()

    if sp_success:
        model_success = test_openllm_model()
        if model_success:
            print("\n[DONE] All tests passed! Training should work now.")
        else:
            print("\n[WARN] SentencePiece works but model loading failed.")
    else:
        print("\n[FAIL] SentencePiece test failed. Need to fix dependencies first.")

    print("\n[INFO] Next steps:")
    print("1. If tests failed, run: python install_dependencies.py")
    print("2. If tests passed, try the training again")
    print("3. If still having issues, restart the Space")
|
|
|