# agentic-api/openelm_tokenizer.py
"""
OpenELM Model Loading Utilities
This module handles loading Apple OpenELM models with proper tokenizer support,
including custom configuration and modeling code that transformers doesn't natively support.
"""
import os
import sys
import subprocess
from pathlib import Path
from huggingface_hub import hf_hub_download, snapshot_download
# Path for storing OpenELM custom code
OPENELM_CACHE_DIR = Path("/app/.openelm_cache")
OPENELM_CACHE_DIR.mkdir(parents=True, exist_ok=True)
def download_openelm_files():
"""
Download OpenELM custom configuration and tokenizer files from Hugging Face.
Apple uses custom code that needs to be available locally for transformers to load.
"""
model_id = "apple/OpenELM-450M-Instruct"
files_to_download = [
"configuration_openelm.py",
"tokenizer.json",
"vocab.txt",
"merges.txt",
]
print("Downloading OpenELM custom files...")
for filename in files_to_download:
try:
filepath = hf_hub_download(
repo_id=model_id,
filename=filename,
repo_type="model",
local_dir=OPENELM_CACHE_DIR,
force_download=True
)
print(f" Downloaded: {filename}")
except Exception as e:
print(f" Warning: Could not download {filename}: {e}")
# Also download the modeling file if it exists
try:
modeling_file = hf_hub_download(
repo_id=model_id,
filename="modeling_openelm.py",
repo_type="model",
local_dir=OPENELM_CACHE_DIR,
force_download=True
)
print(f" Downloaded: modeling_openelm.py")
except Exception as e:
print(f" Note: modeling_openelm.py not found (using transformers built-in)")
return OPENELM_CACHE_DIR
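# A more compact alternative (a sketch, not the flow used above): since
# snapshot_download is already imported, the custom code and tokenizer files
# could be fetched in a single call. The allow_patterns list is an assumption
# mirroring files_to_download above.
def download_openelm_snapshot():
    """Fetch all OpenELM custom files in one snapshot call (illustrative)."""
    return Path(
        snapshot_download(
            repo_id="apple/OpenELM-450M-Instruct",
            local_dir=OPENELM_CACHE_DIR,
            allow_patterns=["*.py", "tokenizer*", "*.json"],  # assumed patterns
        )
    )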
def get_openelm_tokenizer():
"""
Get the tokenizer for OpenELM model with custom code support.
Returns:
tokenizer: OpenELM tokenizer with proper configuration
"""
try:
# First try to download custom files
cache_dir = download_openelm_files()
# Add the cache directory to Python path so custom code can be imported
if str(cache_dir) not in sys.path:
sys.path.insert(0, str(cache_dir))
        # Try to import the tokenizer
        try:
            from transformers import LlamaTokenizer
            # Importing the custom config verifies the downloaded code is
            # importable; a failure surfaces as ImportError, caught below
            from configuration_openelm import OpenELMConfig  # noqa: F401
            # Check which tokenizer files are available locally
            tokenizer_file = cache_dir / "tokenizer.json"
            spm_file = cache_dir / "tokenizer.model"
            if tokenizer_file.exists():
                from transformers import AutoTokenizer
                tokenizer = AutoTokenizer.from_pretrained(
                    str(cache_dir),
                    trust_remote_code=True
                )
                return tokenizer
            elif spm_file.exists():
                # LlamaTokenizer is SentencePiece-based (OpenELM reuses the
                # Llama tokenizer), so it loads the .model file directly
                tokenizer = LlamaTokenizer(vocab_file=str(spm_file))
                return tokenizer
            else:
                raise FileNotFoundError("No tokenizer files found")
except ImportError as e:
print(f"Custom tokenizer import failed: {e}")
# Fall back to default tokenizer
raise
except Exception as e:
print(f"Error loading OpenELM tokenizer: {e}")
# Fall back to using the default tokenizer from Hugging Face
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
"apple/OpenELM-450M-Instruct",
trust_remote_code=True
)
return tokenizer
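# Usage sketch: whichever branch above succeeds, the returned object supports
# the standard encode/decode round trip exercised by test_tokenizer() below:
#
#   tokenizer = get_openelm_tokenizer()
#   ids = tokenizer.encode("Hello, world!")
#   text = tokenizer.decode(ids)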
def get_openelm_model():
"""
Get the OpenELM model with custom configuration support.
Returns:
model: OpenELM model ready for inference
"""
import torch
from transformers import AutoModelForCausalLM
    try:
        # Register the custom configuration so AutoConfig can resolve the
        # "openelm" model type from the cached code
        cache_dir = OPENELM_CACHE_DIR
        if (cache_dir / "configuration_openelm.py").exists():
            if str(cache_dir) not in sys.path:
                sys.path.insert(0, str(cache_dir))
            from configuration_openelm import OpenELMConfig
            from transformers import AutoConfig
            AutoConfig.register("openelm", OpenELMConfig, exist_ok=True)
            print("Using custom OpenELM configuration...")
    except Exception as e:
        print(f"Custom configuration not available: {e}")
# Load model with trust_remote_code to use Apple's custom code
model = AutoModelForCausalLM.from_pretrained(
"apple/OpenELM-450M-Instruct",
torch_dtype=torch.float16,
use_safetensors=True,
trust_remote_code=True,
device_map="auto" if torch.cuda.is_available() else None
)
return model
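# Minimal end-to-end sketch tying the two loaders together. This assumes the
# tokenizer path returns a transformers tokenizer (not the byte-level fallback
# below); the prompt handling and generation settings are illustrative, not
# part of the original module.
def generate_text(prompt, max_new_tokens=64):
    """Generate a completion for `prompt` with OpenELM (illustrative helper)."""
    import torch
    tokenizer = get_openelm_tokenizer()
    model = get_openelm_model()
    # Move the encoded prompt to wherever the model was placed
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)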
# Simple tokenizer that works without custom files
class SimpleOpenELMTokenizer:
"""
A simple tokenizer fallback that uses byte-level encoding.
This is used when the proper OpenELM tokenizer files are not available.
"""
    def __init__(self):
        # The GPT-2 split pattern uses \p{...} classes, which the stdlib
        # `re` module cannot compile, so the third-party `regex` module is
        # required here
        import regex
        # GPT-2 style split regex (kept for reference; encode/decode below
        # work purely at the character level)
        self.pat = regex.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
        self.encoder = {}
        self.decoder = {}
    def encode(self, text):
        """Encode text to tokens (one id per character)."""
        # Offset each code point by 256 to keep ids 0-255 free for special tokens
        return [ord(char) + 256 for char in text]
def decode(self, tokens):
"""Decode tokens to text."""
text = ""
for token in tokens:
if token >= 256:
text += chr(token - 256)
elif token in self.decoder:
text += self.decoder[token]
return text
def __call__(self, text, return_tensors=None, **kwargs):
"""Tokenize text."""
tokens = self.encode(text)
if return_tensors == "pt":
import torch
return {"input_ids": torch.tensor([tokens])}
elif return_tensors == "tf":
import tensorflow as tf
return {"input_ids": tf.constant([tokens])}
return {"input_ids": tokens}
def create_fallback_tokenizer():
"""
Create a fallback tokenizer when the proper one can't be loaded.
Uses a simple character-level tokenizer.
"""
return SimpleOpenELMTokenizer()
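# Quick sanity sketch for the byte-offset scheme above: every character maps
# to ord(char) + 256 and back, so the fallback round-trips any string exactly.
# Illustrative helper, not part of the original module.
def _fallback_roundtrip(sample="Hello, world!"):
    tok = create_fallback_tokenizer()
    assert tok.decode(tok.encode(sample)) == sample
    return tok.encode(sample)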
# Test function
def test_tokenizer():
"""Test the tokenizer loading."""
print("Testing OpenELM tokenizer...")
try:
tokenizer = get_openelm_tokenizer()
test_text = "Hello, world!"
tokens = tokenizer.encode(test_text)
decoded = tokenizer.decode(tokens)
print(f" Input: {test_text}")
print(f" Tokens: {tokens}")
print(f" Decoded: {decoded}")
print(f" Token count: {len(tokens)}")
return True
except Exception as e:
print(f" Error: {e}")
print(" Using fallback tokenizer...")
tokenizer = create_fallback_tokenizer()
tokens = tokenizer.encode(test_text)
print(f" Fallback tokenizer works: {tokens}")
return False
if __name__ == "__main__":
test_tokenizer()