"""
OpenELM Model Loading Utilities
This module handles loading Apple OpenELM models with proper tokenizer support,
including custom configuration and modeling code that transformers doesn't natively support.
"""
import os
import sys
import subprocess
from pathlib import Path
from huggingface_hub import hf_hub_download, snapshot_download
# Path for storing OpenELM custom code.
# NOTE(review): hardcoded container path (/app) — the directory is created
# eagerly at import time; exist_ok avoids errors on repeated imports.
OPENELM_CACHE_DIR = Path("/app/.openelm_cache")
OPENELM_CACHE_DIR.mkdir(parents=True, exist_ok=True)
def download_openelm_files():
    """
    Download OpenELM custom configuration and tokenizer files from Hugging Face.

    Apple uses custom code that needs to be available locally for transformers
    to load. Files that cannot be fetched are skipped with a warning rather
    than aborting the whole download.

    Returns:
        Path: the local cache directory (OPENELM_CACHE_DIR) the files were
        downloaded into.
    """
    model_id = "apple/OpenELM-450M-Instruct"
    files_to_download = [
        "configuration_openelm.py",
        "tokenizer.json",
        "vocab.txt",
        "merges.txt",
    ]
    print("Downloading OpenELM custom files...")
    for filename in files_to_download:
        try:
            # Return value (local path) is unused; the side effect of
            # populating local_dir is all we need here.
            hf_hub_download(
                repo_id=model_id,
                filename=filename,
                repo_type="model",
                local_dir=OPENELM_CACHE_DIR,
                force_download=True
            )
            # Bug fix: these messages previously printed the literal text
            # "(unknown)" instead of the file actually being processed.
            print(f" Downloaded: {filename}")
        except Exception as e:
            print(f" Warning: Could not download {filename}: {e}")
    # Also download the modeling file if it exists
    try:
        hf_hub_download(
            repo_id=model_id,
            filename="modeling_openelm.py",
            repo_type="model",
            local_dir=OPENELM_CACHE_DIR,
            force_download=True
        )
        print(" Downloaded: modeling_openelm.py")
    except Exception:
        print(" Note: modeling_openelm.py not found (using transformers built-in)")
    return OPENELM_CACHE_DIR
def get_openelm_tokenizer():
    """
    Get the tokenizer for OpenELM model with custom code support.

    Resolution order:
      1. tokenizer.json in the local cache -> AutoTokenizer.
      2. vocab.txt (+ optional merges.txt) -> LlamaTokenizer.
      3. On any failure, fall back to loading the tokenizer straight from
         the Hugging Face Hub with trust_remote_code.

    Returns:
        tokenizer: OpenELM tokenizer with proper configuration
    """
    try:
        # First try to download custom files
        cache_dir = download_openelm_files()
        # Add the cache directory to Python path so custom code can be imported
        if str(cache_dir) not in sys.path:
            sys.path.insert(0, str(cache_dir))
        # Try to import the tokenizer
        try:
            from transformers import LlamaTokenizer
            # Importing here validates that configuration_openelm.py was
            # downloaded and is importable from the cache dir added above;
            # an ImportError triggers the fallback in the outer except.
            from configuration_openelm import OpenELMConfig
            # Check if we have tokenizer files
            vocab_file = cache_dir / "vocab.txt"
            merge_file = cache_dir / "merges.txt"
            tokenizer_file = cache_dir / "tokenizer.json"
            if tokenizer_file.exists():
                from transformers import AutoTokenizer
                tokenizer = AutoTokenizer.from_pretrained(
                    str(cache_dir),
                    trust_remote_code=True
                )
                return tokenizer
            elif vocab_file.exists():
                # Use LlamaTokenizer as base (OpenELM uses similar tokenizer)
                # NOTE(review): LlamaTokenizer is SentencePiece-based; the
                # merges_file and trust_remote_code kwargs are presumably
                # swallowed by **kwargs rather than used — confirm against
                # the transformers LlamaTokenizer API.
                tokenizer = LlamaTokenizer(
                    vocab_file=str(vocab_file),
                    merges_file=str(merge_file) if merge_file.exists() else None,
                    trust_remote_code=True
                )
                return tokenizer
            else:
                raise FileNotFoundError("No tokenizer files found")
        except ImportError as e:
            print(f"Custom tokenizer import failed: {e}")
            # Fall back to default tokenizer
            raise
    except Exception as e:
        print(f"Error loading OpenELM tokenizer: {e}")
        # Fall back to using the default tokenizer from Hugging Face
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            "apple/OpenELM-450M-Instruct",
            trust_remote_code=True
        )
        return tokenizer
def get_openelm_model():
    """
    Get the OpenELM model with custom configuration support.

    Returns:
        model: OpenELM model ready for inference, loaded in float16 and
        device-mapped to GPU when CUDA is available (CPU otherwise).
    """
    import torch
    from transformers import AutoModelForCausalLM
    try:
        # Try to use custom configuration
        cache_dir = OPENELM_CACHE_DIR
        if (cache_dir / "configuration_openelm.py").exists():
            # Consistency fix: guard against inserting the cache dir into
            # sys.path on every call (get_openelm_tokenizer already does
            # this same membership check before inserting).
            if str(cache_dir) not in sys.path:
                sys.path.insert(0, str(cache_dir))
            # Importing validates the custom config module is loadable;
            # failures are reported and the Hub's remote code is used.
            from configuration_openelm import OpenELMConfig
            from transformers import AutoConfig
            # Try to register the config
            print("Using custom OpenELM configuration...")
    except Exception as e:
        print(f"Custom configuration not available: {e}")
    # Load model with trust_remote_code to use Apple's custom code
    model = AutoModelForCausalLM.from_pretrained(
        "apple/OpenELM-450M-Instruct",
        torch_dtype=torch.float16,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto" if torch.cuda.is_available() else None
    )
    return model
# Simple tokenizer that works without custom files
class SimpleOpenELMTokenizer:
    """
    A simple tokenizer fallback that uses byte-level encoding.

    This is used when the proper OpenELM tokenizer files are not available.
    Each character is encoded as ord(char) + 256; tokens below 256 are
    resolved through the (currently empty) decoder table on decode.
    """

    def __init__(self):
        # Bug fix: the original compiled a GPT-2 style pattern containing
        # \p{L}/\p{N} character classes, which the stdlib `re` module does
        # not support (they require the third-party `regex` package), so
        # constructing the fallback tokenizer raised re.error. The pattern
        # was never used by encode/decode, so it has been removed.
        self.encoder = {}
        self.decoder = {}

    def encode(self, text):
        """Encode text to a list of integer tokens."""
        # Offset by 256 so the low range stays free for special tokens.
        return [ord(char) + 256 for char in text]

    def decode(self, tokens):
        """Decode a list of integer tokens back to text."""
        text = ""
        for token in tokens:
            if token >= 256:
                text += chr(token - 256)
            elif token in self.decoder:
                # Low-range tokens fall back to the decoder table; unknown
                # low tokens are silently skipped.
                text += self.decoder[token]
        return text

    def __call__(self, text, return_tensors=None, **kwargs):
        """Tokenize text, optionally wrapping ids for torch ("pt") or tf."""
        tokens = self.encode(text)
        if return_tensors == "pt":
            import torch
            return {"input_ids": torch.tensor([tokens])}
        elif return_tensors == "tf":
            import tensorflow as tf
            return {"input_ids": tf.constant([tokens])}
        return {"input_ids": tokens}
def create_fallback_tokenizer():
    """Build a character-level fallback tokenizer.

    Used when the proper OpenELM tokenizer cannot be loaded.
    """
    return SimpleOpenELMTokenizer()
# Test function
def test_tokenizer():
    """Run a smoke test of tokenizer loading.

    Returns:
        bool: True when the real tokenizer loads and round-trips the sample
        text; False when the fallback tokenizer had to be used.
    """
    print("Testing OpenELM tokenizer...")
    # Bug fix: test_text was previously assigned inside the try block after
    # get_openelm_tokenizer(); when that call raised, the except branch hit
    # a NameError on test_text instead of exercising the fallback. Define
    # the sample up front so both paths can use it.
    test_text = "Hello, world!"
    try:
        tokenizer = get_openelm_tokenizer()
        tokens = tokenizer.encode(test_text)
        decoded = tokenizer.decode(tokens)
        print(f" Input: {test_text}")
        print(f" Tokens: {tokens}")
        print(f" Decoded: {decoded}")
        print(f" Token count: {len(tokens)}")
        return True
    except Exception as e:
        print(f" Error: {e}")
        print(" Using fallback tokenizer...")
        tokenizer = create_fallback_tokenizer()
        tokens = tokenizer.encode(test_text)
        print(f" Fallback tokenizer works: {tokens}")
        return False
# Run the tokenizer smoke test when executed as a script.
if __name__ == "__main__":
    test_tokenizer()