File size: 7,693 Bytes
3daef91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
"""
OpenELM Model Loading Utilities

This module handles loading Apple OpenELM models with proper tokenizer support,
including custom configuration and modeling code that transformers doesn't natively support.
"""

import os
import sys
import subprocess
from pathlib import Path
from huggingface_hub import hf_hub_download, snapshot_download


# Directory where OpenELM's custom code and tokenizer assets are cached.
# NOTE(review): hard-coded container path — assumes /app is writable
# (e.g. inside Docker); confirm for non-container deployments.
OPENELM_CACHE_DIR = Path("/app/.openelm_cache")
# Import-time side effect: ensure the cache directory exists.
OPENELM_CACHE_DIR.mkdir(parents=True, exist_ok=True)


def download_openelm_files():
    """
    Download OpenELM custom configuration and tokenizer files from Hugging Face.

    Apple ships custom configuration/modeling code that transformers does not
    bundle, so the files are fetched into the local cache directory before the
    model or tokenizer is loaded with ``trust_remote_code``.  Each download is
    best-effort: a missing file is reported and skipped, not fatal.

    Returns:
        Path: the cache directory (``OPENELM_CACHE_DIR``), regardless of how
        many downloads succeeded.
    """
    model_id = "apple/OpenELM-450M-Instruct"

    files_to_download = [
        "configuration_openelm.py",
        "tokenizer.json",
        "vocab.txt",
        "merges.txt",
    ]

    print("Downloading OpenELM custom files...")

    for filename in files_to_download:
        try:
            hf_hub_download(
                repo_id=model_id,
                filename=filename,
                repo_type="model",
                local_dir=OPENELM_CACHE_DIR,
                force_download=True,
            )
            # Bug fix: these messages previously printed the literal text
            # "(unknown)" instead of the file name being reported.
            print(f"  Downloaded: {filename}")
        except Exception as e:
            # Best-effort: some files may not exist in the repo; report and
            # continue rather than aborting the whole setup.
            print(f"  Warning: Could not download {filename}: {e}")

    # Also download the modeling file if it exists.
    try:
        hf_hub_download(
            repo_id=model_id,
            filename="modeling_openelm.py",
            repo_type="model",
            local_dir=OPENELM_CACHE_DIR,
            force_download=True,
        )
        print(f"  Downloaded: modeling_openelm.py")
    except Exception as e:
        print(f"  Note: modeling_openelm.py not found (using transformers built-in)")

    return OPENELM_CACHE_DIR


def get_openelm_tokenizer():
    """
    Load the tokenizer for OpenELM, preferring Apple's cached custom files.

    Resolution order: a local ``tokenizer.json`` loaded via AutoTokenizer,
    then a ``vocab.txt``-based LlamaTokenizer, and finally — if anything in
    the custom path fails — the stock tokenizer fetched straight from the
    Hugging Face hub.

    Returns:
        tokenizer: OpenELM tokenizer with proper configuration
    """
    try:
        # Make sure the custom files are present in the local cache first.
        cache_dir = download_openelm_files()

        # Expose the cache on sys.path so the custom module is importable.
        cache_path = str(cache_dir)
        if cache_path not in sys.path:
            sys.path.insert(0, cache_path)

        try:
            from transformers import LlamaTokenizer
            from configuration_openelm import OpenELMConfig

            # Locate whichever tokenizer artifacts the download produced.
            tokenizer_json = cache_dir / "tokenizer.json"
            vocab_path = cache_dir / "vocab.txt"
            merges_path = cache_dir / "merges.txt"

            if tokenizer_json.exists():
                # Preferred path: full fast-tokenizer description on disk.
                from transformers import AutoTokenizer
                return AutoTokenizer.from_pretrained(
                    str(cache_dir),
                    trust_remote_code=True
                )

            if vocab_path.exists():
                # OpenELM's tokenizer is Llama-like; build one from raw files.
                return LlamaTokenizer(
                    vocab_file=str(vocab_path),
                    merges_file=str(merges_path) if merges_path.exists() else None,
                    trust_remote_code=True
                )

            raise FileNotFoundError("No tokenizer files found")

        except ImportError as e:
            print(f"Custom tokenizer import failed: {e}")
            # Propagate so the outer handler falls back to the hub tokenizer.
            raise

    except Exception as e:
        print(f"Error loading OpenELM tokenizer: {e}")
        # Last resort: let transformers fetch everything from the hub.
        from transformers import AutoTokenizer
        return AutoTokenizer.from_pretrained(
            "apple/OpenELM-450M-Instruct",
            trust_remote_code=True
        )


def get_openelm_model():
    """
    Load the OpenELM-450M-Instruct model for inference.

    If Apple's custom configuration file is cached locally, the cache is put
    on the import path first; the model itself is always loaded through
    transformers with ``trust_remote_code`` so Apple's custom code can run.

    Returns:
        model: OpenELM model ready for inference
    """
    import torch
    from transformers import AutoModelForCausalLM

    try:
        # Prefer the locally cached custom configuration when present.
        cache_dir = OPENELM_CACHE_DIR
        config_module = cache_dir / "configuration_openelm.py"

        if config_module.exists():
            sys.path.insert(0, str(cache_dir))
            from configuration_openelm import OpenELMConfig
            from transformers import AutoConfig

            print("Using custom OpenELM configuration...")

    except Exception as e:
        print(f"Custom configuration not available: {e}")

    # trust_remote_code lets transformers execute Apple's custom model code;
    # device_map="auto" only when a GPU is actually available.
    placement = "auto" if torch.cuda.is_available() else None
    model = AutoModelForCausalLM.from_pretrained(
        "apple/OpenELM-450M-Instruct",
        torch_dtype=torch.float16,
        use_safetensors=True,
        trust_remote_code=True,
        device_map=placement
    )

    return model


# Simple tokenizer that works without custom files
class SimpleOpenELMTokenizer:
    """
    A simple tokenizer fallback that uses byte-level encoding.

    Used when the proper OpenELM tokenizer files are not available.  Each
    character maps to ``ord(char) + 256`` so token ids 0-255 stay free for
    special tokens.
    """

    def __init__(self):
        import re
        # Bug fix: the original pattern used \p{L}/\p{N}, which the stdlib
        # `re` module does not support (that syntax requires the third-party
        # `regex` package), so re.compile raised re.error and the class could
        # never be instantiated.  This is the same GPT-2-style split written
        # with `re`-compatible classes ([^\W\d_] ~ letters, \d ~ digits).
        self.pat = re.compile(
            r"'s|'t|'re|'ve|'m|'ll|'d| ?[^\W\d_]+| ?\d+| ?[^\s\w]+|\s+(?!\S)|\s+"
        )
        self.encoder = {}
        self.decoder = {}

    def encode(self, text):
        """Encode *text* to a list of integer tokens, one per character."""
        # Offset by 256 to avoid colliding with special-token ids.
        return [ord(char) + 256 for char in text]

    def decode(self, tokens):
        """Decode a list of integer *tokens* back to text."""
        # Collect pieces and join once instead of quadratic `+=` growth.
        pieces = []
        for token in tokens:
            if token >= 256:
                pieces.append(chr(token - 256))
            elif token in self.decoder:
                pieces.append(self.decoder[token])
        return "".join(pieces)

    def __call__(self, text, return_tensors=None, **kwargs):
        """Tokenize *text*; optionally wrap the ids in a torch/tf tensor."""
        tokens = self.encode(text)

        if return_tensors == "pt":
            import torch
            return {"input_ids": torch.tensor([tokens])}
        elif return_tensors == "tf":
            import tensorflow as tf
            return {"input_ids": tf.constant([tokens])}

        return {"input_ids": tokens}


def create_fallback_tokenizer():
    """
    Build a character-level fallback tokenizer.

    Used when neither the cached custom tokenizer files nor the hub
    tokenizer can be loaded.
    """
    fallback = SimpleOpenELMTokenizer()
    return fallback


# Test function
def test_tokenizer():
    """
    Smoke-test tokenizer loading.

    Tries the real OpenELM tokenizer first; on failure, demonstrates the
    character-level fallback instead.

    Returns:
        bool: True when the real tokenizer round-trips successfully,
        False when the fallback had to be used.
    """
    print("Testing OpenELM tokenizer...")

    # Bug fix: defined before the try block — previously this was assigned
    # inside the try, so a failure in get_openelm_tokenizer() left test_text
    # unbound and the except branch raised NameError at encode(test_text).
    test_text = "Hello, world!"

    try:
        tokenizer = get_openelm_tokenizer()
        tokens = tokenizer.encode(test_text)
        decoded = tokenizer.decode(tokens)

        print(f"  Input: {test_text}")
        print(f"  Tokens: {tokens}")
        print(f"  Decoded: {decoded}")
        print(f"  Token count: {len(tokens)}")

        return True

    except Exception as e:
        print(f"  Error: {e}")
        print("  Using fallback tokenizer...")

        tokenizer = create_fallback_tokenizer()
        tokens = tokenizer.encode(test_text)
        print(f"  Fallback tokenizer works: {tokens}")

        return False


# Run the tokenizer smoke test when executed as a script.
if __name__ == "__main__":
    test_tokenizer()