File size: 7,468 Bytes
d03f587
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61e7d9a
 
d03f587
 
 
 
 
 
 
 
7cd0e22
 
 
 
d03f587
 
 
 
 
 
7cd0e22
d03f587
7cd0e22
d03f587
 
 
 
 
 
 
 
 
 
7cd0e22
d03f587
 
 
7cd0e22
d03f587
 
7cd0e22
61e7d9a
7cd0e22
 
 
 
 
 
d03f587
 
61e7d9a
d03f587
 
 
61e7d9a
 
 
d03f587
61e7d9a
 
 
 
 
 
 
 
 
 
 
 
 
d03f587
61e7d9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d03f587
 
 
 
 
 
 
 
61e7d9a
d03f587
 
 
 
 
 
 
 
7cd0e22
d03f587
 
 
 
 
 
 
 
 
 
61e7d9a
d03f587
 
 
 
 
 
 
 
 
 
61e7d9a
 
 
 
 
 
d03f587
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# model_loader.py
import os
import sys
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    AutoConfig,
    pipeline
)
import torch
import warnings

warnings.filterwarnings("ignore")

MODEL_NAME = "RayyanAhmed9477/med-coding"

def load_model_and_tokenizer():
    """
    Loads Phi-3 model with multiple fallback strategies.
    Handles safetensors loading issues with robust error recovery.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"πŸ”§ Using device: {device}")
    print(f"πŸ”§ PyTorch version: {torch.__version__}")
    print(f"πŸ”§ Transformers version: {sys.modules['transformers'].__version__}")
    
    # Get HuggingFace token from environment
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        print("πŸ”‘ HuggingFace token found")
    else:
        print("⚠️  No HuggingFace token - assuming public model")
    
    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"πŸ“₯ Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            token=hf_token,
            use_fast=True
        )
        
        # Configure tokenizer
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        if not hasattr(tokenizer, 'padding_side') or tokenizer.padding_side is None:
            tokenizer.padding_side = "left"
        
        print("βœ… Tokenizer loaded successfully")
        
        # ===== STEP 2: Load Configuration =====
        print(f"πŸ“₯ Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            token=hf_token
        )
        
        # Handle LongRoPE configuration
        if hasattr(config, 'rope_scaling') and config.rope_scaling is not None:
            rope_type = config.rope_scaling.get('type', 'default')
            print(f"πŸ“ RoPE scaling type detected: {rope_type}")
            if rope_type == 'longrope':
                print("βœ… LongRoPE configuration detected and supported")
        
        print(f"βœ… Config loaded: {config.model_type}")
        
        # ===== STEP 3: Load Model with Multiple Strategies =====
        print(f"πŸ“₯ Loading model: {MODEL_NAME}")
        print("⏳ This may take 2-5 minutes on first load...")
        
        model = None
        loading_strategies = []
        
        if device == "cuda":
            loading_strategies = [
                # Strategy 1: Standard GPU loading
                {
                    "name": "GPU Standard",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.bfloat16,
                        "device_map": "auto",
                        "token": hf_token,
                        "low_cpu_mem_usage": True
                    }
                }
            ]
        else:
            loading_strategies = [
                # Strategy 1: CPU with safetensors (preferred)
                {
                    "name": "CPU with safetensors",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "device_map": {"": "cpu"},
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                        "use_safetensors": True
                    }
                },
                # Strategy 2: CPU without explicit safetensors
                {
                    "name": "CPU standard",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "token": hf_token,
                        "low_cpu_mem_usage": True
                    }
                },
                # Strategy 3: CPU with PyTorch weights fallback
                {
                    "name": "CPU PyTorch weights",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                        "use_safetensors": False
                    }
                },
                # Strategy 4: Minimal parameters
                {
                    "name": "CPU minimal",
                    "params": {
                        "trust_remote_code": True,
                        "token": hf_token
                    }
                }
            ]
        
        # Try each loading strategy
        for idx, strategy in enumerate(loading_strategies, 1):
            try:
                print(f"\nπŸ”„ Attempt {idx}/{len(loading_strategies)}: {strategy['name']}")
                
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    config=config,
                    **strategy['params']
                )
                
                # Move to CPU explicitly if needed
                if device == "cpu" and not strategy['params'].get('device_map'):
                    model = model.to("cpu")
                
                print(f"βœ… Model loaded successfully using: {strategy['name']}")
                break
                
            except Exception as e:
                print(f"❌ Strategy '{strategy['name']}' failed: {str(e)}")
                if idx == len(loading_strategies):
                    # All strategies failed
                    raise
                else:
                    print(f"⏭️  Trying next strategy...")
                    continue
        
        if model is None:
            raise RuntimeError("All loading strategies failed")
        
        # Set model to evaluation mode
        model.eval()
        
        # Disable gradients to save memory
        for param in model.parameters():
            param.requires_grad = False
        
        print("\nβœ… Model fully loaded and ready!")
        
        # ===== STEP 4: Create Pipeline =====
        print("πŸ”§ Creating text generation pipeline...")
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if device == "cuda" else -1,
            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32
        )
        
        print("βœ… Pipeline created successfully!")
        print("=" * 60)
        print("πŸŽ‰ MODEL READY FOR INFERENCE")
        print("=" * 60)
        
        return gen_pipeline, tokenizer
        
    except Exception as e:
        print(f"\n❌ Error during model loading: {str(e)}")
        print("\nπŸ” Diagnostic Information:")
        print(f"   - Model: {MODEL_NAME}")
        print(f"   - Device: {device}")
        print(f"   - Token available: {hf_token is not None}")
        
        import traceback
        traceback.print_exc()
        
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "All loading strategies exhausted. "
            "This could be due to: "
            "1) Model file corruption during download, "
            "2) Insufficient memory, "
            "3) Model incompatibility. "
            "Try upgrading Space to GPU or use a different model."
        ) from e