# model_loader.py
import os
import warnings

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)

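# Suppress noisy advisory/deprecation warnings from torch and transformers
# during model loading; comment this out when debugging load failures.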
warnings.filterwarnings("ignore")

MODEL_NAME = "RayyanAhmed9477/med-coding"

def load_model_and_tokenizer():
    """
    Loads Phi-3 model with comprehensive error handling and fallbacks.
    Supports both CPU and GPU with automatic detection.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Using device: {device}")
    print(f"🔧 PyTorch version: {torch.__version__}")
    print(f"🔧 Transformers version: {transformers.__version__}")
    
    # Read the HuggingFace token from the environment; only needed when the
    # model repository is gated or private.
    hf_token = os.getenv("HF_TOKEN")
    
    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"๐Ÿ“ฅ Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
            use_fast=True,
            legacy=False
        )
        
        # Configure tokenizer: Phi-3 defines no pad token by default, and
        # decoder-only models need left padding for batched generation.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        
        print("โœ… Tokenizer loaded successfully")
        
        # ===== STEP 2: Load Configuration with trust_remote_code =====
        print(f"๐Ÿ“ฅ Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token
        )
        print(f"โœ… Config loaded: {config.model_type}")
        
        # ===== STEP 3: Load Model =====
        print(f"๐Ÿ“ฅ Loading model: {MODEL_NAME}")
        print("โณ This may take 2-5 minutes on first load...")
        
        if device == "cuda":
            # GPU Configuration
            print("๐ŸŽฎ Using GPU with bfloat16 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                token=hf_token,
                low_cpu_mem_usage=True,
                attn_implementation="eager"  # More stable than flash attention
            )
        else:
            # CPU Configuration - optimized for stability
            print("๐Ÿ’ป Using CPU with float32 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
                token=hf_token,
                low_cpu_mem_usage=True,
                offload_folder="offload",  # scratch dir, used only if weights spill to disk
                attn_implementation="eager"
            )
        
        # Set model to evaluation mode
        model.eval()
        
        # Disable gradients to save memory
        for param in model.parameters():
            param.requires_grad = False
        
        print("โœ… Model loaded successfully!")
        
        # ===== STEP 4: Create Pipeline =====
        print("🔧 Creating text generation pipeline...")
        # The model was loaded with a device_map, so no explicit `device` is
        # passed here: accelerate-dispatched models cannot be moved, and
        # supplying `device` for such a model can raise a ValueError. The
        # pipeline infers placement from the model itself.
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
            framework="pt"
        )
        
        print("โœ… Pipeline created successfully!")
        print("=" * 60)
        print("๐ŸŽ‰ MODEL READY FOR INFERENCE")
        print("=" * 60)
        
        return gen_pipeline, tokenizer
        
    except Exception as e:
        print(f"โŒ Error during model loading: {str(e)}")
        print("\n๐Ÿ” Diagnostic Information:")
        print(f"   - Model: {MODEL_NAME}")
        print(f"   - Device: {device}")
        print(f"   - Token available: {hf_token is not None}")
        
        import traceback
        traceback.print_exc()
        
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "Please check: "
            "1) Internet connection, "
            "2) HuggingFace token (if model is private), "
            "3) Transformers version (requires >=4.36.0 for Phi-3)"
        ) from e
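

# ----- Usage sketch -----
# Minimal illustrative driver, not part of the original module. The prompt
# and generation settings below are assumptions for demonstration only.
# Phi-3 chat checkpoints expect their chat template, which
# tokenizer.apply_chat_template() applies here.
if __name__ == "__main__":
    gen_pipeline, tokenizer = load_model_and_tokenizer()

    messages = [
        {"role": "user", "content": "Suggest a billing code for a routine check-up."}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    result = gen_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=False,         # greedy decoding for reproducible output
        return_full_text=False,  # return only the newly generated text
    )
    print(result[0]["generated_text"])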