# model_loader.py
import os
import sys
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
AutoConfig,
pipeline
)
import torch
import warnings
warnings.filterwarnings("ignore")

MODEL_NAME = "RayyanAhmed9477/med-coding"


def load_model_and_tokenizer():
    """
    Loads Phi-3 model with comprehensive error handling and fallbacks.
    Supports both CPU and GPU with automatic detection.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Using device: {device}")
    print(f"🔧 PyTorch version: {torch.__version__}")
    print(f"🔧 Transformers version: {sys.modules['transformers'].__version__}")

    # Get HuggingFace token from environment
    hf_token = os.getenv("HF_TOKEN")

    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"📥 Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
            use_fast=True,
            legacy=False
        )

        # Configure tokenizer
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Decoder-only models should be padded on the left for generation
        tokenizer.padding_side = "left"

        print("✅ Tokenizer loaded successfully")

        # ===== STEP 2: Load Configuration with trust_remote_code =====
        print(f"📥 Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token
        )
        print(f"✅ Config loaded: {config.model_type}")

        # ===== STEP 3: Load Model =====
        print(f"📥 Loading model: {MODEL_NAME}")
        print("⏳ This may take 2-5 minutes on first load...")

        if device == "cuda":
            # GPU configuration
            print("🎮 Using GPU with bfloat16 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                token=hf_token,
                low_cpu_mem_usage=True,
                attn_implementation="eager"  # More stable than flash attention
            )
        else:
            # CPU configuration - optimized for stability
            print("💻 Using CPU with float32 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
                token=hf_token,
                low_cpu_mem_usage=True,
                offload_folder="offload",
                attn_implementation="eager"
            )

        # Set model to evaluation mode
        model.eval()

        # Disable gradients to save memory
        for param in model.parameters():
            param.requires_grad = False

        print("✅ Model loaded successfully!")

        # ===== STEP 4: Create Pipeline =====
        print("🔧 Creating text generation pipeline...")
        # The model was loaded with a device_map and is already placed, so no
        # `device` argument is passed here: recent transformers versions raise
        # a ValueError if both are given
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
            framework="pt"
        )
        print("✅ Pipeline created successfully!")
print("=" * 60)
print("๐ MODEL READY FOR INFERENCE")
print("=" * 60)
return gen_pipeline, tokenizer

    except Exception as e:
        print(f"❌ Error during model loading: {str(e)}")
        print("\n🔍 Diagnostic Information:")
        print(f" - Model: {MODEL_NAME}")
        print(f" - Device: {device}")
        print(f" - Token available: {hf_token is not None}")

        import traceback
        traceback.print_exc()

        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "Please check: "
            "1) Internet connection, "
            "2) HuggingFace token (if model is private), "
            "3) Transformers version (requires >=4.36.0 for Phi-3)"
        ) from e
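

# --- Usage sketch (illustrative only): the prompt and generation parameters
# --- below are assumptions for demonstration, not values from this Space
if __name__ == "__main__":
    gen_pipeline, tokenizer = load_model_and_tokenizer()

    # Hypothetical medical-coding prompt; adapt to the Space's actual prompt format
    prompt = "Assign the most specific ICD-10 code for: acute bronchitis."
    outputs = gen_pipeline(
        prompt,
        max_new_tokens=128,  # illustrative cap on generated tokens
        do_sample=False,     # deterministic decoding
        pad_token_id=tokenizer.pad_token_id,
    )
    print(outputs[0]["generated_text"])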