# model_loader.py
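"""Model loader for the Medical Coding API.

Loads the Phi-3-based med-coding model and tokenizer from the Hugging Face
Hub, with automatic CPU/GPU detection and fallbacks. Requires
transformers>=4.36.0 for Phi-3 support.
"""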
import os
import sys
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    pipeline,
)
import torch
import warnings
warnings.filterwarnings("ignore")
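
# Target model on the Hugging Face Hub (a Phi-3 fine-tune for medical coding)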
MODEL_NAME = "RayyanAhmed9477/med-coding"
def load_model_and_tokenizer():
"""
Loads Phi-3 model with comprehensive error handling and fallbacks.
Supports both CPU and GPU with automatic detection.
"""
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"๐Ÿ”ง Using device: {device}")
print(f"๐Ÿ”ง PyTorch version: {torch.__version__}")
print(f"๐Ÿ”ง Transformers version: {sys.modules['transformers'].__version__}")
# Get HuggingFace token from environment
hf_token = os.getenv("HF_TOKEN")
    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"📥 Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
            use_fast=True,
            legacy=False,
        )

        # Configure the tokenizer: causal LMs need a pad token, and batched
        # generation with decoder-only models expects left-sided padding.
        # (The default padding_side is "right", so set it explicitly.)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        print("✅ Tokenizer loaded successfully")
        # ===== STEP 2: Load Configuration with trust_remote_code =====
        print(f"📥 Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
        )
        print(f"✅ Config loaded: {config.model_type}")
        # ===== STEP 3: Load Model =====
        print(f"📥 Loading model: {MODEL_NAME}")
        print("⏳ This may take 2-5 minutes on first load...")

        if device == "cuda":
            # GPU configuration
            print("🎮 Using GPU with bfloat16 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                token=hf_token,
                low_cpu_mem_usage=True,
                attn_implementation="eager",  # More stable than flash attention
            )
        else:
            # CPU configuration - optimized for stability
            print("💻 Using CPU with float32 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
                token=hf_token,
                low_cpu_mem_usage=True,
                offload_folder="offload",
                attn_implementation="eager",
            )

        # Set the model to evaluation mode and disable gradients to save memory
        model.eval()
        for param in model.parameters():
            param.requires_grad = False
        print("✅ Model loaded successfully!")
        # ===== STEP 4: Create Pipeline =====
        print("🔧 Creating text generation pipeline...")
        # The model already carries its device placement and dtype from
        # from_pretrained above; passing `device=` here would conflict with
        # the accelerate device map and raise a ValueError.
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            framework="pt",
        )
        print("✅ Pipeline created successfully!")
        print("=" * 60)
        print("🎉 MODEL READY FOR INFERENCE")
        print("=" * 60)
        return gen_pipeline, tokenizer
    except Exception as e:
        print(f"❌ Error during model loading: {e}")
        print("\n🔍 Diagnostic Information:")
        print(f"  - Model: {MODEL_NAME}")
        print(f"  - Device: {device}")
        print(f"  - Token available: {hf_token is not None}")
        import traceback
        traceback.print_exc()
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "Please check: "
            "1) internet connection, "
            "2) Hugging Face token (if the model is private), "
            "3) transformers version (Phi-3 requires >=4.36.0)"
        ) from e
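

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original loader): load the
    # pipeline once and run a single generation. The prompt and generation
    # parameters below are illustrative assumptions, not values prescribed
    # by this API.
    pipe, tok = load_model_and_tokenizer()
    sample = pipe(
        "Patient presents with acute bronchitis. ICD-10 code:",
        max_new_tokens=32,
        do_sample=False,
    )
    print(sample[0]["generated_text"])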