# model_loader.py
import os
import warnings

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)

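# Suppress noisy advisory/deprecation warnings from torch and transformers
# during model loading; comment this out when debugging load failures.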
warnings.filterwarnings("ignore")

MODEL_NAME = "RayyanAhmed9477/med-coding"

def load_model_and_tokenizer():
    """
    Loads Phi-3 model with comprehensive error handling and fallbacks.
    Supports both CPU and GPU with automatic detection.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Using device: {device}")
    print(f"🔧 PyTorch version: {torch.__version__}")
    print(f"🔧 Transformers version: {transformers.__version__}")
    
    # Read the HuggingFace token from the environment; only needed when the
    # model repository is gated or private.
    hf_token = os.getenv("HF_TOKEN")
    
    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"๐Ÿ“ฅ Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
            use_fast=True,
            legacy=False
        )
        
        # Configure tokenizer: Phi-3 defines no pad token by default, and
        # decoder-only models need left padding for batched generation.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        
        print("โœ… Tokenizer loaded successfully")
        
        # ===== STEP 2: Load Configuration with trust_remote_code =====
        print(f"๐Ÿ“ฅ Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token
        )
        print(f"โœ… Config loaded: {config.model_type}")
        
        # ===== STEP 3: Load Model =====
        print(f"๐Ÿ“ฅ Loading model: {MODEL_NAME}")
        print("โณ This may take 2-5 minutes on first load...")
        
        if device == "cuda":
            # GPU Configuration
            print("๐ŸŽฎ Using GPU with bfloat16 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                token=hf_token,
                low_cpu_mem_usage=True,
                attn_implementation="eager"  # More stable than flash attention
            )
        else:
            # CPU Configuration - optimized for stability
            print("๐Ÿ’ป Using CPU with float32 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
                token=hf_token,
                low_cpu_mem_usage=True,
                offload_folder="offload",  # scratch dir, used only if weights spill to disk
                attn_implementation="eager"
            )
        
        # Set model to evaluation mode
        model.eval()
        
        # Disable gradients to save memory
        for param in model.parameters():
            param.requires_grad = False
        
        print("โœ… Model loaded successfully!")
        
        # ===== STEP 4: Create Pipeline =====
        print("🔧 Creating text generation pipeline...")
        # The model was loaded with a device_map, so no explicit `device` is
        # passed here: accelerate-dispatched models cannot be moved, and
        # supplying `device` for such a model can raise a ValueError. The
        # pipeline infers placement from the model itself.
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
            framework="pt"
        )
        
        print("โœ… Pipeline created successfully!")
        print("=" * 60)
        print("๐ŸŽ‰ MODEL READY FOR INFERENCE")
        print("=" * 60)
        
        return gen_pipeline, tokenizer
        
    except Exception as e:
        print(f"โŒ Error during model loading: {str(e)}")
        print("\n๐Ÿ” Diagnostic Information:")
        print(f"   - Model: {MODEL_NAME}")
        print(f"   - Device: {device}")
        print(f"   - Token available: {hf_token is not None}")
        
        import traceback
        traceback.print_exc()
        
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "Please check: "
            "1) Internet connection, "
            "2) HuggingFace token (if model is private), "
            "3) Transformers version (requires >=4.36.0 for Phi-3)"
        ) from e
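

# ----- Usage sketch -----
# Minimal illustrative driver, not part of the original module. The prompt
# and generation settings below are assumptions for demonstration only.
# Phi-3 chat checkpoints expect their chat template, which
# tokenizer.apply_chat_template() applies here.
if __name__ == "__main__":
    gen_pipeline, tokenizer = load_model_and_tokenizer()

    messages = [
        {"role": "user", "content": "Suggest a billing code for a routine check-up."}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    result = gen_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=False,         # greedy decoding for reproducible output
        return_full_text=False,  # return only the newly generated text
    )
    print(result[0]["generated_text"])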