# model_loader.py
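"""Model loader for the Medical Coding API.

Loads the Phi-3-based med-coding model and tokenizer from the Hugging Face
Hub, with automatic CPU/GPU detection and fallbacks. Requires
transformers>=4.36.0 for Phi-3 support.
"""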
import os
import sys
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    pipeline,
)
import torch
import warnings
warnings.filterwarnings("ignore")
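
# Target model on the Hugging Face Hub (a Phi-3 fine-tune for medical coding)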
MODEL_NAME = "RayyanAhmed9477/med-coding"
def load_model_and_tokenizer():
"""
Loads Phi-3 model with comprehensive error handling and fallbacks.
Supports both CPU and GPU with automatic detection.
"""
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"๐Ÿ”ง Using device: {device}")
print(f"๐Ÿ”ง PyTorch version: {torch.__version__}")
print(f"๐Ÿ”ง Transformers version: {sys.modules['transformers'].__version__}")
# Get HuggingFace token from environment
hf_token = os.getenv("HF_TOKEN")
    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"📥 Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
            use_fast=True,
            legacy=False,
        )

        # Configure the tokenizer: causal LMs need a pad token, and batched
        # generation with decoder-only models expects left-sided padding.
        # (The default padding_side is "right", so set it explicitly.)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        print("✅ Tokenizer loaded successfully")
        # ===== STEP 2: Load Configuration with trust_remote_code =====
        print(f"📥 Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
        )
        print(f"✅ Config loaded: {config.model_type}")
        # ===== STEP 3: Load Model =====
        print(f"📥 Loading model: {MODEL_NAME}")
        print("⏳ This may take 2-5 minutes on first load...")

        if device == "cuda":
            # GPU configuration
            print("🎮 Using GPU with bfloat16 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                token=hf_token,
                low_cpu_mem_usage=True,
                attn_implementation="eager",  # More stable than flash attention
            )
        else:
            # CPU configuration - optimized for stability
            print("💻 Using CPU with float32 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
                token=hf_token,
                low_cpu_mem_usage=True,
                offload_folder="offload",
                attn_implementation="eager",
            )

        # Set the model to evaluation mode and disable gradients to save memory
        model.eval()
        for param in model.parameters():
            param.requires_grad = False
        print("✅ Model loaded successfully!")
        # ===== STEP 4: Create Pipeline =====
        print("🔧 Creating text generation pipeline...")
        # The model already carries its device placement and dtype from
        # from_pretrained above; passing `device=` here would conflict with
        # the accelerate device map and raise a ValueError.
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            framework="pt",
        )
        print("✅ Pipeline created successfully!")
        print("=" * 60)
        print("🎉 MODEL READY FOR INFERENCE")
        print("=" * 60)
        return gen_pipeline, tokenizer
    except Exception as e:
        print(f"❌ Error during model loading: {e}")
        print("\n🔍 Diagnostic Information:")
        print(f"  - Model: {MODEL_NAME}")
        print(f"  - Device: {device}")
        print(f"  - Token available: {hf_token is not None}")
        import traceback
        traceback.print_exc()
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "Please check: "
            "1) internet connection, "
            "2) Hugging Face token (if the model is private), "
            "3) transformers version (Phi-3 requires >=4.36.0)"
        ) from e
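

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original loader): load the
    # pipeline once and run a single generation. The prompt and generation
    # parameters below are illustrative assumptions, not values prescribed
    # by this API.
    pipe, tok = load_model_and_tokenizer()
    sample = pipe(
        "Patient presents with acute bronchitis. ICD-10 code:",
        max_new_tokens=32,
        do_sample=False,
    )
    print(sample[0]["generated_text"])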