# model_loader.py
import os
import warnings

import torch
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)

warnings.filterwarnings("ignore")

MODEL_NAME = "RayyanAhmed9477/med-coding"


def load_model_and_tokenizer():
    """
    Load the Phi-3-based med-coding model with comprehensive error handling.

    Supports both CPU and GPU with automatic detection. Returns a
    (text-generation pipeline, tokenizer) tuple.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"šŸ”§ Using device: {device}")
    print(f"šŸ”§ PyTorch version: {torch.__version__}")
    print(f"šŸ”§ Transformers version: {transformers.__version__}")

    # Get the HuggingFace token from the environment (needed for private models)
    hf_token = os.getenv("HF_TOKEN")

    try:
        # ===== STEP 1: Load tokenizer =====
        print(f"šŸ“„ Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3 custom code
            token=hf_token,
            use_fast=True,
            legacy=False,
        )

        # Configure the tokenizer for decoder-only generation
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"  # left-pad so generation continues from the prompt

        print("āœ… Tokenizer loaded successfully")

        # ===== STEP 2: Load configuration =====
        print(f"šŸ“„ Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3 custom code
            token=hf_token,
        )
        print(f"āœ… Config loaded: {config.model_type}")

        # ===== STEP 3: Load model =====
        print(f"šŸ“„ Loading model: {MODEL_NAME}")
        print("ā³ This may take 2-5 minutes on first load...")

        if device == "cuda":
            # GPU configuration
            print("šŸŽ® Using GPU with bfloat16 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                token=hf_token,
                low_cpu_mem_usage=True,
                attn_implementation="eager",  # More stable than flash attention
            )
        else:
            # CPU configuration, optimized for stability
            print("šŸ’» Using CPU with float32 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
                token=hf_token,
                low_cpu_mem_usage=True,
                offload_folder="offload",
                attn_implementation="eager",
            )

        # Inference only: eval mode, no gradients
        model.eval()
        for param in model.parameters():
            param.requires_grad = False

        print("āœ… Model loaded successfully!")

        # ===== STEP 4: Create pipeline =====
        print("šŸ”§ Creating text generation pipeline...")
        # NOTE: the model was loaded with a device_map (via accelerate), so it is
        # already placed on the correct device. Passing `device` to pipeline() in
        # that case raises a ValueError in recent transformers versions, and the
        # dtype is already fixed by the model object, so neither is passed here.
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            framework="pt",
        )
        print("āœ… Pipeline created successfully!")

        print("=" * 60)
        print("šŸŽ‰ MODEL READY FOR INFERENCE")
        print("=" * 60)

        return gen_pipeline, tokenizer

    except Exception as e:
        print(f"āŒ Error during model loading: {e}")
        print("\nšŸ” Diagnostic Information:")
        print(f"  - Model: {MODEL_NAME}")
        print(f"  - Device: {device}")
        print(f"  - Token available: {hf_token is not None}")
        import traceback
        traceback.print_exc()
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "Please check: "
            "1) Internet connection, "
            "2) HuggingFace token (if the model is private), "
            "3) Transformers version (requires >=4.36.0 for Phi-3)"
        ) from e
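

# --- Usage sketch (illustrative, not part of the loader itself) ---
# A minimal smoke test showing how the returned pipeline might be called.
# The prompt string and generation parameters below are hypothetical
# assumptions, not defined anywhere in this module; adjust them to your task.
if __name__ == "__main__":
    gen_pipeline, tokenizer = load_model_and_tokenizer()

    prompt = "Assign the ICD-10 code for: acute bronchitis"  # hypothetical prompt
    outputs = gen_pipeline(
        prompt,
        max_new_tokens=64,                    # keep the smoke test short
        do_sample=False,                      # deterministic output for a sanity check
        pad_token_id=tokenizer.pad_token_id,  # silence the missing-pad-token warning
    )
    # The pipeline returns a list of dicts with a "generated_text" field
    print(outputs[0]["generated_text"])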