# model_loader.py
import os
import warnings

import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    pipeline,
)

warnings.filterwarnings("ignore")

MODEL_NAME = "RayyanAhmed9477/med-coding"


def load_model_and_tokenizer():
    """
    Loads the Phi-3-based model with multiple fallback strategies.
    Handles safetensors loading issues with robust error recovery.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"šŸ”§ Using device: {device}")
    print(f"šŸ”§ PyTorch version: {torch.__version__}")
    print(f"šŸ”§ Transformers version: {transformers.__version__}")

    # Get the Hugging Face token from the environment
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        print("šŸ”‘ HuggingFace token found")
    else:
        print("āš ļø No HuggingFace token - assuming public model")

    try:
        # ===== STEP 1: Load Tokenizer =====
        print(f"šŸ“„ Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            token=hf_token,
            use_fast=True,
        )

        # Configure tokenizer: pad with EOS and left-pad for decoder-only generation
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        if not hasattr(tokenizer, "padding_side") or tokenizer.padding_side is None:
            tokenizer.padding_side = "left"
        print("āœ… Tokenizer loaded successfully")

        # ===== STEP 2: Load Configuration =====
        print(f"šŸ“„ Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,
            token=hf_token,
        )

        # Handle LongRoPE configuration
        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
            rope_type = config.rope_scaling.get("type", "default")
            print(f"šŸ“ RoPE scaling type detected: {rope_type}")
            if rope_type == "longrope":
                print("āœ… LongRoPE configuration detected and supported")
        print(f"āœ… Config loaded: {config.model_type}")

        # ===== STEP 3: Load Model with Multiple Strategies =====
        print(f"šŸ“„ Loading model: {MODEL_NAME}")
        print("ā³ This may take 2-5 minutes on first load...")

        model = None
        if device == "cuda":
            loading_strategies = [
                # Strategy 1: standard GPU loading with bfloat16 and automatic placement
                {
                    "name": "GPU Standard",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.bfloat16,
                        "device_map": "auto",
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                    },
                },
            ]
        else:
            loading_strategies = [
                # Strategy 1: CPU with safetensors (preferred)
                {
                    "name": "CPU with safetensors",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "device_map": {"": "cpu"},
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                        "use_safetensors": True,
                    },
                },
                # Strategy 2: CPU without explicit safetensors
                {
                    "name": "CPU standard",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                    },
                },
                # Strategy 3: CPU with PyTorch (.bin) weights fallback
                {
                    "name": "CPU PyTorch weights",
                    "params": {
                        "trust_remote_code": True,
                        "torch_dtype": torch.float32,
                        "token": hf_token,
                        "low_cpu_mem_usage": True,
                        "use_safetensors": False,
                    },
                },
                # Strategy 4: minimal parameters as a last resort
                {
                    "name": "CPU minimal",
                    "params": {
                        "trust_remote_code": True,
                        "token": hf_token,
                    },
                },
            ]

        # Try each loading strategy in order until one succeeds
        for idx, strategy in enumerate(loading_strategies, 1):
            try:
                print(f"\nšŸ”„ Attempt {idx}/{len(loading_strategies)}: {strategy['name']}")
                model = AutoModelForCausalLM.from_pretrained(
                    MODEL_NAME,
                    config=config,
                    **strategy["params"],
                )
                # Move to CPU explicitly if the strategy did not set a device_map
                if device == "cpu" and not strategy["params"].get("device_map"):
                    model = model.to("cpu")
                print(f"āœ… Model loaded successfully using: {strategy['name']}")
                break
            except Exception as e:
                print(f"āŒ Strategy '{strategy['name']}' failed: {e}")
                if idx == len(loading_strategies):
                    # All strategies failed
                    raise
                print("ā­ļø Trying next strategy...")

        if model is None:
            raise RuntimeError("All loading strategies failed")

        # Set model to evaluation mode and disable gradients to save memory
        model.eval()
        for param in model.parameters():
            param.requires_grad = False

        print("\nāœ… Model fully loaded and ready!")

        # ===== STEP 4: Create Pipeline =====
        print("šŸ”§ Creating text generation pipeline...")
        # On GPU the model was already placed by accelerate via device_map="auto",
        # so the pipeline must not also be given a device (that raises an error in
        # recent transformers versions); pass one only on CPU. The dtype is carried
        # by the already-instantiated model, so it is not repeated here.
        pipeline_kwargs = {"device": -1} if device == "cpu" else {}
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            **pipeline_kwargs,
        )

        print("āœ… Pipeline created successfully!")
        print("=" * 60)
        print("šŸŽ‰ MODEL READY FOR INFERENCE")
        print("=" * 60)

        return gen_pipeline, tokenizer

    except Exception as e:
        print(f"\nāŒ Error during model loading: {e}")
        print("\nšŸ” Diagnostic Information:")
        print(f"   - Model: {MODEL_NAME}")
        print(f"   - Device: {device}")
        print(f"   - Token available: {hf_token is not None}")
        import traceback
        traceback.print_exc()
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "All loading strategies exhausted. "
            "This could be due to: "
            "1) model file corruption during download, "
            "2) insufficient memory, or "
            "3) model incompatibility. "
            "Try upgrading the Space to GPU or use a different model."
        ) from e
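

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): running this file
# directly performs a single smoke-test generation through the returned
# pipeline, which is the simplest way to verify all fallback strategies end
# in a working model. The prompt below is a hypothetical example; the actual
# med-coding model may expect a chat template rather than a plain string.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    gen_pipeline, tokenizer = load_model_and_tokenizer()

    prompt = "List the ICD-10 code for essential hypertension."  # hypothetical example prompt
    outputs = gen_pipeline(
        prompt,
        max_new_tokens=64,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    # The text-generation pipeline returns a list of dicts with "generated_text"
    print(outputs[0]["generated_text"])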