# model_loader.py
import os
import sys
import warnings

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoConfig,
    pipeline,
)

warnings.filterwarnings("ignore")

MODEL_NAME = "RayyanAhmed9477/med-coding"


def load_model_and_tokenizer():
    """
    Loads the Phi-3 model with comprehensive error handling and fallbacks.
    Supports both CPU and GPU with automatic device detection.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Using device: {device}")
    print(f"🔧 PyTorch version: {torch.__version__}")
    print(f"🔧 Transformers version: {sys.modules['transformers'].__version__}")

    # Get the Hugging Face token from the environment
    hf_token = os.getenv("HF_TOKEN")

    try:
        # ===== STEP 1: Load tokenizer =====
        print(f"📥 Loading tokenizer: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
            use_fast=True,
            legacy=False,
        )

        # Configure the tokenizer: causal LMs need a pad token, and left
        # padding is required for correct batched generation
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "left"
        print("✅ Tokenizer loaded successfully")

        # ===== STEP 2: Load configuration with trust_remote_code =====
        print(f"📥 Loading model configuration: {MODEL_NAME}")
        config = AutoConfig.from_pretrained(
            MODEL_NAME,
            trust_remote_code=True,  # Critical for Phi-3
            token=hf_token,
        )
        print(f"✅ Config loaded: {config.model_type}")

        # ===== STEP 3: Load model =====
        print(f"📥 Loading model: {MODEL_NAME}")
        print("⏳ This may take 2-5 minutes on first load...")

        if device == "cuda":
            # GPU configuration
            print("🎮 Using GPU with bfloat16 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                token=hf_token,
                low_cpu_mem_usage=True,
                attn_implementation="eager",  # More stable than flash attention
            )
        else:
            # CPU configuration - optimized for stability
            print("💻 Using CPU with float32 precision")
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_NAME,
                config=config,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                device_map={"": "cpu"},
                token=hf_token,
                low_cpu_mem_usage=True,
                offload_folder="offload",
                attn_implementation="eager",
            )

        # Set the model to evaluation mode
        model.eval()

        # Disable gradients to save memory (inference only)
        for param in model.parameters():
            param.requires_grad = False
        print("✅ Model loaded successfully!")

        # ===== STEP 4: Create pipeline =====
        # The model was loaded with a device_map (via accelerate), so it is
        # already placed on its device(s); passing a `device` argument here
        # would raise a ValueError, and the dtype is inherited from the
        # already-loaded weights.
        print("🔧 Creating text generation pipeline...")
        gen_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            framework="pt",
        )
| print("โ Pipeline created successfully!") | |
| print("=" * 60) | |
| print("๐ MODEL READY FOR INFERENCE") | |
| print("=" * 60) | |
| return gen_pipeline, tokenizer | |

    except Exception as e:
        print(f"❌ Error during model loading: {e}")
        print("\n🔍 Diagnostic Information:")
        print(f"  - Model: {MODEL_NAME}")
        print(f"  - Device: {device}")
        print(f"  - Token available: {hf_token is not None}")
        import traceback
        traceback.print_exc()
        raise RuntimeError(
            f"Failed to load model {MODEL_NAME}. "
            "Please check: "
            "1) Internet connection, "
            "2) Hugging Face token (if the model is private), "
            "3) Transformers version (requires >=4.36.0 for Phi-3)"
        ) from e
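

# --- Illustrative usage (assumed, not part of the original loader) ---
# A minimal smoke test sketching how the returned pipeline might be called.
# The example prompt and generation parameters below are assumptions for
# demonstration only, not taken from the original Space code.
if __name__ == "__main__":
    gen_pipeline, tokenizer = load_model_and_tokenizer()
    prompt = "Suggest the ICD-10 code for type 2 diabetes mellitus."  # hypothetical prompt
    outputs = gen_pipeline(
        prompt,
        max_new_tokens=128,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )
    # The pipeline returns a list of dicts, each with a "generated_text" key
    print(outputs[0]["generated_text"])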