Update app.py
app.py CHANGED
@@ -5,10 +5,12 @@ from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 import uvicorn
 import os
+import time # For checking model load status
 
 # --- Global Variables for Model and Tokenizer ---
 model = None
 tokenizer = None
+model_loaded_successfully = False # Flag to indicate model status
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"--- Initializing on Device: {device} ---")
 
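The hunk above begins at line 5, so the file's first four lines are off-screen; only the `from fastapi import FastAPI, HTTPException` line is confirmed by the hunk header. Judging by the names this diff relies on (torch, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig, PeftModel), the hidden import block is presumably close to the following sketch, which is an inference rather than the file's actual text:

# Presumed app.py lines 1-4 (not visible in this diff). The fastapi line is
# confirmed by the hunk header; the others are inferred from names used below.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from fastapi import FastAPI, HTTPException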
@@ -24,104 +26,130 @@ class PromptRequest(BaseModel):
 app = FastAPI()
 
 def load_model_and_tokenizer():
-    global model, tokenizer
+    global model, tokenizer, model_loaded_successfully
 
     base_model_id = os.environ.get("BASE_MODEL_ID")
     adapter_path = os.environ.get("ADAPTER_PATH")
     hf_token = os.environ.get("HF_TOKEN")
 
     if not base_model_id:
-        print("ERROR: BASE_MODEL_ID environment variable not set.")
-
+        print("CRITICAL ERROR: BASE_MODEL_ID environment variable not set.")
+        # In a real app, you might want to prevent startup or handle this more gracefully
+        return
     if not adapter_path:
-        print("ERROR: ADAPTER_PATH environment variable not set.")
-
+        print("CRITICAL ERROR: ADAPTER_PATH environment variable not set.")
+        return
 
     print(f"Using device: {device}")
     print(f"Attempting to load base model: {base_model_id}")
     print(f"Attempting to load adapter from: {adapter_path}")
 
-    # --- Load Tokenizer ---
-    print(f"Loading tokenizer...")
     try:
-
-        print(f"
-
-
-
-
-
-
-
-
+        # --- Load Tokenizer ---
+        print(f"Loading tokenizer...")
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(adapter_path, token=hf_token, trust_remote_code=True)
+            print(f"Loaded tokenizer from adapter path: {adapter_path}")
+        except Exception as e:
+            print(f"Could not load tokenizer from adapter path: {e}. Loading from base model path: {base_model_id}")
+            tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=hf_token, trust_remote_code=True)
+
+        if tokenizer.pad_token is None:
+            if tokenizer.eos_token is not None:
+                print("Setting pad_token to eos_token.")
+                tokenizer.pad_token = tokenizer.eos_token
+            else:
+                print("Adding new pad_token '[PAD]'.")
+                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        tokenizer.padding_side = "left"
+
+        # --- Configure Quantization ---
+        print("Configuring 4-bit quantization...")
+        compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() and device == "cuda" else torch.float16
+
+        bnb_config = None
+        if device == "cuda":
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=compute_dtype,
+                bnb_4bit_use_double_quant=True,
+            )
+            print(f"Using BNB config with compute_dtype: {compute_dtype}")
         else:
-        print("
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            print("Running on CPU, BNB quantization will not be applied.")
+
+        # --- Load Base Model with Quantization ---
+        print(f"Loading base model: {base_model_id}...")
+        config = AutoConfig.from_pretrained(base_model_id, token=hf_token, trust_remote_code=True)
+        if getattr(config, "pretraining_tp", 1) != 1:
+            print(f"Overriding pretraining_tp from {getattr(config, 'pretraining_tp', 'N/A')} to 1.")
+            config.pretraining_tp = 1
+
+        base_model_instance = AutoModelForCausalLM.from_pretrained(
+            base_model_id,
+            config=config,
+            quantization_config=bnb_config if device == "cuda" else None,
+            device_map={"": device},
+            token=hf_token,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True if device == "cuda" else False
         )
-        print(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if tokenizer.pad_token_id is not None and tokenizer.pad_token_id >= base_model_instance.config.vocab_size:
-        print("Resizing token embeddings for base model.")
-        base_model_instance.resize_token_embeddings(len(tokenizer))
-
-    # --- Load LoRA Adapter ---
-    print(f"Loading LoRA adapter from: {adapter_path}...")
-    model = PeftModel.from_pretrained(base_model_instance, adapter_path)
-    model.eval()
-    print("LoRA adapter loaded and model is in eval mode.")
-    print(f"Model is on device: {model.device}")
+        print("Base model loaded.")
+
+        if tokenizer.pad_token_id is not None and tokenizer.pad_token_id >= base_model_instance.config.vocab_size:
+            print("Resizing token embeddings for base model.")
+            base_model_instance.resize_token_embeddings(len(tokenizer))
+
+        # --- Load LoRA Adapter ---
+        print(f"Loading LoRA adapter from: {adapter_path}...")
+        model = PeftModel.from_pretrained(base_model_instance, adapter_path)
+        model.eval()
+        print("LoRA adapter loaded and model is in eval mode.")
+        print(f"Model is on device: {model.device}")
+        model_loaded_successfully = True # Set flag on successful load
+        print("Model and tokenizer loaded successfully.")
+
+    except Exception as e:
+        print(f"CRITICAL ERROR during model/tokenizer loading: {e}")
+        model_loaded_successfully = False
+        # Optionally, re-raise or handle to prevent app from starting if model load fails.
+        # For now, it will print error and the /generate endpoint will show model not loaded.
+        # And the health check will show model not ready.
 
 @app.on_event("startup")
 async def startup_event():
-    print("Server startup event:
-
-
-
-
-    print(
-
-
-
-
+    print("Server startup event: Initiating model and tokenizer loading...")
+    # Model loading can take time, so it's done here.
+    # Health checks might hit the server before this completes.
+    load_model_and_tokenizer()
+    if model_loaded_successfully:
+        print("Model loading process completed successfully within startup event.")
+    else:
+        print("Model loading process encountered an error or did not complete within startup event.")
+
+
+# <<< --- ADDED HEALTH CHECK ENDPOINT --- >>>
+@app.get("/")
+async def health_check():
+    """Basic health check endpoint."""
+    if model_loaded_successfully and model is not None and tokenizer is not None:
+        return {"status": "ok", "message": "Model is loaded and ready."}
+    else:
+        # Return a 503 if model isn't ready yet, so Spaces knows it's still starting up
+        # or if loading failed.
+        raise HTTPException(status_code=503, detail="Model is not loaded or still loading.")
+
+@app.get("/health") # Common alternative health check path
+async def health_check_alternative():
+    return await health_check()
+# <<< --- END OF HEALTH CHECK ENDPOINT --- >>>
+
 
 @app.post("/generate/")
 async def generate_text(request: PromptRequest):
-    global model, tokenizer
-    if model is None or tokenizer is None:
-        # This error will be returned to the client
+    global model, tokenizer, model_loaded_successfully
+    if not model_loaded_successfully or model is None or tokenizer is None:
         raise HTTPException(status_code=503, detail="Model is not loaded or still loading. Please try again shortly or check server logs.")
 
     try:
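The additions in this hunk give clients a readiness signal: / and /health return 200 only once the startup event has finished loading, and /generate/ answers 503 until then. A minimal client sketch, assuming the server listens on localhost:8000 and that PromptRequest declares a prompt field (the model class body is outside this diff):

# Hypothetical client for the endpoints above. The base URL and the `prompt`
# field on PromptRequest are assumptions, not shown in the diff.
import time
import requests

BASE = "http://localhost:8000"

# Poll the health check until the startup event has finished loading the model.
while True:
    try:
        if requests.get(f"{BASE}/health", timeout=5).status_code == 200:
            break
    except requests.exceptions.ConnectionError:
        pass  # server process not accepting connections yet
    time.sleep(5)

resp = requests.post(f"{BASE}/generate/", json={"prompt": "Hello, world"}, timeout=300)
resp.raise_for_status()
print(resp.json())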
@@ -156,16 +184,13 @@ async def generate_text(request: PromptRequest):
         raise HTTPException(status_code=500, detail=str(e))
 
 if __name__ == "__main__":
-    print("Starting Uvicorn server directly from app.py...")
-    # Hugging Face Spaces injects the PORT environment variable.
-    # Default to 8000 if not set (for local testing without Spaces).
+    print("Starting Uvicorn server directly from app.py for local testing...")
     port = int(os.environ.get("PORT", 8000))
-    host = "0.0.0.0"
-
+    host = "0.0.0.0"
     print(f"Uvicorn will attempt to listen on host {host}, port {port}")
+    print("Set BASE_MODEL_ID and ADAPTER_PATH environment variables for model loading.")
 
-    # The @app.on_event("startup")
-    # This will trigger load_model_and_tokenizer().
+    # The @app.on_event("startup") will be called by Uvicorn.
     try:
         uvicorn.run(app, host=host, port=port)
     except Exception as e:
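For local testing outside Spaces, the __main__ block expects PORT plus the two model variables to be set before app.py is imported. A minimal launcher sketch; the model id and adapter path below are placeholders, not values from this Space:

# Hypothetical local launcher. BASE_MODEL_ID and ADAPTER_PATH are placeholders;
# substitute your own base model and LoRA adapter.
import os
import uvicorn

os.environ.setdefault("BASE_MODEL_ID", "org/base-model")  # placeholder
os.environ.setdefault("ADAPTER_PATH", "./lora-adapter")   # placeholder
os.environ.setdefault("PORT", "8000")

# Passing "app:app" as a string defers importing app.py until the env vars are set.
uvicorn.run("app:app", host="0.0.0.0", port=int(os.environ["PORT"]))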
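The body of generate_text (old lines 127-155, new lines 155-183) is unchanged, so the diff elides it. For orientation only, a handler over the tokenizer/PeftModel pair loaded above usually has roughly the following shape. This is a generic sketch, not the elided code; the route path, request.prompt, and the generation parameters are assumptions.

# Generic sketch of a generate-style endpoint -- NOT the code elided from this
# diff. The path, `prompt` field, and max_new_tokens value are assumed.
@app.post("/generate-example/")
async def generate_example(request: PromptRequest):
    inputs = tokenizer(request.prompt, return_tensors="pt", padding=True).to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,  # assumed cap
            pad_token_id=tokenizer.pad_token_id,
        )
    return {"generated_text": tokenizer.decode(output_ids[0], skip_special_tokens=True)}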