danmac1 commited on
Commit
b291059
·
1 Parent(s): 27cd88b

Initial FastAPI app with LoRA model

Browse files
Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: uvicorn app:app --host 0.0.0.0 --port $PORT
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
3
+ from peft import PeftModel
4
+ from fastapi import FastAPI, HTTPException
5
+ from pydantic import BaseModel
6
+ import uvicorn
7
+ import os
8
+
9
# --- Global Variables for Model and Tokenizer ---
# Populated once by load_model_and_tokenizer() at server startup; both stay
# None until then, which the /generate/ endpoint uses to answer 503.
model = None
tokenizer = None
# Prefer GPU when available; model weights and input tensors are placed here.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"--- Initializing on Device: {device} ---")
14
+
15
# --- Pydantic Model for Request Body ---
class PromptRequest(BaseModel):
    """Request schema for POST /generate/.

    Only ``prompt`` is required; the sampling knobs default to the values
    below and are passed straight through to ``model.generate``.
    """
    prompt: str                # text to continue
    max_new_tokens: int = 256  # cap on newly generated tokens
    temperature: float = 0.7   # sampling temperature
    top_p: float = 0.9         # nucleus-sampling cutoff
    top_k: int = 50            # top-k sampling cutoff
22
+
23
# --- FastAPI App Initialization ---
# The Procfile serves this object via `uvicorn app:app`.
app = FastAPI()
25
+
26
def load_model_and_tokenizer():
    """Load the (optionally 4-bit quantized) base model plus LoRA adapter.

    Results are stored in the module globals ``model`` and ``tokenizer``.
    Configuration comes from environment variables:

      BASE_MODEL_ID -- HF hub id of the base causal LM (required)
      ADAPTER_PATH  -- local path or hub id of the PEFT adapter (required)
      HF_TOKEN      -- optional auth token for gated/private downloads

    Raises:
        ValueError: if BASE_MODEL_ID or ADAPTER_PATH is unset.
    """
    global model, tokenizer

    base_model_id = os.environ.get("BASE_MODEL_ID")
    adapter_path = os.environ.get("ADAPTER_PATH")
    hf_token = os.environ.get("HF_TOKEN")  # For downloading base model if needed

    if not base_model_id:
        raise ValueError("BASE_MODEL_ID environment variable not set.")
    if not adapter_path:
        raise ValueError("ADAPTER_PATH environment variable not set.")

    print(f"Using device: {device}")
    print(f"Attempting to load base model: {base_model_id}")
    print(f"Attempting to load adapter from: {adapter_path}")

    # --- Load Tokenizer ---
    print(f"Loading tokenizer...")
    try:
        # The adapter directory normally carries the tokenizer used during
        # fine-tuning, so prefer it over the base model's tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(adapter_path, token=hf_token, trust_remote_code=True)
        print(f"Loaded tokenizer from adapter path: {adapter_path}")
    except Exception as e:
        print(f"Could not load tokenizer from adapter path: {e}. Loading from base model path: {base_model_id}")
        tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=hf_token, trust_remote_code=True)

    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            print("Setting pad_token to eos_token.")
            tokenizer.pad_token = tokenizer.eos_token
        else:
            print("Adding new pad_token '[PAD]'.")
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.padding_side = "left"  # Important for generation

    # --- Configure Quantization ---
    print("Configuring 4-bit quantization...")
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() and device == "cuda" else torch.float16

    bnb_config = None
    if device == "cuda":  # bitsandbytes 4-bit quantization requires a GPU
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=True,
        )
        print(f"Using BNB config with compute_dtype: {compute_dtype}")
    else:
        print("Running on CPU, BNB quantization will not be applied.")

    # --- Load Base Model with Quantization ---
    print(f"Loading base model: {base_model_id}...")
    config = AutoConfig.from_pretrained(base_model_id, token=hf_token, trust_remote_code=True)
    # pretraining_tp > 1 switches Llama-family models to a slower
    # tensor-parallel-exact matmul path; force the standard path for inference.
    if getattr(config, "pretraining_tp", 1) != 1:
        print(f"Overriding pretraining_tp from {getattr(config, 'pretraining_tp', 'N/A')} to 1.")
        config.pretraining_tp = 1

    base_model_instance = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        config=config,
        quantization_config=bnb_config,  # already None on CPU, so quantization is skipped there
        device_map={"": device},  # Load directly to the determined device
        token=hf_token,
        trust_remote_code=True,
        low_cpu_mem_usage=(device == "cuda"),  # avoid a full fp32 copy in RAM when targeting GPU
    )
    print("Base model loaded.")

    # If a brand-new '[PAD]' token was added above, its id lies beyond the
    # base model's embedding matrix, so the embeddings must be resized.
    if tokenizer.pad_token_id is not None and tokenizer.pad_token_id >= base_model_instance.config.vocab_size:
        print("Resizing token embeddings for base model.")
        base_model_instance.resize_token_embeddings(len(tokenizer))

    # --- Load LoRA Adapter ---
    print(f"Loading LoRA adapter from: {adapter_path}...")
    # The base model already sits on the target device (device_map above), so
    # PeftModel.from_pretrained places the adapter weights alongside it.
    model = PeftModel.from_pretrained(base_model_instance, adapter_path)
    model.eval()
    print("LoRA adapter loaded and model is in eval mode.")
    print(f"Model is on device: {model.device}")
109
+
110
@app.on_event("startup")
async def startup_event():
    """Load the model and tokenizer when the server starts.

    Failures are logged with a full traceback but do not abort startup: the
    app stays up and /generate/ responds 503 until the model is loaded.
    """
    print("Server startup: Loading model and tokenizer...")
    try:
        load_model_and_tokenizer()
        print("Model and tokenizer loaded successfully.")
    except Exception as e:
        # Deliberately keep the server alive so the failure is observable via
        # the API, but preserve the traceback — str(e) alone is not enough
        # to diagnose model-loading problems.
        import traceback
        print(f"Error during startup model loading: {e}")
        traceback.print_exc()
120
+
121
@app.post("/generate/")
async def generate_text(request: PromptRequest):
    """Generate a continuation for ``request.prompt`` with the LoRA model.

    Returns ``{"generated_text": ...}`` containing only the newly generated
    tokens (the prompt is stripped from the output).

    Raises:
        HTTPException 503: model/tokenizer not loaded yet.
        HTTPException 500: any failure during tokenization or generation.
    """
    global model, tokenizer
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet. Please wait or check server logs.")

    try:
        # Prompts longer than 512 tokens are truncated to bound memory use.
        inputs = tokenizer(request.prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        print(f"Received prompt: {request.prompt}")
        print("Generating...")
        # temperature <= 0 is invalid when sampling; fall back to greedy
        # decoding instead of letting model.generate() raise a 500.
        do_sample = request.temperature > 0
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_new_tokens,
                num_return_sequences=1,
                do_sample=do_sample,
                temperature=request.temperature if do_sample else 1.0,
                top_p=request.top_p,
                top_k=request.top_k,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        prompt_tokens = inputs.input_ids.shape[-1]
        # Decode only tokens generated beyond the prompt; an immediate EOS
        # (no new tokens) yields an empty string rather than an error.
        if outputs[0].size(0) > prompt_tokens:
            generated_text = tokenizer.decode(outputs[0][prompt_tokens:], skip_special_tokens=True)
        else:
            generated_text = ""

        print(f"Generated text: {generated_text}")
        return {"generated_text": generated_text}
    except Exception as e:
        print(f"Error during generation: {e}")
        raise HTTPException(status_code=500, detail=str(e))
158
+
159
# This __main__ block is for local testing only.
# On Hugging Face Spaces the Procfile starts Uvicorn itself:
#   web: uvicorn app:app --host 0.0.0.0 --port $PORT
if __name__ == "__main__":
    # Set the required environment variables before running, e.g.:
    #   export BASE_MODEL_ID="deepseek-ai/deepseek-llm-7b-base"
    #   export ADAPTER_PATH="./my_adapter"
    # then: python app.py
    # (Alternatively: uvicorn app:app --reload --host 0.0.0.0 --port 8000)
    print("Starting server locally for testing...")
    print("Ensure BASE_MODEL_ID and ADAPTER_PATH are set in your environment.")
    # Actually launch the server; the FastAPI startup event loads the model.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8000)))
my_adapter/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-llm-7b-base",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "o_proj",
28
+ "q_proj",
29
+ "down_proj",
30
+ "k_proj",
31
+ "v_proj",
32
+ "gate_proj",
33
+ "up_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
my_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d7abe86e797630ad2e7e9f4b6b6e5df3b5735d616534e35212f3d9650190dea
3
+ size 299883760
my_adapter/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
my_adapter/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
my_adapter/tokenizer_config.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "100000": {
7
+ "content": "<|begin▁of▁sentence|>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "100001": {
15
+ "content": "<|end▁of▁sentence|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "100002": {
23
+ "content": "ø",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "100003": {
31
+ "content": "ö",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "100004": {
39
+ "content": "ú",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "100005": {
47
+ "content": "ÿ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "100006": {
55
+ "content": "õ",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "100007": {
63
+ "content": "÷",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "100008": {
71
+ "content": "û",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "100009": {
79
+ "content": "ý",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "100010": {
87
+ "content": "À",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "100011": {
95
+ "content": "ù",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "100012": {
103
+ "content": "Á",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "100013": {
111
+ "content": "þ",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "100014": {
119
+ "content": "ü",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ }
126
+ },
127
+ "bos_token": "<|begin▁of▁sentence|>",
128
+ "clean_up_tokenization_spaces": false,
129
+ "eos_token": "<|end▁of▁sentence|>",
130
+ "extra_special_tokens": {},
131
+ "legacy": true,
132
+ "model_max_length": 4096,
133
+ "pad_token": "<|end▁of▁sentence|>",
134
+ "sp_model_kwargs": {},
135
+ "tokenizer_class": "LlamaTokenizerFast",
136
+ "unk_token": null,
137
+ "use_default_system_prompt": false
138
+ }
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ torch
4
+ transformers
5
+ peft
6
+ bitsandbytes
7
+ accelerate
8
+ sentencepiece
9
+ pydantic
10
+ python-dotenv # Optional, if you want to use a .env file for local testing