danmac1 commited on
Commit
b291059
·
1 Parent(s): 27cd88b

Initial FastAPI app with LoRA model

Browse files
Procfile ADDED
@@ -0,0 +1 @@
 
 
1
+ web: uvicorn app:app --host 0.0.0.0 --port $PORT
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig
3
+ from peft import PeftModel
4
+ from fastapi import FastAPI, HTTPException
5
+ from pydantic import BaseModel
6
+ import uvicorn
7
+ import os
8
+
9
# --- Global Variables for Model and Tokenizer ---
# Populated once by load_model_and_tokenizer() at server startup; both stay
# None until then, which the /generate/ endpoint uses to answer 503.
model = None
tokenizer = None
# Prefer GPU when available; model weights and input tensors are placed here.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"--- Initializing on Device: {device} ---")
14
+
15
# --- Pydantic Model for Request Body ---
class PromptRequest(BaseModel):
    """Request schema for POST /generate/.

    Only ``prompt`` is required; the sampling knobs default to the values
    below and are passed straight through to ``model.generate``.
    """
    prompt: str                # text to continue
    max_new_tokens: int = 256  # cap on newly generated tokens
    temperature: float = 0.7   # sampling temperature
    top_p: float = 0.9         # nucleus-sampling cutoff
    top_k: int = 50            # top-k sampling cutoff
22
+
23
# --- FastAPI App Initialization ---
# The Procfile serves this object via `uvicorn app:app`.
app = FastAPI()
25
+
26
def load_model_and_tokenizer():
    """Load the (optionally 4-bit quantized) base model plus LoRA adapter.

    Results are stored in the module globals ``model`` and ``tokenizer``.
    Configuration comes from environment variables:

      BASE_MODEL_ID -- HF hub id of the base causal LM (required)
      ADAPTER_PATH  -- local path or hub id of the PEFT adapter (required)
      HF_TOKEN      -- optional auth token for gated/private downloads

    Raises:
        ValueError: if BASE_MODEL_ID or ADAPTER_PATH is unset.
    """
    global model, tokenizer

    base_model_id = os.environ.get("BASE_MODEL_ID")
    adapter_path = os.environ.get("ADAPTER_PATH")
    hf_token = os.environ.get("HF_TOKEN")  # For downloading base model if needed

    if not base_model_id:
        raise ValueError("BASE_MODEL_ID environment variable not set.")
    if not adapter_path:
        raise ValueError("ADAPTER_PATH environment variable not set.")

    print(f"Using device: {device}")
    print(f"Attempting to load base model: {base_model_id}")
    print(f"Attempting to load adapter from: {adapter_path}")

    # --- Load Tokenizer ---
    print(f"Loading tokenizer...")
    try:
        # The adapter directory normally carries the tokenizer used during
        # fine-tuning, so prefer it over the base model's tokenizer.
        tokenizer = AutoTokenizer.from_pretrained(adapter_path, token=hf_token, trust_remote_code=True)
        print(f"Loaded tokenizer from adapter path: {adapter_path}")
    except Exception as e:
        print(f"Could not load tokenizer from adapter path: {e}. Loading from base model path: {base_model_id}")
        tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=hf_token, trust_remote_code=True)

    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            print("Setting pad_token to eos_token.")
            tokenizer.pad_token = tokenizer.eos_token
        else:
            print("Adding new pad_token '[PAD]'.")
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.padding_side = "left"  # Important for generation

    # --- Configure Quantization ---
    print("Configuring 4-bit quantization...")
    compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() and device == "cuda" else torch.float16

    bnb_config = None
    if device == "cuda":  # bitsandbytes 4-bit quantization requires a GPU
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=True,
        )
        print(f"Using BNB config with compute_dtype: {compute_dtype}")
    else:
        print("Running on CPU, BNB quantization will not be applied.")

    # --- Load Base Model with Quantization ---
    print(f"Loading base model: {base_model_id}...")
    config = AutoConfig.from_pretrained(base_model_id, token=hf_token, trust_remote_code=True)
    # pretraining_tp > 1 switches Llama-family models to a slower
    # tensor-parallel-exact matmul path; force the standard path for inference.
    if getattr(config, "pretraining_tp", 1) != 1:
        print(f"Overriding pretraining_tp from {getattr(config, 'pretraining_tp', 'N/A')} to 1.")
        config.pretraining_tp = 1

    base_model_instance = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        config=config,
        quantization_config=bnb_config,  # already None on CPU, so quantization is skipped there
        device_map={"": device},  # Load directly to the determined device
        token=hf_token,
        trust_remote_code=True,
        low_cpu_mem_usage=(device == "cuda"),  # avoid a full fp32 copy in RAM when targeting GPU
    )
    print("Base model loaded.")

    # If a brand-new '[PAD]' token was added above, its id lies beyond the
    # base model's embedding matrix, so the embeddings must be resized.
    if tokenizer.pad_token_id is not None and tokenizer.pad_token_id >= base_model_instance.config.vocab_size:
        print("Resizing token embeddings for base model.")
        base_model_instance.resize_token_embeddings(len(tokenizer))

    # --- Load LoRA Adapter ---
    print(f"Loading LoRA adapter from: {adapter_path}...")
    # The base model already sits on the target device (device_map above), so
    # PeftModel.from_pretrained places the adapter weights alongside it.
    model = PeftModel.from_pretrained(base_model_instance, adapter_path)
    model.eval()
    print("LoRA adapter loaded and model is in eval mode.")
    print(f"Model is on device: {model.device}")
109
+
110
@app.on_event("startup")
async def startup_event():
    """Load the model and tokenizer when the server starts.

    Failures are logged with a full traceback but do not abort startup: the
    app stays up and /generate/ responds 503 until the model is loaded.
    """
    print("Server startup: Loading model and tokenizer...")
    try:
        load_model_and_tokenizer()
        print("Model and tokenizer loaded successfully.")
    except Exception as e:
        # Deliberately keep the server alive so the failure is observable via
        # the API, but preserve the traceback — str(e) alone is not enough
        # to diagnose model-loading problems.
        import traceback
        print(f"Error during startup model loading: {e}")
        traceback.print_exc()
120
+
121
@app.post("/generate/")
async def generate_text(request: PromptRequest):
    """Generate a continuation for ``request.prompt`` with the LoRA model.

    Returns ``{"generated_text": ...}`` containing only the newly generated
    tokens (the prompt is stripped from the output).

    Raises:
        HTTPException 503: model/tokenizer not loaded yet.
        HTTPException 500: any failure during tokenization or generation.
    """
    global model, tokenizer
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet. Please wait or check server logs.")

    try:
        # Prompts longer than 512 tokens are truncated to bound memory use.
        inputs = tokenizer(request.prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        print(f"Received prompt: {request.prompt}")
        print("Generating...")
        # temperature <= 0 is invalid when sampling; fall back to greedy
        # decoding instead of letting model.generate() raise a 500.
        do_sample = request.temperature > 0
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=request.max_new_tokens,
                num_return_sequences=1,
                do_sample=do_sample,
                temperature=request.temperature if do_sample else 1.0,
                top_p=request.top_p,
                top_k=request.top_k,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        prompt_tokens = inputs.input_ids.shape[-1]
        # Decode only tokens generated beyond the prompt; an immediate EOS
        # (no new tokens) yields an empty string rather than an error.
        if outputs[0].size(0) > prompt_tokens:
            generated_text = tokenizer.decode(outputs[0][prompt_tokens:], skip_special_tokens=True)
        else:
            generated_text = ""

        print(f"Generated text: {generated_text}")
        return {"generated_text": generated_text}
    except Exception as e:
        print(f"Error during generation: {e}")
        raise HTTPException(status_code=500, detail=str(e))
158
+
159
# This __main__ block is for local testing only.
# On Hugging Face Spaces the Procfile starts Uvicorn itself:
#   web: uvicorn app:app --host 0.0.0.0 --port $PORT
if __name__ == "__main__":
    # Set the required environment variables before running, e.g.:
    #   export BASE_MODEL_ID="deepseek-ai/deepseek-llm-7b-base"
    #   export ADAPTER_PATH="./my_adapter"
    # then: python app.py
    # (Alternatively: uvicorn app:app --reload --host 0.0.0.0 --port 8000)
    print("Starting server locally for testing...")
    print("Ensure BASE_MODEL_ID and ADAPTER_PATH are set in your environment.")
    # Actually launch the server; the FastAPI startup event loads the model.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8000)))
my_adapter/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "deepseek-ai/deepseek-llm-7b-base",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "o_proj",
28
+ "q_proj",
29
+ "down_proj",
30
+ "k_proj",
31
+ "v_proj",
32
+ "gate_proj",
33
+ "up_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
my_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d7abe86e797630ad2e7e9f4b6b6e5df3b5735d616534e35212f3d9650190dea
3
+ size 299883760
my_adapter/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
my_adapter/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
my_adapter/tokenizer_config.json ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "100000": {
7
+ "content": "<|begin▁of▁sentence|>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "100001": {
15
+ "content": "<|end▁of▁sentence|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "100002": {
23
+ "content": "ø",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "100003": {
31
+ "content": "ö",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "100004": {
39
+ "content": "ú",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "100005": {
47
+ "content": "ÿ",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "100006": {
55
+ "content": "õ",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "100007": {
63
+ "content": "÷",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "100008": {
71
+ "content": "û",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": false
77
+ },
78
+ "100009": {
79
+ "content": "ý",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "100010": {
87
+ "content": "À",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "100011": {
95
+ "content": "ù",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "100012": {
103
+ "content": "Á",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "100013": {
111
+ "content": "þ",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "100014": {
119
+ "content": "ü",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ }
126
+ },
127
+ "bos_token": "<|begin▁of▁sentence|>",
128
+ "clean_up_tokenization_spaces": false,
129
+ "eos_token": "<|end▁of▁sentence|>",
130
+ "extra_special_tokens": {},
131
+ "legacy": true,
132
+ "model_max_length": 4096,
133
+ "pad_token": "<|end▁of▁sentence|>",
134
+ "sp_model_kwargs": {},
135
+ "tokenizer_class": "LlamaTokenizerFast",
136
+ "unk_token": null,
137
+ "use_default_system_prompt": false
138
+ }
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn[standard]
3
+ torch
4
+ transformers
5
+ peft
6
+ bitsandbytes
7
+ accelerate
8
+ sentencepiece
9
+ pydantic
10
+ python-dotenv # Optional, if you want to use a .env file for local testing