varshithkumar committed on
Commit
8a85f65
·
1 Parent(s): fa90325

Added app.py and requirements.txt

Browse files
Files changed (2) hide show
  1. app.py +14 -14
  2. requirements.txt +1 -3
app.py CHANGED
@@ -1,4 +1,4 @@
1
- from fastapi import FastAPI, Request
2
  from pydantic import BaseModel
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from peft import PeftModel
@@ -7,44 +7,44 @@ import os
7
 
8
  app = FastAPI()
9
 
10
- # Load Hugging Face token from environment
11
- HF_TOKEN = os.environ.get("HF_TOKEN")
12
-
13
- # Base and LoRA model names
14
  BASE_MODEL = "google/gemma-2b-it"
15
  LORA_MODEL = "varshithkumar/gemma-finetuned-sql"
16
 
17
- # Load base model and tokenizer
 
 
18
  print("Loading base model...")
19
  base_model = AutoModelForCausalLM.from_pretrained(
20
  BASE_MODEL,
21
- device_map="auto",
22
- token=HF_TOKEN
23
- )
 
 
 
24
  tokenizer = AutoTokenizer.from_pretrained(
25
  BASE_MODEL,
26
  use_fast=True,
27
- token=HF_TOKEN
28
  )
29
 
30
- # Apply LoRA weights
31
  print("Applying LoRA adapter...")
32
  model = PeftModel.from_pretrained(
33
  base_model,
34
  LORA_MODEL,
35
- token=HF_TOKEN
36
  )
37
 
38
  print("Model loaded successfully!")
39
 
40
- # Define input schema
41
  class InputData(BaseModel):
42
  prompt: str
43
  max_length: int = 100
44
 
45
  @app.post("/generate")
46
  def generate_text(data: InputData):
47
- inputs = tokenizer(data.prompt, return_tensors="pt").to(model.device)
48
  outputs = model.generate(**inputs, max_length=data.max_length)
49
  text = tokenizer.decode(outputs[0], skip_special_tokens=True)
50
  return {"response": text}
 
1
+ from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from peft import PeftModel
 
7
 
8
  app = FastAPI()
9
 
10
+ HF_TOKEN = os.getenv("HF_TOKEN")
 
 
 
11
  BASE_MODEL = "google/gemma-2b-it"
12
  LORA_MODEL = "varshithkumar/gemma-finetuned-sql"
13
 
14
+ # Choose device
15
+ device = "cuda" if torch.cuda.is_available() else "cpu"
16
+
17
  print("Loading base model...")
18
  base_model = AutoModelForCausalLM.from_pretrained(
19
  BASE_MODEL,
20
+ device_map=None, # Avoid auto offloading
21
+ torch_dtype=torch.float16 if device == "cuda" else torch.float32,
22
+ use_auth_token=HF_TOKEN
23
+ ).to(device)
24
+
25
+ print("Loading tokenizer...")
26
  tokenizer = AutoTokenizer.from_pretrained(
27
  BASE_MODEL,
28
  use_fast=True,
29
+ use_auth_token=HF_TOKEN
30
  )
31
 
 
32
  print("Applying LoRA adapter...")
33
  model = PeftModel.from_pretrained(
34
  base_model,
35
  LORA_MODEL,
36
+ use_auth_token=HF_TOKEN
37
  )
38
 
39
  print("Model loaded successfully!")
40
 
 
41
  class InputData(BaseModel):
42
  prompt: str
43
  max_length: int = 100
44
 
45
  @app.post("/generate")
46
  def generate_text(data: InputData):
47
+ inputs = tokenizer(data.prompt, return_tensors="pt").to(device)
48
  outputs = model.generate(**inputs, max_length=data.max_length)
49
  text = tokenizer.decode(outputs[0], skip_special_tokens=True)
50
  return {"response": text}
requirements.txt CHANGED
@@ -1,9 +1,7 @@
1
  fastapi
2
  uvicorn
3
  transformers
4
- peft
5
- torch
6
  accelerate
 
7
  bitsandbytes
8
  sentencepiece
9
- huggingface_hub
 
1
  fastapi
2
  uvicorn
3
  transformers
 
 
4
  accelerate
5
+ peft
6
  bitsandbytes
7
  sentencepiece