import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

# Base model on the Hugging Face Hub, plus the fine-tuned PEFT adapter trained on top of it
BASE_MODEL = "mistralai/Mistral-7B-v0.1"
ADAPTER_MODEL = "fansa34/finetunedModel"

# 4-bit NF4 quantization with double quantization; matmuls run in fp16
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Mistral's tokenizer ships without a pad token; reuse EOS so generate() can pad
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    quantization_config=quant_config,
)

# Attach the LoRA adapter to the quantized base model and switch to inference mode
model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
model.eval()

class QueryRequest(BaseModel):
    question: str
    max_new_tokens: int = 200
    temperature: float = 0.6

@app.post("/ask")
def ask(req: QueryRequest):
    # Plain (non-async) handler: model.generate() blocks, and FastAPI runs
    # sync endpoints in a threadpool, so the event loop stays responsive.
    prompt = f"Question: {req.question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=req.max_new_tokens,
            temperature=req.temperature,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode the full sequence and keep only the text after the "Answer:" marker
    response = tokenizer.decode(output[0], skip_special_tokens=True).split("Answer:")[-1].strip()
    return {"question": req.question, "answer": response}