| from fastapi import FastAPI |
| from pydantic import BaseModel |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
| import torch |
| from fastapi.middleware.cors import CORSMiddleware |
| from peft import PeftModel |
|
|
|
|
| |
| |
| |
| |
# Identifier handed to from_pretrained() for both the tokenizer and the weights.
model_name = "thedeba/Friday"

# Device that request tensors are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="auto" already places the weights at load time, so the model is
# deliberately NOT moved again with model.to(device): moving a model that was
# dispatched this way triggers a warning, and raises when some modules have
# been offloaded.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
| |
| |
| |
app = FastAPI()

# Open CORS policy: any origin, method and header may call this API.
# Credentials stay disabled because a wildcard origin must not be combined
# with allow_credentials=True; nothing in this file reads cookies or
# Authorization headers. To support credentialed browser requests, list the
# allowed origins explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
# Request body accepted by the /generate endpoint.
class Query(BaseModel):
    # The user's message; it becomes the content of the single "user" turn.
    text: str
|
|
@app.post("/generate")
def generate(query: Query):
    """Generate a reply to ``query.text`` with the loaded chat model.

    The text is wrapped as a single user message, rendered through the
    tokenizer's chat template, and passed to ``model.generate``. Only the
    newly generated tokens are decoded, so the prompt is never echoed back.

    Args:
        query: Request body whose ``text`` field holds the user's message.

    Returns:
        A dict of the form ``{"response": <generated text>}``.
    """
    messages = [{"role": "user", "content": query.text}]

    # return_dict=True yields input_ids together with the attention mask, so
    # generate() does not have to guess which positions are padding.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=2048,
        use_cache=True,
        # temperature and min_p are only honoured in sampling mode.
        do_sample=True,
        temperature=0.5,
        min_p=0.1,
    )

    # Drop the prompt tokens by position rather than splitting the decoded
    # text on the word "assistant", which would truncate any reply that
    # happens to contain that word.
    completion_ids = outputs[0][prompt_length:]
    response = tokenizer.decode(completion_ids, skip_special_tokens=True).strip()
    return {"response": response}
|
|
@app.get("/")
def root():
    # Fixed status payload returned for GET requests on the root path.
    status = {"Friday": "is running!"}
    return status
|
|
if __name__ == "__main__":
    # uvicorn is only needed when this file is executed directly.
    import uvicorn

    # Serve the app on every network interface at port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)