| | from fastapi import FastAPI |
| | from transformers import AutoTokenizer, AutoModelForCausalLM |
| | import torch |
| | import os |
| | from pydantic import BaseModel |
| |
|
| | |
# Point the Hugging Face cache locations at /tmp.
# NOTE(review): these variables are assigned after `transformers` has already
# been imported at the top of this file -- confirm the library still honours
# them at this point (the from_pretrained calls also pass cache_dir="/tmp").
os.environ.update(HF_HOME="/tmp", TRANSFORMERS_CACHE="/tmp")
| |
|
| |
|
# Identifier handed to both from_pretrained() calls below.
model_id = "rabiyulfahim/qa_python_gpt2"

# Loaded once at import time; the request handlers reuse these module-level
# objects. Both loaders are told to cache under /tmp.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir="/tmp",
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir="/tmp",
)
| |
|
| |
|
# Application object on which all routes in this file are registered.
app = FastAPI(
    title="QA GPT2 API",
    description="Serving HuggingFace model with FastAPI",
)
| |
|
| |
|
| | |
# Request body schema for the POST /predict endpoint.
class QueryRequest(BaseModel):
    # Prompt text that is tokenized and fed to the model.
    question: str
    # Upper bound on the number of tokens to generate (default 50).
    max_new_tokens: int = 50
    # Sampling temperature requested by the client (default 0.7).
    temperature: float = 0.7
    # Nucleus-sampling probability mass requested by the client (default 0.9).
    top_p: float = 0.9
| |
|
| |
|
@app.get("/")
def home():
    # Landing route: replies with a fixed greeting payload.
    greeting = "Welcome to QA GPT2 API 🚀"
    return {"message": greeting}
| |
|
@app.get("/ask")
def ask(question: str, max_new_tokens: int = 50):
    """Answer a question passed as a query parameter.

    Runs the module-level model on the tokenized question using the model's
    default generation settings (no sampling options are supplied here).

    Args:
        question: Prompt text to feed to the model.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        A dict with the original ``question`` and the decoded ``answer``.
    """
    inputs = tokenizer(question, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        # Pass the pad token id explicitly, exactly as /predict does.
        # Presumably this tokenizer defines no pad token (GPT-2 family), in
        # which case generate() would otherwise fall back to EOS and log a
        # warning on every request -- TODO confirm against the model config.
        pad_token_id=tokenizer.eos_token_id,
    )
    # outputs[0] is the first (only) sequence of the batch.
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"question": question, "answer": answer}
| |
|
| |
|
| |
|
| | |
@app.get("/health")
def health():
    # Health-check route: returns a constant payload and never calls the model.
    return dict(status="ok")
| |
|
| | |
@app.post("/predict")
def predict(request: QueryRequest):
    """Generate a sampled answer for the question in the request body.

    Args:
        request: Payload carrying the question and the generation settings
            ``max_new_tokens``, ``temperature`` and ``top_p``.

    Returns:
        A dict with the original ``question`` and the decoded ``answer``.
    """
    inputs = tokenizer(request.question, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        max_new_tokens=request.max_new_tokens,
        do_sample=True,
        # Forward the caller's sampling settings. They used to be hard-coded
        # to 0.7 / 0.9 here, so the request fields were silently ignored; the
        # QueryRequest defaults are the same values, so requests that omit
        # them behave as before.
        temperature=request.temperature,
        top_p=request.top_p,
        # Use the EOS token id as the pad token id during generation.
        pad_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
    )

    # With return_dict_in_generate=True the token ids live under `.sequences`;
    # index 0 is the first (only) sequence of the batch.
    answer = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)
    return {
        "question": request.question,
        "answer": answer,
    }