# inference/app.py
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from vllm import LLM, SamplingParams

app = FastAPI()

# Allow cross-origin requests from any origin (wide open; fine for a demo Space).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the model once at import time; half precision keeps the 1B-parameter
# model within a single GPU's memory.
llama = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    dtype="half",
    gpu_memory_utilization=0.9,
)

@app.post("/generate")
def generate(prompt: str):
    # FastAPI treats a bare `str` parameter on a POST route as a query
    # parameter, so the prompt arrives as ?prompt=... rather than in the body.
    params = SamplingParams(temperature=0.7, max_tokens=100)
    outputs = llama.generate([prompt], params)
    return {"text": outputs[0].outputs[0].text}

if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
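
# A minimal client sketch for exercising the endpoint above. This is an
# illustration, not part of the app: it assumes the server is already running
# on localhost:7860 (the default port Hugging Face Spaces expose) and that the
# `requests` package is installed. Because `generate` declares a bare
# `prompt: str`, the prompt must be sent in the query string.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/generate",
#       params={"prompt": "Write a haiku about GPUs."},
#   )
#   print(resp.json()["text"])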