|
|
from fastapi import FastAPI |
|
|
from fastapi.middleware.cors import CORSMiddleware |
|
|
from vllm import LLM, SamplingParams |
|
|
|
|
|
# FastAPI application object; the /generate endpoint below is registered on it.
app = FastAPI()


# Allow browser clients from any origin to call the API.
# NOTE(review): wildcard origins/methods/headers leave the service wide open —
# acceptable for a demo, but confirm this is intended before public deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
|
# Build the vLLM engine once at module import so all requests share it.
# NOTE(review): loading weights happens at import time and blocks startup —
# confirm that is acceptable for the deployment target.
llama = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    dtype="half",  # fp16 to reduce GPU memory footprint
    gpu_memory_utilization=0.9  # fraction of GPU memory vLLM may pre-allocate
)
|
|
|
|
|
@app.post("/generate")
def generate(prompt: str, temperature: float = 0.7, max_tokens: int = 100):
    """Generate a completion for *prompt* using the shared Llama engine.

    Parameters
    ----------
    prompt : str
        The user prompt to complete.
    temperature : float, optional
        Sampling temperature (default 0.7, the previously hard-coded value).
    max_tokens : int, optional
        Maximum number of tokens to generate (default 100, as before).

    Returns
    -------
    dict
        ``{"text": <generated completion>}``.
    """
    # Defaults match the original hard-coded SamplingParams, so existing
    # callers see identical behavior; new callers may tune per request.
    params = SamplingParams(temperature=temperature, max_tokens=max_tokens)
    # llama.generate takes a batch of prompts; we send one and unwrap the
    # first candidate of the first (only) result.
    output = llama.generate([prompt], params)
    return {"text": output[0].outputs[0].text}
|
|
|
|
|
if __name__ == "__main__":
    # Run a standalone server when executed directly (not under an external
    # ASGI runner). Binds all interfaces on port 7860.
    import uvicorn

    uvicorn.run(app=app, host="0.0.0.0", port=7860)