"""Minimal FastAPI inference server wrapping a vLLM Llama model."""

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from vllm import LLM, SamplingParams

app = FastAPI()

# NOTE(review): wildcard CORS ("*" origins/methods/headers) is wide open —
# acceptable for a local demo, but lock down allow_origins before exposing
# this service anywhere public.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Model is loaded once at import time: vLLM engine construction is slow and
# reserves GPU memory (90% utilization, fp16), so it must not run per-request.
llama = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    dtype="half",
    gpu_memory_utilization=0.9,
)


@app.post("/generate")
def generate(prompt: str, temperature: float = 0.7, max_tokens: int = 100) -> dict:
    """Generate a completion for ``prompt``.

    The original hard-coded sampling constants are now optional parameters;
    their defaults reproduce the previous behavior exactly, so existing
    callers are unaffected.

    Args:
        prompt: Input text to complete (required).
        temperature: Sampling temperature (default 0.7, as before).
        max_tokens: Maximum number of tokens to generate (default 100).

    Returns:
        ``{"text": <generated completion>}``.
    """
    params = SamplingParams(temperature=temperature, max_tokens=max_tokens)
    # vLLM's generate() is batch-oriented: submit a single-element batch and
    # unwrap the first (only) request's first candidate output.
    output = llama.generate([prompt], params)
    return {"text": output[0].outputs[0].text}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)