"""Minimal FastAPI service that proxies chat prompts to a Hugging Face model."""

import os

from fastapi import FastAPI, HTTPException
from huggingface_hub import InferenceClient
from pydantic import BaseModel

# Model and generation budget are overridable via environment variables so the
# service can be repointed without a code change; defaults match the original
# hard-coded values.
MODEL_ID = os.environ.get("HF_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
MAX_TOKENS = int(os.environ.get("HF_MAX_TOKENS", "500"))

app = FastAPI()

# NOTE(review): HF_TOKEN may be unset, in which case the client issues
# unauthenticated requests and gated models will be rejected — confirm the
# deployment always provides the token.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))


class Request(BaseModel):
    """Request body for POST /generate."""

    # The user's chat prompt, forwarded verbatim as a single user message.
    prompt: str


@app.get("/")
def home() -> dict:
    """Liveness check endpoint."""
    return {"status": "Running"}


@app.post("/generate")
def generate_text(request: Request) -> dict:
    """Send the user's prompt to the chat model and return its reply.

    Args:
        request: Parsed JSON body containing the ``prompt`` string.

    Returns:
        A JSON object ``{"response": <model reply text>}``.

    Raises:
        HTTPException: 502 when the upstream inference call fails, instead of
            leaking an unhandled 500 to the client.
    """
    messages = [{"role": "user", "content": request.prompt}]
    try:
        response = client.chat_completion(
            messages=messages,
            model=MODEL_ID,
            max_tokens=MAX_TOKENS,
        )
    except Exception as exc:  # network/auth/upstream failure -> gateway error
        raise HTTPException(status_code=502, detail=f"Inference failed: {exc}") from exc
    return {"response": response.choices[0].message.content}