# harismlnaslm's picture
# feat: minimal FastAPI app for Llama via HF Inference Endpoint; Dockerfile + requirements
# 02a6500
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import os, requests
app = FastAPI()
class ChatRequest(BaseModel):
message: str
@app.get("/")
def root():
    """Service banner: the app name and the configured default model id."""
    model_id = os.getenv("DEFAULT_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
    return {"name": "Textilindo AI Power", "model": model_id}
@app.get("/health")
def health():
    """Liveness probe; always reports healthy when the process is up."""
    status_report = {"status": "healthy"}
    return status_report
@app.post("/chat")
def chat(body: ChatRequest):
    """Proxy one chat message to a Hugging Face Inference Endpoint.

    Reads the endpoint URL and API token from the environment, sends the
    user's message with a fixed Indonesian-language system prompt, and
    returns the model reply as ``{"response": <text-or-None>}``.

    Raises:
        HTTPException 500: endpoint URL or token is not configured.
        HTTPException 502: the upstream request fails, returns an error
            status, or yields a non-JSON body.
    """
    endpoint = (os.getenv("HF_ENDPOINT_URL") or "").rstrip("/")
    # NOTE(review): "HUGGINGFAC_API_KEY_2" looks like a typo'd secret name,
    # but it may match a deliberately-named deployment secret — kept as-is.
    token = os.getenv("HUGGINGFACE_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFAC_API_KEY_2")
    model = os.getenv("DEFAULT_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
    if not endpoint or not token:
        raise HTTPException(status_code=500, detail="Endpoint or token not configured")

    url = f"{endpoint}/v1/chat/completions"
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "Jawablah singkat dalam Bahasa Indonesia."},
            {"role": "user", "content": body.message},
        ],
        "temperature": 0.5,
        "top_p": 0.9,
        "max_tokens": 180,
    }

    # Fix: network failures (timeout, DNS error, connection refused) previously
    # escaped as unhandled exceptions — a raw 500 with a stack trace. Surface
    # them as a clean 502, consistent with the upstream-error path below.
    try:
        r = requests.post(url, headers=headers, json=payload, timeout=60)
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Upstream request failed: {exc}") from exc
    if r.status_code >= 400:
        raise HTTPException(status_code=502, detail=r.text)

    # Fix: a non-JSON upstream body previously crashed with an unhandled
    # ValueError from r.json(); report it as an upstream (502) error instead.
    try:
        data = r.json()
    except ValueError as exc:
        raise HTTPException(status_code=502, detail="Upstream returned non-JSON response") from exc

    # Defensive extraction: an empty/missing "choices" list yields None
    # rather than an IndexError.
    content = (data.get("choices") or [{}])[0].get("message", {}).get("content")
    return {"response": content}