# code-llama / main.py
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

app = FastAPI()

# Hugging Face access token, read from the environment (e.g. a Space secret).
HF_TOKEN = os.environ["HF_TOKEN"]

# Load the instruct model; device_map="auto" places the weights on GPU when available.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="auto",
    token=HF_TOKEN,
)

# Tokenizer for building the chat prompt and decoding the generated ids.
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    token=HF_TOKEN,
)

class Item(BaseModel):
    """Request body for the /generate/ endpoint."""

    prompt: str
    system_prompt: str
    temperature: float = 0.0
    max_new_tokens: int = 1048
    top_p: float = 0.15
    repetition_penalty: float = 1.0
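
# Example request body for the schema above (illustrative values only):
# {
#     "prompt": "Write a function that reverses a string.",
#     "system_prompt": "You are a helpful coding assistant.",
#     "temperature": 0.2
# }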
@app.post("/generate/")
async def generate_text(item: Item):
    # Format the system and user messages with the model's chat template.
    messages = [
        {"role": "system", "content": item.system_prompt},
        {"role": "user", "content": item.prompt},
    ]
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    output_ids = model.generate(
        input_ids,
        max_new_tokens=item.max_new_tokens,
        temperature=item.temperature,
        top_p=item.top_p,
        repetition_penalty=item.repetition_penalty,
        # Greedy decoding when temperature is 0, sampling otherwise.
        do_sample=item.temperature > 0,
    )
    # Decode only the newly generated tokens, skipping the echoed prompt.
    response = tokenizer.decode(
        output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
    )
    return {"response": response}