Spaces:
Runtime error
Runtime error
File size: 1,527 Bytes
from fastapi import FastAPI
from pydantic import BaseModel
from huggingface_hub import InferenceClient
import uvicorn
import os
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TextIteratorStreamer,
)
# ASGI application object; the route decorator further down registers on it.
app = FastAPI()

# Read eagerly: a missing HF_TOKEN raises KeyError at import time.
HF_TOKEN = os.environ["HF_TOKEN"]

# Single checkpoint id shared by the model and tokenizer loads.
_MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"

# device_map="auto" delegates weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    device_map="auto",
    token=HF_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(
    _MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
)
class Item(BaseModel):
    """Request body schema for the text-generation endpoint."""

    # Required text fields (no defaults).
    prompt: str
    system_prompt: str

    # Optional generation settings with their defaults.
    temperature: float = 0.2
    max_new_tokens: int = 1048
    top_p: float = 0.15
    repetition_penalty: float = 1.0
def generate(item: Item):
    """Run one sampled completion for the request described by *item*.

    Args:
        item: Request payload. ``prompt``, ``temperature``, ``top_p``,
            ``repetition_penalty`` and ``max_new_tokens`` are read.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
    """
    # Clamp to a small positive floor; the clamped value (not the raw request
    # value) is what must reach the sampler.
    temperature = max(float(item.temperature), 1e-2)
    top_p = float(item.top_p)
    repetition_penalty = float(item.repetition_penalty)

    # NOTE(review): item.system_prompt is part of the request schema but is
    # not used when building the model input — confirm whether it should be.
    encoded = tokenizer(item.prompt, return_tensors="pt", add_special_tokens=True)

    # The model is loaded with device_map="auto", so follow the model's device
    # instead of assuming CPU.
    model_inputs = encoded.to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=item.max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
@app.post("/generate/")
async def generate_text(item: Item):
    """Handle POST /generate/ and return the completion for *item*.

    Args:
        item: Parsed request body.

    Returns:
        A dict with a single ``"response"`` key holding the value returned
        by ``generate(item)``.
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    # generate() is blocking; running it directly inside a coroutine would
    # stall the event loop for every other request until it finishes.
    completion = await run_in_threadpool(generate, item)
    return {"response": completion}
|