"""FastAPI service that generates text with a locally loaded Phi-3 model."""

import os

import uvicorn
from fastapi import FastAPI
from huggingface_hub import InferenceClient
from pydantic import BaseModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

# Hub id shared by the model and the tokenizer so the two cannot drift apart.
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"

# Lower bound applied to the requested temperature before sampling.
MIN_TEMPERATURE = 1e-2

app = FastAPI()

# Read eagerly: a missing HF_TOKEN raises KeyError at import time.
HF_TOKEN = os.environ["HF_TOKEN"]

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    token=HF_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
)


class Item(BaseModel):
    """Request body accepted by the ``/generate/`` endpoint."""

    # Text that is tokenized and passed to the model.
    prompt: str
    # NOTE(review): required by the schema but not read by generate();
    # confirm whether it should be merged into the prompt.
    system_prompt: str
    # Sampling temperature; values below MIN_TEMPERATURE are raised to it.
    temperature: float = 0.2
    # Maximum number of tokens to generate.
    max_new_tokens: int = 1048
    # Nucleus-sampling threshold forwarded to model.generate.
    top_p: float = 0.15
    # Repetition penalty forwarded to model.generate.
    repetition_penalty: float = 1.0


def generate(item: Item):
    """Run sampled generation for ``item.prompt`` and return the decoded text.

    Args:
        item: Request parameters. ``temperature``, ``top_p``,
            ``repetition_penalty`` and ``max_new_tokens`` are forwarded to
            ``model.generate``.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    # Forward the clamped value; do_sample=True is always set below, and the
    # clamp guarantees sampling never receives a temperature below the floor.
    temperature = max(float(item.temperature), MIN_TEMPERATURE)

    encoded = tokenizer(item.prompt, return_tensors="pt", add_special_tokens=True)
    # The model is loaded with device_map="auto", so follow the device it
    # reports instead of assuming CPU.
    model_inputs = encoded.to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=item.max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_p=float(item.top_p),
        repetition_penalty=float(item.repetition_penalty),
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)


@app.post("/generate/")
async def generate_text(item: Item):
    """Handle POST ``/generate/`` and wrap the generated text in a JSON object."""
    # NOTE(review): generate() is a synchronous call, so this coroutine does
    # not yield while the model is running.
    return {"response": generate(item)}