Spaces:
Runtime error
Runtime error
| from fastapi import FastAPI | |
| from pydantic import BaseModel | |
| from huggingface_hub import InferenceClient | |
| import uvicorn | |
| import os | |
| from transformers import ( | |
| AutoModelForCausalLM, | |
| AutoTokenizer, | |
| TextIteratorStreamer, | |
| ) | |
# Hub id shared by the model and the tokenizer so the two cannot drift apart.
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"

app = FastAPI()

# Read with .get() so an unset variable yields None instead of raising
# KeyError while the module is being imported.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Both objects are created once at import time and reused by every request.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    token=HF_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    token=HF_TOKEN,
)
class Item(BaseModel):
    """Request payload for a single text-generation call.

    ``prompt`` and ``system_prompt`` are required; the remaining fields are
    optional generation settings that fall back to the defaults below.
    """

    # --- required text fields ---
    prompt: str
    system_prompt: str

    # --- optional generation settings ---
    temperature: float = 0.2
    max_new_tokens: int = 1048
    top_p: float = 0.15
    repetition_penalty: float = 1.0
def generate(item: Item):
    """Generate text for ``item.prompt`` with the module-level model.

    Args:
        item: Request payload carrying the prompt and the sampling settings
            (``temperature``, ``top_p``, ``repetition_penalty``,
            ``max_new_tokens``).

    Returns:
        The first sequence returned by ``model.generate``, decoded in full
        with special tokens skipped.
    """
    # Sampling needs a strictly positive temperature, so clamp tiny values and
    # pass the clamped number (not the raw request value) to the model.
    temperature = max(float(item.temperature), 1e-2)

    # NOTE(review): item.system_prompt is required by the request schema but
    # is not used here -- confirm whether a chat template was intended.
    encoded = tokenizer(item.prompt, return_tensors="pt", add_special_tokens=True)

    # The model is loaded with device_map="auto"; follow wherever its weights
    # were placed rather than assuming CPU.
    model_inputs = encoded.to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=item.max_new_tokens,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_p=float(item.top_p),
        repetition_penalty=float(item.repetition_penalty),
    )
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
async def generate_text(item: Item):
    """Run ``generate`` for *item* and wrap the text in a response dict.

    Args:
        item: Request payload passed through unchanged to ``generate``.

    Returns:
        ``{"response": <generated text>}``.

    Raises:
        Whatever ``generate`` raises; the exception propagates through the
        awaited executor future.
    """
    # NOTE(review): no route decorator is visible on this coroutine in this
    # view -- confirm that it is registered on ``app``.
    import asyncio  # function-scope import keeps this block self-contained

    # ``generate`` is synchronous and long-running; calling it inline would
    # stall the event loop, so hand it to the default thread-pool executor.
    loop = asyncio.get_running_loop()
    text = await loop.run_in_executor(None, generate, item)
    return {"response": text}