```python
from typing import Annotated

import torch
from fastapi import FastAPI, Form
from fastapi.responses import FileResponse
from transformers import AutoModelForCausalLM, AutoTokenizer

app = FastAPI()


@app.get("/")  # route decorators were missing in the extract; paths here are assumed
async def root():
    # Serve the static front-end page
    return FileResponse("home.html")


@app.post("/hello")  # assumed path for the form handler
def say_hello(msg: Annotated[str, Form()]):
    print("model")
    checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
    device = "cuda" if torch.cuda.is_available() else "cpu"  # "cuda" for GPU usage or "cpu" for CPU usage
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
    model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

    # Wrap the form message in the chat template expected by the instruct model
    messages = [{"role": "user", "content": msg}]
    input_text = tokenizer.apply_chat_template(messages, tokenize=False)
    print(input_text)

    print("output")
    inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
    outputs = model.generate(inputs, max_new_tokens=256, temperature=0.6, top_p=0.92, do_sample=True)
    print("complete")
    return {"message": tokenizer.decode(outputs[0])}
```