# app.py — FastAPI text-generation service for 5CD-AI/Vintern-1B-v2.
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
app = FastAPI()

MODEL_NAME = "5CD-AI/Vintern-1B-v2"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading model (CPU)...")
# NOTE(review): the original passed load_in_4bit=True together with
# device_map="cpu". bitsandbytes 4-bit quantization requires a CUDA
# device, so that combination raises at load time. On CPU the weights
# are loaded unquantized, and float32 is used because many CPU kernels
# do not support float16. To restore INT4 on a GPU, pass a
# BitsAndBytesConfig via quantization_config instead.
# NOTE(review): this model family may require trust_remote_code=True
# (custom architecture) — confirm against the model card before deploy.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    torch_dtype=torch.float32,
)
model.eval()  # inference-only service: disable dropout/training modes
class InferRequest(BaseModel):
    """Request payload for the inference endpoint.

    Validated by pydantic on every incoming request.
    """

    # Raw input text to run through the language model.
    text: str
@app.post("/infer")  # FIX: route was never registered — app had no endpoints
def infer(req: InferRequest):
    """Run deterministic (greedy) text generation on the request text.

    The prompt is truncated to 512 tokens and at most 256 new tokens
    are generated with sampling disabled, so the output is repeatable.

    Returns:
        dict: {"result": "<decoded model output>"}.
    """
    inputs = tokenizer(
        req.text,
        return_tensors="pt",
        truncation=True,  # cap prompt length so generation stays bounded
        max_length=512,
    )
    # Inference only — skip autograd bookkeeping.
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # greedy decoding: deterministic output
        )
    # NOTE(review): decode of output[0] includes the prompt tokens as
    # well as the completion; slice off the input length if only the
    # completion should be returned — confirm desired API contract.
    result = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"result": result}