"""FastAPI service exposing one text-generation endpoint backed by a Hugging Face model.

The tokenizer and model are loaded once, at import time, and shared by every
request handled by ``POST /infer``.
"""

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

app = FastAPI()

MODEL_NAME = "5CD-AI/Vintern-1B-v2"

# Token budgets used by the /infer endpoint.
_MAX_PROMPT_TOKENS = 512
_MAX_NEW_TOKENS = 256

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("Loading model (INT4, CPU)...")
# NOTE(review): load_in_4bit relies on bitsandbytes quantization, which has
# traditionally required a CUDA device, and float16 is poorly supported by many
# CPU kernels -- confirm this combination actually loads and runs with
# device_map="cpu".
# NOTE(review): this checkpoint may ship custom modelling code that needs
# trust_remote_code=True -- confirm AutoModelForCausalLM resolves it as-is.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    load_in_4bit=True,
    device_map="cpu",
    torch_dtype=torch.float16,
)


class InferRequest(BaseModel):
    """JSON body accepted by ``POST /infer``."""

    # Raw prompt text that is passed to the tokenizer.
    text: str


def _generate_text(prompt):
    """Tokenize *prompt*, run generation, and return the decoded output string.

    The prompt is truncated to ``_MAX_PROMPT_TOKENS`` tokens and generation is
    capped at ``_MAX_NEW_TOKENS`` new tokens with sampling disabled.
    """
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=_MAX_PROMPT_TOKENS,
    )

    # Gradients are never needed when serving, so skip autograd bookkeeping.
    with torch.no_grad():
        sequences = model.generate(
            **encoded,
            max_new_tokens=_MAX_NEW_TOKENS,
            do_sample=False,
        )

    # Only the first (and only) sequence of the batch is decoded.
    # NOTE(review): the whole output sequence is decoded; for decoder-only
    # models generate() normally returns the prompt tokens followed by the new
    # ones, so the text presumably starts with the prompt -- confirm intended.
    return tokenizer.decode(sequences[0], skip_special_tokens=True)


@app.post("/infer")
def infer(req: InferRequest):
    """Run the model on ``req.text`` and return ``{"result": <decoded text>}``."""
    return {"result": _generate_text(req.text)}