# Author: VietCat
# Commit message: split into threads
# Commit: c09c72a
from fastapi import FastAPI, Request
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModel
import torch
import time
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
# Configure logging: timestamp, level name, then the message, at INFO level.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Cap the pool at one worker thread so the CPU of the free "HFS" tier is not
# overloaded (presumably Hugging Face Spaces -- confirm).
executor = ThreadPoolExecutor(max_workers=1)

app = FastAPI()
# Load the tokenizer and model once, at import time; compute_embedding() reads
# these module-level objects on every call.
model_name = "AITeamVN/Vietnamese_Embedding_v2"
logging.info(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Put the model in evaluation mode; it is only used for inference in this file.
model.eval()
# Limit torch to one CPU thread for its own (intra-op) parallelism.
torch.set_num_threads(1)
logging.info("Model loaded successfully.")
class InputText(BaseModel):
    """Request body for the embedding endpoint."""

    # The text to embed.
    text: str
@app.get("/")
def root():
    """Health-check endpoint: log the call time and return a status message."""
    checked_at = datetime.now().isoformat()
    logging.info(f"[GET /] Health check at {checked_at}")

    status = {"message": "Vietnamese Embedding API is running."}
    return status
# Embedding computation, kept separate from the HTTP handler.
def compute_embedding(text: str):
    """Embed ``text`` and return the vector as a plain Python list.

    The text is tokenized (with truncation enabled), run through the model
    with gradient tracking disabled, and the last-layer hidden state at the
    first token position is returned. Start and end of the computation are
    logged together with the token count, vector size and elapsed time.

    Args:
        text: The input string to embed.

    Returns:
        The values of ``last_hidden_state`` at token position 0, converted
        with ``tolist()``.
    """
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the wall clock is adjusted during the call.
    start_time = time.perf_counter()
    start_ts = datetime.now().isoformat()

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    token_count = inputs["input_ids"].shape[1]
    # !r escapes newlines and control characters, so request text cannot
    # split or forge log lines.
    logging.info(f"[EMBED] Start: {start_ts} | Input: {text[:50]!r}... | Tokens: {token_count}")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Take the first-token vector; squeeze(0) drops only the batch dimension
    # instead of every size-1 dimension.
    embedding = outputs.last_hidden_state[:, 0, :].squeeze(0).tolist()

    end_ts = datetime.now().isoformat()
    duration_ms = (time.perf_counter() - start_time) * 1000
    logging.info(f"[EMBED] Done: {end_ts} | Embedding size: {len(embedding)} | Time: {duration_ms:.2f} ms")
    return embedding
@app.post("/embed")
def get_embedding(data: InputText):
    """Compute the embedding for ``data.text`` and return it as JSON.

    The work is handed to the module-level ``executor``; this handler blocks
    until that task has finished.
    """
    # Queue the job on the thread pool.
    future = executor.submit(compute_embedding, data.text)
    # Wait here until the job is done.
    vector = future.result()
    return {"embedding": vector}