from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModel
import torch
import time
import logging
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor

# Configure logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO
)

# Limit the pool to a single worker so the free HFS CPU is not overloaded
executor = ThreadPoolExecutor(max_workers=1)

app = FastAPI()

# Load model
model_name = "AITeamVN/Vietnamese_Embedding_v2"
logging.info(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()
torch.set_num_threads(1)
logging.info("Model loaded successfully.")


class InputText(BaseModel):
    text: str


@app.get("/")
def root():
    now = datetime.now().isoformat()
    logging.info(f"[GET /] Health check at {now}")
    return {"message": "Vietnamese Embedding API is running."}


# Embedding computation kept in its own function
def compute_embedding(text: str):
    start_time = time.time()
    start_ts = datetime.now().isoformat()

    # Tokenize the input and log how many tokens the model will process
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    token_count = inputs["input_ids"].shape[1]
    logging.info(f"[EMBED] Start: {start_ts} | Input: '{text[:50]}'... | Tokens: {token_count}")

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the hidden state of the first ([CLS]) token as the sentence embedding
    embedding = outputs.last_hidden_state[:, 0, :].squeeze().tolist()

    end_ts = datetime.now().isoformat()
    duration_ms = (time.time() - start_time) * 1000
    logging.info(f"[EMBED] Done: {end_ts} | Embedding size: {len(embedding)} | Time: {duration_ms:.2f} ms")

    return embedding


@app.post("/embed")
def get_embedding(data: InputText):
    # Submit to the thread pool (blocks until the job is finished)
    embedding = executor.submit(compute_embedding, data.text).result()
    return {"embedding": embedding}
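

# --- Example client usage (a minimal sketch, not part of the service itself) ---
# Assumes the server was started with something like `uvicorn main:app` and is
# reachable at http://localhost:8000; the module name, host, and port are
# assumptions, adjust them to your deployment.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/embed",
#       json={"text": "Xin chào, đây là một câu ví dụ."},
#   )
#   resp.raise_for_status()
#   vector = resp.json()["embedding"]
#   print(len(vector))  # dimensionality of the returned embedding vector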