File size: 7,665 Bytes

83d5d1c

import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision.models as models
from PIL import Image
import onnxruntime as ort
from huggingface_hub import hf_hub_download

# ==========================================
# CẤU HÌNH REPO
# ==========================================
REPO_ID = "biometric-ai-lab/Face_Recognition"
RECOG_FILENAME = "pytorch_model.bin"
YOLO_FILENAME = "yolov8s-face-lindevs.onnx"


# ==========================================
# 1. MODEL ARCHITECTURE (Giống hệt code bạn)
# ==========================================
class FaceRecognitionModel(nn.Module):
    def __init__(self):
        super(FaceRecognitionModel, self).__init__()
        # Khởi tạo backbone, để weights=None vì ta sẽ load weight train của bạn
        self.backbone = models.wide_resnet101_2(weights=None)
        self.backbone.fc = nn.Identity()
        self.embed = nn.Sequential(
            nn.Linear(2048, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
        )

    def forward(self, img):
        features = self.backbone(img)
        embedding = self.embed(features)
        return F.normalize(embedding, p=2, dim=1)


# ==========================================
# 2. YOLO DETECTOR (Logic chuẩn của bạn)
# ==========================================
class YOLOFaceDetector:
    def __init__(self, model_path, conf_threshold=0.5):
        self.session = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
        self.input_name = self.session.get_inputs()[0].name
        self.output_names = [output.name for output in self.session.get_outputs()]
        self.conf_threshold = conf_threshold
        self.input_size = 640

    def detect_extract_face(self, image_pil, expand_ratio=0.0):
        """
        Input: PIL Image
        Output: PIL Image (Cropped Face)
        """
        # Convert PIL -> OpenCV (BGR) để giống logic cũ
        image_np = np.array(image_pil)
        image_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
        img_height, img_width = image_bgr.shape[:2]

        # Preprocess (Resize -> RGB -> Norm -> Transpose)
        img_resized = cv2.resize(image_bgr, (self.input_size, self.input_size))
        # Lưu ý: YOLO training thường dùng RGB
        img_rgb = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
        img_normalized = img_rgb.astype(np.float32) / 255.0
        img_transposed = np.transpose(img_normalized, (2, 0, 1))
        img_batch = np.expand_dims(img_transposed, axis=0)

        # Inference
        outputs = self.session.run(self.output_names, {self.input_name: img_batch})
        predictions = outputs[0]

        if len(predictions.shape) == 3:
            predictions = predictions[0].T

        best_face = None
        max_area = 0

        # Post-process
        for pred in predictions:
            conf = pred[4]
            if conf > self.conf_threshold:
                x_center, y_center, w, h = pred[:4]

                # Scale về ảnh gốc
                x_center = x_center * img_width / self.input_size
                y_center = y_center * img_height / self.input_size
                w = w * img_width / self.input_size
                h = h * img_height / self.input_size

                x1 = int(x_center - w / 2)
                y1 = int(y_center - h / 2)
                x2 = int(x_center + w / 2)
                y2 = int(y_center + h / 2)

                x1 = max(0, x1)
                y1 = max(0, y1)
                x2 = min(img_width, x2)
                y2 = min(img_height, y2)

                area = (x2 - x1) * (y2 - y1)

                # Lấy mặt to nhất
                if area > max_area:
                    max_area = area
                    best_face = (x1, y1, x2, y2)

        # Crop ảnh
        if best_face:
            x1, y1, x2, y2 = best_face

            # Xử lý expand_ratio (nếu có dùng)
            if expand_ratio != 0:
                w_box = x2 - x1
                h_box = y2 - y1
                pad = int(expand_ratio * max(w_box, h_box))
                x1 = max(0, x1 - pad)
                y1 = max(0, y1 - pad)
                x2 = min(img_width, x2 + pad)
                y2 = min(img_height, y2 + pad)

            # Crop từ ảnh gốc PIL (để giữ chất lượng tốt nhất)
            return image_pil.crop((x1, y1, x2, y2))

        print("⚠️ Warning: No face detected. Using full image.")
        return image_pil


# ==========================================
# 3. FACE ANALYSIS WRAPPER
# ==========================================
class FaceAnalysis:
    def __init__(self, device=None):
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"🚀 Initializing Face Analysis on {self.device}...")

        # 1. Tải Model
        try:
            print(f"📥 Checking models from {REPO_ID}...")
            recog_path = hf_hub_download(repo_id=REPO_ID, filename=RECOG_FILENAME)
            yolo_path = hf_hub_download(repo_id=REPO_ID, filename=YOLO_FILENAME)
        except Exception as e:
            raise RuntimeError(f"❌ Failed to download models. Check internet or Repo ID.\nError: {e}")

        # 2. Init YOLO
        self.yolo = YOLOFaceDetector(yolo_path, conf_threshold=0.5)

        # 3. Init Recognition
        self.model = FaceRecognitionModel().to(self.device)

        # Load weights an toàn
        checkpoint = torch.load(recog_path, map_location=self.device)
        if 'model' in checkpoint:
            self.model.load_state_dict(checkpoint['model'])
        else:
            # Fallback nếu file chỉ chứa weight không
            self.model.load_state_dict(checkpoint)

        self.model.eval()

        # 4. Transform (Giống hệt inference_transform của bạn)
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
            ),
        ])
        print("✅ System Ready!")

    def process_image(self, image_source, expand_ratio=0.0):
        # Load ảnh
        if isinstance(image_source, str):
            if not os.path.exists(image_source):
                raise FileNotFoundError(f"Image not found: {image_source}")
            img_pil = Image.open(image_source).convert('RGB')
        elif isinstance(image_source, Image.Image):
            img_pil = image_source.convert('RGB')
        elif isinstance(image_source, np.ndarray):
            img_pil = Image.fromarray(cv2.cvtColor(image_source, cv2.COLOR_BGR2RGB))
        else:
            raise ValueError("Input must be filepath, PIL Image, or Numpy Array")

        # 1. YOLO Detect & Crop
        face_crop = self.yolo.detect_extract_face(img_pil, expand_ratio=expand_ratio)

        # 2. Transform & Embedding
        img_tensor = self.transform(face_crop).unsqueeze(0).to(self.device)

        with torch.no_grad():
            embedding = self.model(img_tensor)

        return embedding

    def compare(self, img1, img2, threshold=0.45, expand_ratio=0.01):
        """
        So sánh 2 ảnh.
        expand_ratio=0.01 giống code demo của bạn.
        """
        emb1 = self.process_image(img1, expand_ratio)
        emb2 = self.process_image(img2, expand_ratio)

        # Cosine Similarity
        similarity = F.cosine_similarity(emb1, emb2).item()
        is_same = similarity > threshold

        return similarity, is_same