import io
from typing import List, Optional, Union

import numpy as np
import torch
from PIL import Image
from transformers import AutoModel
class JinaClipEmbeddingService:
    """
    Jina CLIP v2 embedding service with Vietnamese language support.
    Uses AutoModel with trust_remote_code.
    """
    def __init__(self, model_path: str = "jinaai/jina-clip-v2"):
        """
        Initialize the Jina CLIP v2 model.

        Args:
            model_path: Local path or HuggingFace model name.
        """
        print(f"Loading Jina CLIP v2 model from {model_path}...")

        # Jina CLIP v2 ships custom modeling code, so trust_remote_code is required
        self.model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
        self.model.eval()

        # Run on GPU when available
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        print(f"✓ Loaded Jina CLIP v2 model on: {self.device}")

    def encode_text(
        self,
        text: Union[str, List[str]],
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode text into embedding vectors (Vietnamese is supported).

        Args:
            text: A single text or a list of texts.
            truncate_dim: Matryoshka dimension (64-1024; None keeps the full 1024).
            normalize: Whether to L2-normalize the embeddings.

        Returns:
            numpy array of embeddings, shape (batch, dim).
        """
        if isinstance(text, str):
            text = [text]

        with torch.no_grad():
            embeddings = self.model.encode_text(text, truncate_dim=truncate_dim)

        # The remote code may return either a tensor or a numpy array
        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()

        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

        return embeddings
    def encode_image(
        self,
        image: Union[Image.Image, bytes, str, List],
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode images into embedding vectors.

        Args:
            image: PIL Image, raw bytes, URL string, or a list of these.
            truncate_dim: Matryoshka dimension (64-1024; None keeps the full 1024).
            normalize: Whether to L2-normalize the embeddings.

        Returns:
            numpy array of embeddings, shape (batch, dim).
        """
        # Normalize the input to a list: decode raw bytes into PIL Images and
        # pass URLs/paths and PIL Images through unchanged. Wrapping single
        # inputs keeps the output 2-D so the normalization below works.
        if isinstance(image, bytes):
            image = [Image.open(io.BytesIO(image)).convert("RGB")]
        elif isinstance(image, list):
            processed_images = []
            for img in image:
                if isinstance(img, bytes):
                    processed_images.append(Image.open(io.BytesIO(img)).convert("RGB"))
                else:
                    # URL string, file path, or PIL Image
                    processed_images.append(img)
            image = processed_images
        else:
            # Single PIL Image, URL, or file path
            image = [image]

        with torch.no_grad():
            embeddings = self.model.encode_image(image, truncate_dim=truncate_dim)

        if isinstance(embeddings, torch.Tensor):
            embeddings = embeddings.detach().cpu().numpy()

        if normalize:
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

        return embeddings
    def encode_multimodal(
        self,
        text: Union[str, List[str]] = None,
        image: Union[Image.Image, bytes, List] = None,
        truncate_dim: Optional[int] = None,
        normalize: bool = True,
    ) -> np.ndarray:
        """
        Encode text and image together and return a combined embedding
        (the element-wise mean of the two modalities).

        Args:
            text: A single text or a list of texts.
            image: PIL Image, bytes, or a list of images. When both text and
                image are given, their batch sizes must match.
            truncate_dim: Matryoshka dimension (64-1024; None keeps the full 1024).
            normalize: Whether to L2-normalize the combined embeddings.

        Returns:
            numpy array of embeddings, shape (batch, dim).
        """
        embeddings = []

        if text is not None:
            text_emb = self.encode_text(text, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(text_emb)

        if image is not None:
            image_emb = self.encode_image(image, truncate_dim=truncate_dim, normalize=False)
            embeddings.append(image_emb)

        if len(embeddings) == 2:
            # Average the text and image embeddings element-wise
            combined = np.mean(embeddings, axis=0)
        elif len(embeddings) == 1:
            combined = embeddings[0]
        else:
            raise ValueError("At least one of text or image must be provided")

        if normalize:
            combined = combined / np.linalg.norm(combined, axis=1, keepdims=True)

        return combined
    def get_embedding_dimension(self) -> int:
        """
        Return the full embedding dimension (1024 for Jina CLIP v2).
        Truncated (Matryoshka) embeddings can be smaller; see truncate_dim.
        """
        return 1024
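# --- Usage sketch (illustrative, not part of the service). Assumes network
# access to download the "jinaai/jina-clip-v2" weights; the sample texts are
# made-up queries, and the image is a synthetic canvas so nothing is fetched.
if __name__ == "__main__":
    service = JinaClipEmbeddingService()

    # Vietnamese and English map into the same embedding space
    texts = ["Một con mèo đang ngủ trên ghế sofa", "A cat sleeping on a sofa"]
    text_emb = service.encode_text(texts)  # shape (2, 1024), L2-normalized
    print("text-text cosine:", float(text_emb[0] @ text_emb[1]))

    # Matryoshka truncation: same call, shorter vectors
    short_emb = service.encode_text(texts, truncate_dim=128)
    print("truncated shape:", short_emb.shape)  # (2, 128)

    # Exercise the image path with a synthetic RGB canvas
    canvas = Image.new("RGB", (224, 224), color=(120, 80, 40))
    image_emb = service.encode_image(canvas)
    print("text-image cosine:", float(text_emb[0] @ image_emb[0]))

    # Combined text+image embedding (mean of the two, then normalized)
    multi_emb = service.encode_multimodal(text=texts[0], image=canvas)
    print("multimodal shape:", multi_emb.shape)  # (1, 1024)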