# ViCLIP - Vietnamese CLIP Text Encoder

This model is a Vietnamese adaptation of the CLIP text encoder, trained on Vietnamese data.

## Model Description

- Text encoder based on PhoBERT
- Projection head to align with CLIP embedding space
- Optimized for Vietnamese text understanding

## Usage

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer


class PhoCLIPTextModel(nn.Module):
    """Text encoder followed by a projection head, both loaded from the Hub."""

    def __init__(self):
        super().__init__()
        # Load text encoder
        self.text_encoder = AutoModel.from_pretrained("kienhoang123/ViCLIP")

        # Download the checkpoint. map_location="cpu" lets it load on machines
        # without a GPU. torch.load unpickles the file, so only use it on
        # checkpoints you trust.
        checkpoint_path = hf_hub_download(repo_id="kienhoang123/ViCLIP", filename="model.pt")
        state_dict = torch.load(checkpoint_path, map_location="cpu")

        # The projection head has to exist before load_state_dict() can fill it.
        # NOTE(review): assumes the head is a single nn.Linear saved as
        # "text_proj.weight" (and optionally "text_proj.bias") - confirm
        # against the training code.
        out_features, in_features = state_dict["text_proj.weight"].shape
        self.text_proj = nn.Linear(
            in_features, out_features, bias="text_proj.bias" in state_dict
        )

        self.load_state_dict(state_dict)

    def forward(self, input_ids, attention_mask=None):
        """Encode token ids and return the projected text embedding.

        Args:
            input_ids: Token ids produced by the tokenizer.
            attention_mask: Optional mask passed through to the text encoder.

        Returns:
            The output of the projection head applied to the hidden state of
            the first token of each sequence (not L2-normalized).
        """
        # Get text embeddings
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # Use the hidden state at the first token position as the sentence vector.
        text_cls = text_outputs.last_hidden_state[:, 0, :]
        text_proj = self.text_proj(text_cls)
        return text_proj


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("kienhoang123/ViCLIP")

# Encode text
text = "This is an example Vietnamese text"
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding="max_length",
    max_length=77,
    truncation=True,
)

model = PhoCLIPTextModel()
model.eval()

with torch.no_grad():
    embedding = model(inputs.input_ids, inputs.attention_mask)
    # L2-normalize so embeddings can be compared with cosine similarity.
    normalized_embedding = F.normalize(embedding, p=2, dim=-1)
```