# ViCLIP - Vietnamese CLIP Text Encoder

This model is a Vietnamese adaptation of the CLIP text encoder, trained on Vietnamese data.

## Model Description

- Text encoder based on PhoBERT
- Projection head that aligns the text embeddings with the CLIP embedding space
- Optimized for Vietnamese text understanding

## Usage

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer


class PhoCLIPTextModel(nn.Module):
    """Text encoder followed by a projection head.

    The encoder is loaded with ``AutoModel.from_pretrained`` and the full set
    of weights (encoder + projection) is then restored from a checkpoint file
    downloaded from the Hugging Face Hub.

    Args:
        repo_id: Hub repository that holds both the encoder and the checkpoint.
        checkpoint_filename: Name of the checkpoint file inside ``repo_id``.

    Raises:
        RuntimeError: If the checkpoint has no usable ``text_proj.weight``
            entry, or if its keys do not match this module's parameters.
    """

    def __init__(self, repo_id="kienhoang123/ViCLIP", checkpoint_filename="model.pt"):
        super().__init__()
        # Load text encoder
        self.text_encoder = AutoModel.from_pretrained(repo_id)

        # Load the checkpoint. map_location="cpu" keeps a checkpoint that was
        # saved from a GPU loadable on a machine without CUDA.
        # NOTE: torch.load unpickles the file; only load checkpoints from a
        # repository you trust.
        checkpoint_path = hf_hub_download(repo_id=repo_id, filename=checkpoint_filename)
        state_dict = torch.load(checkpoint_path, map_location="cpu")

        # The projection layer has to exist before load_state_dict is called,
        # otherwise its keys are rejected as unexpected and forward() has no
        # self.text_proj to call.
        self.text_proj = self._build_text_projection(state_dict)
        self.load_state_dict(state_dict)

    @staticmethod
    def _build_text_projection(state_dict):
        """Create an ``nn.Linear`` sized from the checkpoint's ``text_proj`` weights.

        NOTE(review): the projection architecture is not shown in this file.
        This assumes a single linear layer stored as ``text_proj.weight``
        (and optionally ``text_proj.bias``) -- confirm against the training code.
        """
        weight = state_dict.get("text_proj.weight")
        if weight is None or weight.dim() != 2:
            raise RuntimeError(
                "Checkpoint does not contain a 2-D 'text_proj.weight' tensor; "
                "cannot build the text projection head."
            )
        # nn.Linear stores its weight as (out_features, in_features).
        out_features, in_features = weight.shape
        return nn.Linear(in_features, out_features, bias="text_proj.bias" in state_dict)

    def forward(self, input_ids, attention_mask=None):
        """Encode token ids and project the first-position hidden state.

        Args:
            input_ids: Token ids passed to the text encoder.
            attention_mask: Optional mask passed to the text encoder.

        Returns:
            The output of ``text_proj`` applied to ``last_hidden_state[:, 0, :]``.
        """
        # Get text embeddings
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # Keep only the hidden state at sequence position 0 for each item.
        text_cls = text_outputs.last_hidden_state[:, 0, :]
        return self.text_proj(text_cls)


# Build the model and switch it to evaluation mode
model = PhoCLIPTextModel()
model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("kienhoang123/ViCLIP")

# Tokenize the text, padded/truncated to a fixed length of 77 tokens
text = "This is an example Vietnamese text"
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding="max_length",
    max_length=77,
    truncation=True,
)

# Run the encoder without tracking gradients
with torch.no_grad():
    embedding = model(inputs["input_ids"], inputs["attention_mask"])

# L2-normalize along the last dimension
normalized_embedding = F.normalize(embedding, p=2, dim=-1)
```