# ViCLIP - Vietnamese CLIP Text Encoder

This model is a Vietnamese adaptation of the CLIP text encoder, trained on Vietnamese data.

## Model Description

- Text encoder based on PhoBERT
- Projection head to align with CLIP embedding space
- Optimized for Vietnamese text understanding

## Usage

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from transformers import AutoModel, AutoTokenizer

class PhoCLIPTextModel(nn.Module):
    """Text encoder plus projection head producing one embedding per input.

    The encoder is loaded from the ``kienhoang123/ViCLIP`` repository and the
    remaining weights from ``model.pt`` in the same repository.
    """

    def __init__(self):
        super().__init__()
        # Load text encoder
        self.text_encoder = AutoModel.from_pretrained("kienhoang123/ViCLIP")

        # Load the checkpoint onto the CPU so that it also works on machines
        # without a GPU; callers can move the module afterwards with `.to()`.
        # NOTE: torch.load unpickles the file -- only load checkpoints from a
        # source you trust.
        checkpoint_path = hf_hub_download(repo_id="kienhoang123/ViCLIP", filename="model.pt")
        state_dict = torch.load(checkpoint_path, map_location="cpu")

        # The projection head must exist before load_state_dict() runs,
        # otherwise its checkpoint entries are rejected as unexpected keys and
        # forward() has no `text_proj` to call.
        self.text_proj = self._build_text_proj(state_dict)
        self.load_state_dict(state_dict)

    @staticmethod
    def _build_text_proj(state_dict):
        """Create the projection layer sized from the checkpoint weights.

        NOTE(review): assumes the head is a single ``nn.Linear`` stored under
        ``text_proj.weight`` / ``text_proj.bias`` -- confirm against the
        training code.

        Args:
            state_dict: Mapping of parameter names to tensors.

        Returns:
            An ``nn.Linear`` whose shape matches ``text_proj.weight``; it has a
            bias only if ``text_proj.bias`` is present in ``state_dict``.

        Raises:
            KeyError: If ``text_proj.weight`` is missing or is not a 2-D tensor.
        """
        weight = state_dict.get("text_proj.weight")
        if weight is None or weight.dim() != 2:
            raise KeyError(
                "checkpoint has no 2-D 'text_proj.weight'; cannot build the projection head"
            )
        # nn.Linear stores its weight as (out_features, in_features).
        out_features, in_features = weight.shape
        return nn.Linear(in_features, out_features, bias="text_proj.bias" in state_dict)

    def forward(self, input_ids, attention_mask=None):
        """Encode token ids and project the first-token hidden state.

        Args:
            input_ids: Token ids passed to the text encoder.
            attention_mask: Optional mask passed to the text encoder.

        Returns:
            The output of ``text_proj`` applied to the hidden state at
            sequence position 0 (not normalized).
        """
        text_outputs = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
        )
        # Position 0 of the sequence is used as the sentence representation.
        text_cls = text_outputs.last_hidden_state[:, 0, :]
        return self.text_proj(text_cls)

# Tokenizer published in the same repository as the model
tokenizer = AutoTokenizer.from_pretrained("kienhoang123/ViCLIP")

# Build the model and switch it to inference mode
model = PhoCLIPTextModel()
model.eval()

# Tokenize the text, padding/truncating it to exactly 77 tokens
text = "This is an example Vietnamese text"
inputs = tokenizer(
    text,
    padding="max_length",
    truncation=True,
    max_length=77,
    return_tensors="pt",
)

# Compute the embedding without tracking gradients, then L2-normalize it
# along the last dimension
with torch.no_grad():
    embedding = model(inputs["input_ids"], inputs["attention_mask"])
    normalized_embedding = F.normalize(embedding, p=2, dim=-1)
```