Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

README.md +95 -0
config.json +14 -0
modeling_turkish_encoder.py +137 -0
pytorch_model.bin +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +11 -0

README.md ADDED Viewed

	@@ -0,0 +1,95 @@

+---
+language:
+- tr
+license: apache-2.0
+library_name: transformers
+tags:
+- sentence-embeddings
+- sentence-similarity
+- turkish
+- contrastive-learning
+pipeline_tag: sentence-similarity
+---
+# Turkish Sentence Encoder
+A Turkish sentence embedding model trained with contrastive learning (InfoNCE loss) on Turkish paraphrase pairs.
+## Model Description
+This model encodes Turkish sentences into 512-dimensional dense vectors that can be used for:
+- Semantic similarity
+- Semantic search / retrieval
+- Clustering
+- Paraphrase detection
+## Usage
+### Using with custom code
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+# Load model
+model = AutoModel.from_pretrained("Basar2004/turkish-sentence-encoder", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("Basar2004/turkish-sentence-encoder")
+# Encode sentences
+sentences = ["Bugün hava çok güzel.", "Hava bugün oldukça hoş."]
+inputs = tokenizer(sentences, padding=True, truncation=True, max_length=64, return_tensors="pt")
+with torch.no_grad():
+    embeddings = model(**inputs)
+# Compute similarity
+from torch.nn.functional import cosine_similarity
+similarity = cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0))
+print(f"Similarity: {similarity.item():.4f}")
+```
+### Using with Sentence-Transformers (after installing custom wrapper)
+```python
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer("Basar2004/turkish-sentence-encoder")
+embeddings = model.encode(["Merhaba dünya!", "Selam dünya!"])
+```
+## Evaluation Results
+| Metric | Score |
+|--------|-------|
+| Spearman Correlation | 0.8488 |
+| Pearson Correlation | 0.875 |
+| Paraphrase Accuracy | 0.8333 |
+| MRR | 0.95 |
+| Recall@1 | 0.9 |
+| Recall@5 | 1.0 |
+## Training Details
+- **Training Data**: Turkish paraphrase pairs (200K pairs)
+- **Loss Function**: InfoNCE (contrastive loss)
+- **Temperature**: 0.05
+- **Batch Size**: 32
+- **Base Model**: Custom Transformer encoder pretrained with MLM on Turkish text
+## Architecture
+- **Hidden Size**: 512
+- **Layers**: 12
+- **Attention Heads**: 8
+- **Max Sequence Length**: 64
+- **Vocab Size**: 32,000 (Unigram tokenizer)
+## Limitations
+- Optimized for Turkish language only
+- Max sequence length is 64 tokens
+- Best suited for sentence-level (not document-level) embeddings
+## License
+Apache 2.0

config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "vocab_size": 32000,
+  "d_model": 512,
+  "max_len": 64,
+  "n_layers": 12,
+  "n_heads": 8,
+  "padding_idx": 0,
+  "dropout": 0.1,
+  "ffn_mult": 4,
+  "model_type": "turkish-sentence-encoder",
+  "architectures": [
+    "TurkishSentenceEncoder"
+  ]
+}

modeling_turkish_encoder.py ADDED Viewed

	@@ -0,0 +1,137 @@

+"""Turkish Sentence Encoder Model."""
+import torch
+import torch.nn as nn
+from torch import Tensor
+from typing import Optional
+import torch.nn.functional as F
+class InputEmbeddings(nn.Module):
+    def __init__(self, vocab_size: int, d_model: int, max_len: int, padding_idx: int = 0, dropout: float = 0.1):
+        super().__init__()
+        self.token_embed = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
+        self.pos_embed = nn.Embedding(max_len, d_model)
+        self.dropout = nn.Dropout(dropout)
+        self.d_model = d_model
+    def forward(self, x: Tensor) -> Tensor:
+        seq_len = x.size(1)
+        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
+        x = self.token_embed(x) + self.pos_embed(positions)
+        return self.dropout(x)
+class TransformerEncoderLayer(nn.Module):
+    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1, ffn_mult: int = 4, layer_idx: int = 0, n_layers: int = 1):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(d_model)
+        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
+        self.ln2 = nn.LayerNorm(d_model)
+        self.ffn_fc1 = nn.Linear(d_model, d_model * ffn_mult)
+        self.ffn_fc2 = nn.Linear(d_model * ffn_mult, d_model)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x: Tensor, key_padding_mask: Optional[Tensor] = None) -> Tensor:
+        x_norm = self.ln1(x)
+        attn_out, _ = self.attn(x_norm, x_norm, x_norm, key_padding_mask=key_padding_mask)
+        x = x + self.dropout(attn_out)
+        x_norm = self.ln2(x)
+        ffn_out = self.ffn_fc2(self.dropout(F.gelu(self.ffn_fc1(x_norm))))
+        x = x + self.dropout(ffn_out)
+        return x
+class TransformerEncoder(nn.Module):
+    def __init__(self, vocab_size: int, d_model: int, max_len: int, n_layers: int, n_heads: int,
+                 padding_idx: int = 0, dropout: float = 0.1, ffn_mult: int = 4):
+        super().__init__()
+        self.emb = InputEmbeddings(vocab_size, d_model, max_len, padding_idx, dropout)
+        self.layers = nn.ModuleList([
+            TransformerEncoderLayer(d_model, n_heads, dropout, ffn_mult, i, n_layers)
+            for i in range(n_layers)
+        ])
+        self.final_ln = nn.LayerNorm(d_model)
+    def forward(self, input_ids: Tensor, attention_mask: Optional[Tensor] = None) -> Tensor:
+        x = self.emb(input_ids)
+        key_padding_mask = None
+        if attention_mask is not None:
+            key_padding_mask = (attention_mask == 0)
+        for layer in self.layers:
+            x = layer(x, key_padding_mask=key_padding_mask)
+        return self.final_ln(x)
+class TurkishSentenceEncoder(nn.Module):
+    """Turkish Sentence Encoder for generating sentence embeddings."""
+    def __init__(self, config=None):
+        super().__init__()
+        if config is None:
+            config = {
+                "vocab_size": 32000,
+                "d_model": 512,
+                "max_len": 64,
+                "n_layers": 12,
+                "n_heads": 8,
+                "padding_idx": 0,
+                "dropout": 0.1,
+                "ffn_mult": 4,
+            }
+        self.config = config
+        self.encoder = TransformerEncoder(
+            vocab_size=config.get("vocab_size", 32000),
+            d_model=config.get("d_model", 512),
+            max_len=config.get("max_len", 64),
+            n_layers=config.get("n_layers", 12),
+            n_heads=config.get("n_heads", 8),
+            padding_idx=config.get("padding_idx", 0),
+            dropout=config.get("dropout", 0.1),
+            ffn_mult=config.get("ffn_mult", 4),
+        )
+        # MLM head (for compatibility with pretrained weights)
+        self.mlm_head = nn.Linear(config.get("d_model", 512), config.get("vocab_size", 32000), bias=True)
+    def forward(self, input_ids: Tensor, attention_mask: Optional[Tensor] = None, **kwargs) -> Tensor:
+        """
+        Forward pass that returns sentence embeddings (mean pooled).
+        """
+        encoder_output = self.encoder(input_ids, attention_mask=attention_mask)
+        # Mean pooling
+        if attention_mask is not None:
+            mask = attention_mask.unsqueeze(-1).expand(encoder_output.size()).float()
+            summed = torch.sum(encoder_output * mask, dim=1)
+            counted = torch.clamp(mask.sum(dim=1), min=1e-9)
+            embeddings = summed / counted
+        else:
+            embeddings = torch.mean(encoder_output, dim=1)
+        # Normalize embeddings
+        embeddings = F.normalize(embeddings, p=2, dim=1)
+        return embeddings
+    @classmethod
+    def from_pretrained(cls, model_path: str, **kwargs):
+        """Load model from pretrained weights."""
+        import json
+        import os
+        config_path = os.path.join(model_path, "config.json")
+        if os.path.exists(config_path):
+            with open(config_path) as f:
+                config = json.load(f)
+        else:
+            config = None
+        model = cls(config)
+        weights_path = os.path.join(model_path, "pytorch_model.bin")
+        if os.path.exists(weights_path):
+            state_dict = torch.load(weights_path, map_location="cpu")
+            model.load_state_dict(state_dict, strict=False)
+        return model

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c36fd404625d47509f4ceb9afcab572c8b855394bc1df1b886c47490079ac676
+size 217160759

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "pad_token": "[PAD]",
+  "unk_token": "[UNK]",
+  "cls_token": "[CLS]",
+  "sep_token": "[SEP]",
+  "mask_token": "[MASK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "model_max_length": 64,
+  "padding_side": "right",
+  "truncation_side": "right",
+  "pad_token": "[PAD]",
+  "unk_token": "[UNK]",
+  "cls_token": "[CLS]",
+  "sep_token": "[SEP]",
+  "mask_token": "[MASK]"
+}