Basar2004 committed on
Commit
af8602a
·
verified ·
1 Parent(s): 9b9d4ba

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - tr
4
+ license: apache-2.0
5
+ library_name: transformers
6
+ tags:
7
+ - sentence-embeddings
8
+ - sentence-similarity
9
+ - turkish
10
+ - contrastive-learning
11
+ pipeline_tag: sentence-similarity
12
+ ---
13
+
14
+ # Turkish Sentence Encoder
15
+
16
+ A Turkish sentence embedding model trained with contrastive learning (InfoNCE loss) on Turkish paraphrase pairs.
17
+
18
+ ## Model Description
19
+
20
+ This model encodes Turkish sentences into 512-dimensional dense vectors that can be used for:
21
+ - Semantic similarity
22
+ - Semantic search / retrieval
23
+ - Clustering
24
+ - Paraphrase detection
25
+
26
+ ## Usage
27
+
28
+ ### Using with custom code
29
+
30
+ ```python
31
+ import torch
32
+ from transformers import AutoModel, AutoTokenizer
33
+
34
+ # Load model
35
+ model = AutoModel.from_pretrained("Basar2004/turkish-sentence-encoder", trust_remote_code=True)
36
+ tokenizer = AutoTokenizer.from_pretrained("Basar2004/turkish-sentence-encoder")
37
+
38
+ # Encode sentences
39
+ sentences = ["Bugün hava çok güzel.", "Hava bugün oldukça hoş."]
40
+
41
+ inputs = tokenizer(sentences, padding=True, truncation=True, max_length=64, return_tensors="pt")
42
+ with torch.no_grad():
43
+ embeddings = model(**inputs)
44
+
45
+ # Compute similarity
46
+ from torch.nn.functional import cosine_similarity
47
+ similarity = cosine_similarity(embeddings[0].unsqueeze(0), embeddings[1].unsqueeze(0))
48
+ print(f"Similarity: {similarity.item():.4f}")
49
+ ```
50
+
51
+ ### Using with Sentence-Transformers (after installing custom wrapper)
52
+
53
+ ```python
54
+ from sentence_transformers import SentenceTransformer
55
+
56
+ model = SentenceTransformer("Basar2004/turkish-sentence-encoder")
57
+ embeddings = model.encode(["Merhaba dünya!", "Selam dünya!"])
58
+ ```
59
+
60
+ ## Evaluation Results
61
+
62
+ | Metric | Score |
63
+ |--------|-------|
64
+ | Spearman Correlation | 0.8488 |
65
+ | Pearson Correlation | 0.875 |
66
+ | Paraphrase Accuracy | 0.8333 |
67
+ | MRR | 0.95 |
68
+ | Recall@1 | 0.9 |
69
+ | Recall@5 | 1.0 |
70
+
71
+ ## Training Details
72
+
73
+ - **Training Data**: Turkish paraphrase pairs (200K pairs)
74
+ - **Loss Function**: InfoNCE (contrastive loss)
75
+ - **Temperature**: 0.05
76
+ - **Batch Size**: 32
77
+ - **Base Model**: Custom Transformer encoder pretrained with MLM on Turkish text
78
+
79
+ ## Architecture
80
+
81
+ - **Hidden Size**: 512
82
+ - **Layers**: 12
83
+ - **Attention Heads**: 8
84
+ - **Max Sequence Length**: 64
85
+ - **Vocab Size**: 32,000 (Unigram tokenizer)
86
+
87
+ ## Limitations
88
+
89
+ - Optimized for Turkish language only
90
+ - Max sequence length is 64 tokens
91
+ - Best suited for sentence-level (not document-level) embeddings
92
+
93
+ ## License
94
+
95
+ Apache 2.0
config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 32000,
3
+ "d_model": 512,
4
+ "max_len": 64,
5
+ "n_layers": 12,
6
+ "n_heads": 8,
7
+ "padding_idx": 0,
8
+ "dropout": 0.1,
9
+ "ffn_mult": 4,
10
+ "model_type": "turkish-sentence-encoder",
11
+ "architectures": [
12
+ "TurkishSentenceEncoder"
13
+ ]
14
+ }
modeling_turkish_encoder.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Turkish Sentence Encoder Model."""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch import Tensor
6
+ from typing import Optional
7
+ import torch.nn.functional as F
8
+
9
+
10
class InputEmbeddings(nn.Module):
    """Token + learned absolute positional embeddings, followed by dropout.

    Maps token ids of shape (batch, seq) to dense vectors of shape
    (batch, seq, d_model).
    """

    def __init__(self, vocab_size: int, d_model: int, max_len: int, padding_idx: int = 0, dropout: float = 0.1):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.dropout = nn.Dropout(dropout)
        self.d_model = d_model

    def forward(self, x: Tensor) -> Tensor:
        """Embed token ids and add position embeddings; apply dropout."""
        # Position indices 0..seq_len-1, with a leading batch axis for broadcasting.
        pos_ids = torch.arange(x.size(1), device=x.device).unsqueeze(0)
        embedded = self.token_embed(x) + self.pos_embed(pos_ids)
        return self.dropout(embedded)
23
+
24
+
25
class TransformerEncoderLayer(nn.Module):
    """Pre-LayerNorm Transformer encoder layer: self-attention + GELU FFN.

    Note: `layer_idx` and `n_layers` are accepted for caller compatibility
    (the stack passes them positionally) but are not used internally.
    """

    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.1, ffn_mult: int = 4, layer_idx: int = 0, n_layers: int = 1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn_fc1 = nn.Linear(d_model, d_model * ffn_mult)
        self.ffn_fc2 = nn.Linear(d_model * ffn_mult, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: Tensor, key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Apply both sub-layers with pre-norm residual connections."""
        # Self-attention sub-layer: normalize, attend, add residual.
        normed = self.ln1(x)
        attended, _ = self.attn(normed, normed, normed, key_padding_mask=key_padding_mask)
        x = x + self.dropout(attended)

        # Feed-forward sub-layer: normalize, expand/contract with GELU, add residual.
        hidden = self.dropout(F.gelu(self.ffn_fc1(self.ln2(x))))
        x = x + self.dropout(self.ffn_fc2(hidden))
        return x
43
+
44
+
45
class TransformerEncoder(nn.Module):
    """Stack of pre-norm Transformer encoder layers with a final LayerNorm."""

    def __init__(self, vocab_size: int, d_model: int, max_len: int, n_layers: int, n_heads: int,
                 padding_idx: int = 0, dropout: float = 0.1, ffn_mult: int = 4):
        super().__init__()
        self.emb = InputEmbeddings(vocab_size, d_model, max_len, padding_idx, dropout)
        self.layers = nn.ModuleList(
            TransformerEncoderLayer(d_model, n_heads, dropout, ffn_mult, idx, n_layers)
            for idx in range(n_layers)
        )
        self.final_ln = nn.LayerNorm(d_model)

    def forward(self, input_ids: Tensor, attention_mask: Optional[Tensor] = None) -> Tensor:
        """Encode `input_ids` into hidden states of shape (batch, seq, d_model).

        `attention_mask` follows the HF convention: 1 = real token, 0 = padding.
        """
        hidden = self.emb(input_ids)
        # nn.MultiheadAttention expects True at positions that must be IGNORED,
        # i.e. the inverse of the HF-style attention mask.
        pad_mask = None if attention_mask is None else attention_mask.eq(0)
        for layer in self.layers:
            hidden = layer(hidden, key_padding_mask=pad_mask)
        return self.final_ln(hidden)
64
+
65
+
66
class TurkishSentenceEncoder(nn.Module):
    """Turkish Sentence Encoder for generating sentence embeddings.

    Wraps a TransformerEncoder; token states are mean-pooled over the
    non-padding positions and L2-normalized, so cosine similarity between
    two embeddings reduces to a dot product.
    """

    def __init__(self, config=None):
        """
        Args:
            config: Optional dict of hyper-parameters (same keys as the
                repository's config.json). When None, the published
                model's defaults are used.
        """
        super().__init__()
        if config is None:
            config = {
                "vocab_size": 32000,
                "d_model": 512,
                "max_len": 64,
                "n_layers": 12,
                "n_heads": 8,
                "padding_idx": 0,
                "dropout": 0.1,
                "ffn_mult": 4,
            }

        self.config = config
        self.encoder = TransformerEncoder(
            vocab_size=config.get("vocab_size", 32000),
            d_model=config.get("d_model", 512),
            max_len=config.get("max_len", 64),
            n_layers=config.get("n_layers", 12),
            n_heads=config.get("n_heads", 8),
            padding_idx=config.get("padding_idx", 0),
            dropout=config.get("dropout", 0.1),
            ffn_mult=config.get("ffn_mult", 4),
        )
        # MLM head: never used in forward(); kept only so MLM-pretrained
        # state dicts load without dropping weights.
        self.mlm_head = nn.Linear(config.get("d_model", 512), config.get("vocab_size", 32000), bias=True)

    def forward(self, input_ids: Tensor, attention_mask: Optional[Tensor] = None, **kwargs) -> Tensor:
        """Return L2-normalized sentence embeddings of shape (batch, d_model).

        Args:
            input_ids: Token ids, shape (batch, seq).
            attention_mask: 1 for real tokens, 0 for padding (HF convention).
                When None, all positions are pooled.
            **kwargs: Ignored; accepted so tokenizer output dicts can be
                splatted directly into the call.
        """
        encoder_output = self.encoder(input_ids, attention_mask=attention_mask)

        # Mean-pool over valid tokens only; the clamp guards against
        # division by zero on an all-padding row.
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).expand(encoder_output.size()).float()
            summed = torch.sum(encoder_output * mask, dim=1)
            counted = torch.clamp(mask.sum(dim=1), min=1e-9)
            embeddings = summed / counted
        else:
            embeddings = torch.mean(encoder_output, dim=1)

        # Unit-normalize so downstream cosine similarity is a dot product.
        embeddings = F.normalize(embeddings, p=2, dim=1)

        return embeddings

    @classmethod
    def from_pretrained(cls, model_path: str, **kwargs):
        """Load model configuration and weights from a local directory.

        Args:
            model_path: Directory containing config.json and pytorch_model.bin.
            **kwargs: Ignored; accepted for API compatibility.

        Returns:
            A TurkishSentenceEncoder; weights stay randomly initialized if
            pytorch_model.bin is absent.
        """
        import json
        import os

        config_path = os.path.join(model_path, "config.json")
        if os.path.exists(config_path):
            with open(config_path, encoding="utf-8") as f:
                config = json.load(f)
        else:
            config = None

        model = cls(config)

        weights_path = os.path.join(model_path, "pytorch_model.bin")
        if os.path.exists(weights_path):
            # Security fix: weights_only=True restricts unpickling to tensor
            # data — a plain torch.load can execute arbitrary code embedded
            # in a malicious checkpoint file.
            state_dict = torch.load(weights_path, map_location="cpu", weights_only=True)
            # strict=False tolerates extra/missing keys (e.g. MLM-only heads
            # from the pretraining checkpoint).
            model.load_state_dict(state_dict, strict=False)

        return model
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c36fd404625d47509f4ceb9afcab572c8b855394bc1df1b886c47490079ac676
3
+ size 217160759
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "pad_token": "[PAD]",
3
+ "unk_token": "[UNK]",
4
+ "cls_token": "[CLS]",
5
+ "sep_token": "[SEP]",
6
+ "mask_token": "[MASK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "model_max_length": 64,
4
+ "padding_side": "right",
5
+ "truncation_side": "right",
6
+ "pad_token": "[PAD]",
7
+ "unk_token": "[UNK]",
8
+ "cls_token": "[CLS]",
9
+ "sep_token": "[SEP]",
10
+ "mask_token": "[MASK]"
11
+ }