AxelPCG committed on
Commit
688ac07
·
verified ·
1 Parent(s): 61eb3c1

Upload SPLADE-PT-BR model v1.0.0

Browse files
README.md CHANGED
@@ -77,14 +77,16 @@ pip install torch transformers
77
 
78
  ### Basic Usage
79
 
 
 
80
  ```python
81
  import torch
82
  from transformers import AutoTokenizer
83
- from splade.models.transformer_rep import Splade
84
 
85
  # Load model and tokenizer
86
  model = Splade.from_pretrained("AxelPCG/splade-pt-br")
87
- tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-base-portuguese-cased")
88
  model.eval()
89
 
90
  # Encode a query
@@ -109,6 +111,17 @@ values = query_vec[indices].tolist()
109
  print(f"Active dimensions: {len(indices)} / {query_vec.shape[0]}")
110
  ```
111
 
 
 
 
 
 
 
 
 
 
 
 
112
  ## Limitations and Bias
113
 
114
  - Model trained on machine-translated Portuguese data (mMARCO)
 
77
 
78
  ### Basic Usage
79
 
80
+ **Option 1: Using HuggingFace Hub (Recommended)**
81
+
82
  ```python
83
  import torch
84
  from transformers import AutoTokenizer
85
+ from modeling_splade import Splade
86
 
87
  # Load model and tokenizer
88
  model = Splade.from_pretrained("AxelPCG/splade-pt-br")
89
+ tokenizer = AutoTokenizer.from_pretrained("AxelPCG/splade-pt-br")
90
  model.eval()
91
 
92
  # Encode a query
 
111
  print(f"Active dimensions: {len(indices)} / {query_vec.shape[0]}")
112
  ```
113
 
114
+ **Option 2: Using SPLADE Library**
115
+
116
+ ```python
117
+ from splade.models.transformer_rep import Splade
118
+ from transformers import AutoTokenizer
119
+
120
+ # Load model by pointing to HuggingFace repo
121
+ model = Splade(model_type_or_dir="AxelPCG/splade-pt-br", agg="max", fp16=True)
122
+ tokenizer = AutoTokenizer.from_pretrained("AxelPCG/splade-pt-br")
123
+ ```
124
+
125
  ## Limitations and Bias
126
 
127
  - Model trained on machine-translated Portuguese data (mMARCO)
config.json CHANGED
@@ -4,6 +4,7 @@
4
  ],
5
  "model_type": "splade",
6
  "base_model": "neuralmind/bert-base-portuguese-cased",
 
7
  "vocab_size": 29794,
8
  "hidden_size": 768,
9
  "num_hidden_layers": 12,
@@ -16,6 +17,12 @@
16
  "type_vocab_size": 2,
17
  "initializer_range": 0.02,
18
  "layer_norm_eps": 1e-12,
 
 
 
 
19
  "aggregation": "max",
20
- "fp16": true
 
 
21
  }
 
4
  ],
5
  "model_type": "splade",
6
  "base_model": "neuralmind/bert-base-portuguese-cased",
7
+ "model_type_or_dir": "neuralmind/bert-base-portuguese-cased",
8
  "vocab_size": 29794,
9
  "hidden_size": 768,
10
  "num_hidden_layers": 12,
 
17
  "type_vocab_size": 2,
18
  "initializer_range": 0.02,
19
  "layer_norm_eps": 1e-12,
20
+ "pad_token_id": 0,
21
+ "position_embedding_type": "absolute",
22
+ "use_cache": true,
23
+ "classifier_dropout": null,
24
  "aggregation": "max",
25
+ "output": "MLM",
26
+ "fp16": true,
27
+ "agg": "max"
28
  }
model_metadata.json CHANGED
@@ -4,19 +4,17 @@
4
  "description": "SPLADE sparse retrieval model trained for Brazilian Portuguese",
5
  "author": "AxelPCG",
6
  "release_date": "2025-12-01",
7
-
8
  "base_model": {
9
  "name": "neuralmind/bert-base-portuguese-cased",
10
  "type": "BERTimbau",
11
  "language": "Portuguese (Brazilian)",
12
  "vocab_size": 29794
13
  },
14
-
15
  "training": {
16
  "training_dataset": "mMARCO Portuguese (unicamp-dl/mmarco)",
17
  "validation_dataset": "mRobust (unicamp-dl/mrobust)",
18
  "num_iterations": 150000,
19
- "final_loss": 0.000047,
20
  "batch_size": 8,
21
  "effective_batch_size": 32,
22
  "gradient_accumulation_steps": 4,
@@ -27,7 +25,6 @@
27
  "fp16": true,
28
  "optimizer": "AdamW",
29
  "scheduler": "linear_with_warmup",
30
-
31
  "regularization": {
32
  "type": "FLOPS",
33
  "lambda_q": 0.0003,
@@ -35,7 +32,6 @@
35
  "T": 50000
36
  }
37
  },
38
-
39
  "model_specs": {
40
  "architecture": "SPLADE",
41
  "aggregation": "max",
@@ -44,16 +40,19 @@
44
  "avg_active_dims_query": 120,
45
  "avg_active_dims_doc": 150
46
  },
47
-
48
  "performance": {
49
- "note": "Metrics will be updated after complete evaluation",
50
- "expected": {
51
- "MRR@10": "0.25-0.35",
52
- "Recall@100": "0.85-0.95",
53
- "Recall@1000": "0.95-0.99"
 
 
 
 
 
54
  }
55
  },
56
-
57
  "usage": {
58
  "primary_use_case": "Sparse vector retrieval for Portuguese RAG systems",
59
  "recommended_for": [
@@ -68,21 +67,18 @@
68
  "custom": "Standard inverted index on non-zero dimensions"
69
  }
70
  },
71
-
72
  "files": {
73
  "checkpoint": "model_final_checkpoint.tar",
74
  "config": "config.yaml",
75
  "tokenizer": "neuralmind/bert-base-portuguese-cased",
76
  "size_mb": 450
77
  },
78
-
79
  "huggingface": {
80
  "repo_id": "AxelPCG/splade-pt-br",
81
  "model_type": "splade",
82
  "pipeline_tag": "feature-extraction",
83
  "license": "apache-2.0"
84
  },
85
-
86
  "comparison_with_original": {
87
  "original_model": "SPLADE++",
88
  "original_language": "English",
@@ -94,16 +90,13 @@
94
  "Better semantic understanding of Brazilian Portuguese"
95
  ]
96
  },
97
-
98
  "limitations": [
99
  "Optimized for Brazilian Portuguese",
100
  "Not tested on European Portuguese",
101
  "May require domain adaptation for specialized fields",
102
  "Max sequence length: 256 tokens"
103
  ],
104
-
105
  "citation": {
106
  "bibtex": "@misc{splade-pt-br-2025, author = {Axel Chepanski}, title = {SPLADE-PT-BR: Sparse Retrieval for Portuguese}, year = {2025}, publisher = {Hugging Face}, url = {https://huggingface.co/AxelPCG/splade-pt-br}}"
107
  }
108
- }
109
-
 
4
  "description": "SPLADE sparse retrieval model trained for Brazilian Portuguese",
5
  "author": "AxelPCG",
6
  "release_date": "2025-12-01",
 
7
  "base_model": {
8
  "name": "neuralmind/bert-base-portuguese-cased",
9
  "type": "BERTimbau",
10
  "language": "Portuguese (Brazilian)",
11
  "vocab_size": 29794
12
  },
 
13
  "training": {
14
  "training_dataset": "mMARCO Portuguese (unicamp-dl/mmarco)",
15
  "validation_dataset": "mRobust (unicamp-dl/mrobust)",
16
  "num_iterations": 150000,
17
+ "final_loss": 4.7e-05,
18
  "batch_size": 8,
19
  "effective_batch_size": 32,
20
  "gradient_accumulation_steps": 4,
 
25
  "fp16": true,
26
  "optimizer": "AdamW",
27
  "scheduler": "linear_with_warmup",
 
28
  "regularization": {
29
  "type": "FLOPS",
30
  "lambda_q": 0.0003,
 
32
  "T": 50000
33
  }
34
  },
 
35
  "model_specs": {
36
  "architecture": "SPLADE",
37
  "aggregation": "max",
 
40
  "avg_active_dims_query": 120,
41
  "avg_active_dims_doc": 150
42
  },
 
43
  "performance": {
44
+ "dataset": "mRobust (TREC Robust04 Portuguese)",
45
+ "num_documents": 528032,
46
+ "num_queries": 250,
47
+ "metrics": {
48
+ "MRR@10": 0.453,
49
+ "evaluation_date": "2025-12-02"
50
+ },
51
+ "comparison": {
52
+ "splade_en_mrr10": 0.383,
53
+ "improvement": "+18.3%"
54
  }
55
  },
 
56
  "usage": {
57
  "primary_use_case": "Sparse vector retrieval for Portuguese RAG systems",
58
  "recommended_for": [
 
67
  "custom": "Standard inverted index on non-zero dimensions"
68
  }
69
  },
 
70
  "files": {
71
  "checkpoint": "model_final_checkpoint.tar",
72
  "config": "config.yaml",
73
  "tokenizer": "neuralmind/bert-base-portuguese-cased",
74
  "size_mb": 450
75
  },
 
76
  "huggingface": {
77
  "repo_id": "AxelPCG/splade-pt-br",
78
  "model_type": "splade",
79
  "pipeline_tag": "feature-extraction",
80
  "license": "apache-2.0"
81
  },
 
82
  "comparison_with_original": {
83
  "original_model": "SPLADE++",
84
  "original_language": "English",
 
90
  "Better semantic understanding of Brazilian Portuguese"
91
  ]
92
  },
 
93
  "limitations": [
94
  "Optimized for Brazilian Portuguese",
95
  "Not tested on European Portuguese",
96
  "May require domain adaptation for specialized fields",
97
  "Max sequence length: 256 tokens"
98
  ],
 
99
  "citation": {
100
  "bibtex": "@misc{splade-pt-br-2025, author = {Axel Chepanski}, title = {SPLADE-PT-BR: Sparse Retrieval for Portuguese}, year = {2025}, publisher = {Hugging Face}, url = {https://huggingface.co/AxelPCG/splade-pt-br}}"
101
  }
102
+ }
 
modeling_splade.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SPLADE Model for HuggingFace Hub
3
+ Adapted from: https://github.com/naver/splade
4
+ """
5
+
6
+ import torch
7
+ from transformers import AutoModelForMaskedLM, PreTrainedModel, PretrainedConfig
8
+ from transformers.modeling_outputs import BaseModelOutput
9
+
10
+
11
class SpladeConfig(PretrainedConfig):
    """HuggingFace configuration for the SPLADE sparse-retrieval model.

    Attributes:
        base_model: Hub id of the underlying BERT MLM checkpoint.
        aggregation: Pooling mode over the sequence axis ("max" or "sum").
        fp16: Whether half precision was used at training time.
    """

    model_type = "splade"

    def __init__(
        self,
        base_model="neuralmind/bert-base-portuguese-cased",
        aggregation="max",
        fp16=True,
        **kwargs,
    ):
        # Forward any extra HF config fields (pad_token_id, etc.) upstream.
        super().__init__(**kwargs)
        self.base_model = base_model
        self.aggregation = aggregation
        self.fp16 = fp16
26
+
27
+
28
class Splade(PreTrainedModel):
    """SPLADE model for sparse retrieval.

    Sparse, vocabulary-sized representations are produced by:
      1. running a masked-language-model head to get per-token logits,
      2. applying log(1 + ReLU(logits)),
      3. pooling over the sequence length (max by default).

    Usage:
        from transformers import AutoTokenizer
        from modeling_splade import Splade

        model = Splade.from_pretrained("AxelPCG/splade-pt-br")
        tokenizer = AutoTokenizer.from_pretrained("AxelPCG/splade-pt-br")

        # Encode query
        query_tokens = tokenizer("Qual é a capital do Brasil?", return_tensors="pt")
        with torch.no_grad():
            query_vec = model(q_kwargs=query_tokens)["q_rep"]
    """

    config_class = SpladeConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # Underlying BERT encoder with its MLM head; defaults match SpladeConfig
        # so the model also loads from configs missing these fields.
        backbone = getattr(config, "base_model", "neuralmind/bert-base-portuguese-cased")
        self.transformer = AutoModelForMaskedLM.from_pretrained(backbone)
        self.aggregation = getattr(config, "aggregation", "max")
        self.fp16 = getattr(config, "fp16", True)

    def encode(self, tokens):
        """Map tokenized text to a (batch, vocab_size) sparse representation.

        Args:
            tokens: dict with at least "input_ids" and "attention_mask".

        Returns:
            Tensor of shape (batch, vocab_size) with non-negative entries.
        """
        # MLM logits over the vocabulary: (bs, seq_len, vocab_size).
        logits = self.transformer(**tokens).logits

        # SPLADE activation: log(1 + ReLU(x)) keeps values sparse and non-negative.
        activated = torch.log1p(torch.relu(logits))

        # Zero out padding positions before pooling.
        mask = tokens["attention_mask"].unsqueeze(-1)
        weighted = activated * mask

        # Pool over the sequence axis ("max" or anything else -> sum).
        if self.aggregation == "max":
            return weighted.max(dim=1).values
        return weighted.sum(dim=1)

    def forward(self, q_kwargs=None, d_kwargs=None, **kwargs):
        """Encode queries and/or documents into sparse vectors.

        Args:
            q_kwargs: Query tokens (dict with input_ids, attention_mask).
            d_kwargs: Document tokens (dict with input_ids, attention_mask).
            **kwargs: Raw tokenizer output, used only when neither q_kwargs
                nor d_kwargs is given (for compatibility).

        Returns:
            dict with "q_rep" and/or "d_rep" (or "rep" for the raw-kwargs
            fallback) mapping to sparse vectors.
        """
        reps = {}

        if q_kwargs is not None:
            reps["q_rep"] = self.encode(q_kwargs)
        if d_kwargs is not None:
            reps["d_rep"] = self.encode(d_kwargs)

        # Fallback: treat the bare kwargs as a single tokenized batch.
        if not reps and kwargs:
            reps["rep"] = self.encode(kwargs)

        return reps
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc862991df523373e5698b3341dd0a99245cd3590a345c2173170bc44b7cb6f0
3
- size 1307742766
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fbf5a3a20f6e1d1ae82d2189d6f754434fca7e1b351ad76bbc3a90e6ab32587
3
+ size 435884875
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
tokenizer_config.json CHANGED
@@ -1,6 +1 @@
1
- {
2
- "tokenizer_class": "BertTokenizer",
3
- "do_lower_case": false,
4
- "model_max_length": 256,
5
- "tokenizer_type": "neuralmind/bert-base-portuguese-cased"
6
- }
 
1
+ {"do_lower_case": false, "init_inputs": []}
 
 
 
 
 
vocab.txt ADDED
The diff for this file is too large to render. See raw diff