{
  "model_name": "SPLADE-PT-BR",
  "version": "1.0.0",
  "description": "SPLADE sparse retrieval model trained for Brazilian Portuguese",
  "author": "AxelPCG",
  "release_date": "2025-12-01",
  "base_model": {
    "name": "neuralmind/bert-base-portuguese-cased",
    "type": "BERTimbau",
    "language": "Portuguese (Brazilian)",
    "vocab_size": 29794
  },
  "training": {
    "training_dataset": "mMARCO Portuguese (unicamp-dl/mmarco)",
    "validation_dataset": "mRobust (unicamp-dl/mrobust)",
    "num_iterations": 150000,
    "final_loss": 4.7e-05,
    "batch_size": 8,
    "effective_batch_size": 32,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-05,
    "weight_decay": 0.01,
    "warmup_steps": 6000,
    "max_length": 256,
    "fp16": true,
    "optimizer": "AdamW",
    "scheduler": "linear_with_warmup",
    "regularization": {
      "type": "FLOPS",
      "lambda_q": 0.0003,
      "lambda_d": 0.0001,
      "T": 50000
    }
  },
  "model_specs": {
    "architecture": "SPLADE",
    "aggregation": "max",
    "output_dim": 29794,
    "expected_sparsity": 0.99,
    "avg_active_dims_query": 120,
    "avg_active_dims_doc": 150
  },
  "performance": {
    "dataset": "mRobust (TREC Robust04 Portuguese)",
    "num_documents": 528032,
    "num_queries": 250,
    "metrics": {
      "MRR@10": 0.453,
      "evaluation_date": "2025-12-02"
    },
    "comparison": {
      "splade_en_mrr10": 0.383,
      "improvement": "+18.3%"
    }
  },
  "usage": {
    "primary_use_case": "Sparse vector retrieval for Portuguese RAG systems",
    "recommended_for": [
      "Question answering in Portuguese",
      "Document retrieval with Qdrant",
      "Hybrid search (sparse + dense)",
      "Interpretable search results"
    ],
    "integration": {
      "qdrant": "Use with SparseVectorParams",
      "elasticsearch": "Compatible with sparse_vector field type",
      "custom": "Standard inverted index on non-zero dimensions"
    }
  },
  "files": {
    "checkpoint": "model_final_checkpoint.tar",
    "config": "config.yaml",
    "tokenizer": "neuralmind/bert-base-portuguese-cased",
    "size_mb": 450
  },
  "huggingface": {
    "repo_id": "AxelPCG/splade-pt-br",
    "model_type": "bert",
    "pipeline_tag": "feature-extraction",
    "license": "apache-2.0"
  },
  "comparison_with_original": {
    "original_model": "SPLADE++",
    "original_language": "English",
    "original_mrr10": 0.368,
    "improvements_for_portuguese": [
      "Native Portuguese vocabulary",
      "Contextual expansion in Portuguese",
      "Less subword fragmentation of Portuguese words",
      "Better semantic understanding of Brazilian Portuguese"
    ]
  },
  "limitations": [
    "Optimized for Brazilian Portuguese",
    "Not tested on European Portuguese",
    "May require domain adaptation for specialized fields",
    "Max sequence length: 256 tokens"
  ],
  "citation": {
    "bibtex": "@misc{splade-pt-br-2025, author = {Axel Chepanski}, title = {SPLADE-PT-BR: Sparse Retrieval for Portuguese}, year = {2025}, publisher = {Hugging Face}, url = {https://huggingface.co/AxelPCG/splade-pt-br}}"
  }
}