File size: 3,105 Bytes
72659d5 b62a8d0 72659d5 688ac07 72659d5 688ac07 72659d5 fff5e6a 72659d5 688ac07 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
{
"model_name": "SPLADE-PT-BR",
"version": "1.0.0",
"description": "SPLADE sparse retrieval model trained for Brazilian Portuguese",
"author": "AxelPCG",
"release_date": "2025-12-01",
"base_model": {
"name": "neuralmind/bert-base-portuguese-cased",
"type": "BERTimbau",
"language": "Portuguese (Brazilian)",
"vocab_size": 29794
},
"training": {
"training_dataset": "mMARCO Portuguese (unicamp-dl/mmarco)",
"validation_dataset": "mRobust (unicamp-dl/mrobust)",
"num_iterations": 150000,
"final_loss": 4.7e-05,
"batch_size": 8,
"effective_batch_size": 32,
"gradient_accumulation_steps": 4,
"learning_rate": 2e-05,
"weight_decay": 0.01,
"warmup_steps": 6000,
"max_length": 256,
"fp16": true,
"optimizer": "AdamW",
"scheduler": "linear_with_warmup",
"regularization": {
"type": "FLOPS",
"lambda_q": 0.0003,
"lambda_d": 0.0001,
"T": 50000
}
},
"model_specs": {
"architecture": "SPLADE",
"aggregation": "max",
"output_dim": 29794,
"expected_sparsity": 0.99,
"avg_active_dims_query": 120,
"avg_active_dims_doc": 150
},
"performance": {
"dataset": "mRobust (TREC Robust04 Portuguese)",
"num_documents": 528032,
"num_queries": 250,
"metrics": {
"MRR@10": 0.453,
"evaluation_date": "2025-12-02"
},
"comparison": {
"splade_en_mrr10": 0.383,
"improvement": "+18.3%"
}
},
"usage": {
"primary_use_case": "Sparse vector retrieval for Portuguese RAG systems",
"recommended_for": [
"Question answering in Portuguese",
"Document retrieval with Qdrant",
"Hybrid search (sparse + dense)",
"Interpretable search results"
],
"integration": {
"qdrant": "Use with SparseVectorParams",
"elasticsearch": "Compatible with sparse_vector field type",
"custom": "Standard inverted index on non-zero dimensions"
}
},
"files": {
"checkpoint": "model_final_checkpoint.tar",
"config": "config.yaml",
"tokenizer": "neuralmind/bert-base-portuguese-cased",
"size_mb": 450
},
"huggingface": {
"repo_id": "AxelPCG/splade-pt-br",
"model_type": "bert",
"pipeline_tag": "feature-extraction",
"license": "apache-2.0"
},
"comparison_with_original": {
"original_model": "SPLADE++",
"original_language": "English",
"original_mrr10": 0.368,
"improvements_for_portuguese": [
"Native Portuguese vocabulary",
"Contextual expansion in Portuguese",
"Fewer subword splits for PT words",
"Better semantic understanding of Brazilian Portuguese"
]
},
"limitations": [
"Optimized for Brazilian Portuguese",
"Not tested on European Portuguese",
"May require domain adaptation for specialized fields",
"Max sequence length: 256 tokens"
],
"citation": {
"bibtex": "@misc{splade-pt-br-2025, author = {Axel Chepanski}, title = {SPLADE-PT-BR: Sparse Retrieval for Portuguese}, year = {2025}, publisher = {Hugging Face}, url = {https://huggingface.co/AxelPCG/splade-pt-br}}"
}
}