smenaaliaga committed on 28 days ago

Commit

d568351

verified ·

1 Parent(s): 84b3b75

Upload PIBot Joint BERT model package

Browse files

Files changed (17) hide show

README.md +164 -0
__init__.py +1 -0
added_tokens.json +3 -0
config.json +36 -0
labels/activity_label.txt +3 -0
labels/calc_mode_label.txt +4 -0
labels/investment_label.txt +3 -0
labels/region_label.txt +3 -0
labels/req_form_label.txt +3 -0
labels/slot_label.txt +15 -0
model.safetensors +3 -0
modeling_jointbert.py +138 -0
module.py +62 -0
special_tokens_map.json +51 -0
spm.model +3 -0
tokenizer_config.json +58 -0
training_args.bin +3 -0

README.md ADDED Viewed

	@@ -0,0 +1,164 @@

+---
+language: es
+tags:
+- intent-classification
+- slot-filling
+- joint-bert
+- spanish
+- economics
+- chile
+- multi-head
+license: mit
+base_model: microsoft/mdeberta-v3-base
+pipeline_tag: token-classification
+---
+# PIBot Joint BERT
+Modelo **Joint BERT multi-head** para clasificación de intención y slot filling,
+especializado en consultas sobre indicadores macroeconómicos del Banco Central de Chile.
+## Arquitectura
+| Componente | Detalle |
+|---|---|
+| Base | `microsoft/mdeberta-v3-base` |
+| Task | `pibimacecv3` |
+| Intent heads | 5 (`activity`, `calc_mode`, `investment`, `region`, `req_form`) |
+| Slot labels | 15 (BIO) |
+| Custom code | `modeling_jointbert.py`, `module.py` |
+### Intent Heads
+| Head | Clases | Valores |
+|---|---|---|
+| `activity` | 3 | `none`, `specific`, `general` |
+| `calc_mode` | 4 | `original`, `prev_period`, `yoy`, `contribution` |
+| `investment` | 3 | `none`, `specific`, `general` |
+| `region` | 3 | `none`, `specific`, `general` |
+| `req_form` | 3 | `latest`, `point`, `range` |
+### Slot Entities (BIO)
+Entidades extraídas: `activity`, `frequency`, `indicator`, `investment`, `period`, `region`, `seasonality`
+Esquema BIO completo: 15 etiquetas (`O`, `B-*`, `I-*`).
+## Uso
+### Instalación
+```bash
+pip install torch transformers
+```
+### Carga del Modelo
+```python
+import torch
+from transformers import AutoTokenizer, AutoConfig
+# Cargar tokenizer y config
+tokenizer = AutoTokenizer.from_pretrained("BCCh/pibert", trust_remote_code=True)
+config = AutoConfig.from_pretrained("BCCh/pibert", trust_remote_code=True)
+# Cargar labels desde el repo
+from huggingface_hub import hf_hub_download
+import os
+label_dir = os.path.dirname(hf_hub_download("BCCh/pibert", "labels/slot_label.txt"))
+# Leer intent y slot labels
+def read_labels(path):
+    with open(path) as f:
+        return [line.strip() for line in f if line.strip()]
+slot_labels = read_labels(os.path.join(label_dir, "slot_label.txt"))
+# Preparar intent_label_lst para cada head
+intent_label_lst = []
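+for head in ['activity', 'calc_mode', 'investment', 'region', 'req_form']:
+    # Cada archivo de etiquetas se descarga explícitamente: hf_hub_download solo trae el archivo solicitado
+    intent_label_lst.append(read_labels(hf_hub_download("BCCh/pibert", f"labels/{head}_label.txt")))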
+# Cargar modelo con custom code
+from transformers import AutoModelForTokenClassification
+from modeling_jointbert import JointBERT  # auto-cargado con trust_remote_code
+model = JointBERT.from_pretrained(
+    "BCCh/pibert",
+    config=config,
+    intent_label_lst=intent_label_lst,
+    slot_label_lst=slot_labels,
+    trust_remote_code=True,
+)
+model.eval()
+```
+### Predicción
+```python
+text = "cuál fue el imacec de agosto 2024"
+tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+with torch.no_grad():
+    outputs = model(**tokens)
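+    # outputs es una tupla (loss, logits, ...), donde logits =
+    # (calc_mode_logits, activity_logits, region_logits, investment_logits, req_form_logits, slot_logits)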
+```
+## Estructura del Paquete
+```
+model_package/
+├── config.json              # Configuración del encoder (mDeBERTa-v3) + task
+├── model.safetensors        # Pesos del modelo
+├── spm.model                # Modelo SentencePiece del tokenizer
+├── added_tokens.json
+├── tokenizer_config.json
+├── special_tokens_map.json
+├── training_args.bin        # Argumentos de entrenamiento
+├── modeling_jointbert.py    # Arquitectura JointBERT (custom)
+├── module.py                # Cabezas de clasificación (intents y slots)
+├── __init__.py
+├── README.md                # Este archivo
+└── labels/
+    ├── slot_label.txt
+    ├── activity_label.txt
+    ├── calc_mode_label.txt
+    ├── investment_label.txt
+    ├── region_label.txt
+    └── req_form_label.txt
+```
+## Datos de Entrenamiento
+Entrenado con datos de consultas sobre indicadores macroeconómicos chilenos:
+- **IMACEC** (Indicador Mensual de Actividad Económica)
+- **PIB** (Producto Interno Bruto)
+- Sectores económicos, frecuencias, períodos, regiones
+## Limitaciones
+- Especializado en consultas macroeconómicas del Banco Central de Chile
+- Mejor rendimiento en consultas cortas (< 50 tokens)
+- Requiere `trust_remote_code=True` por la arquitectura custom
+## Cita
+```bibtex
+@misc{pibot-jointbert,
+  author = {Banco Central de Chile},
+  title = {PIBot Joint BERT - Multi-head Intent + Slot Filling},
+  year = {2025},
+  publisher = {Hugging Face},
+  howpublished = {\url{https://huggingface.co/BCCh/pibert}}
+}
+```
+## Referencias
+- [BERT for Joint Intent Classification and Slot Filling](https://arxiv.org/abs/1902.10909)
+- [JointBERT implementation](https://github.com/monologg/JointBERT)
+- [BETO: Spanish BERT](https://github.com/dccuchile/beto)
+## Licencia
+MIT License

__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .modeling_jointbert import JointBERT

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "[MASK]": 250101
+}

config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "_name_or_path": "microsoft/mdeberta-v3-base",
+  "architectures": [
+    "JointBERT"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "finetuning_task": "pibimacecv3",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-07,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.0",
+  "type_vocab_size": 0,
+  "vocab_size": 251000
+}

labels/activity_label.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+none
+specific
+general

labels/calc_mode_label.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+original
+prev_period
+yoy
+contribution

labels/investment_label.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+none
+specific
+general

labels/region_label.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+none
+specific
+general

labels/req_form_label.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+latest
+point
+range

labels/slot_label.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+O
+B-ACTIVITY
+B-FREQUENCY
+B-INDICATOR
+B-INVESTMENT
+B-PERIOD
+B-REGION
+B-SEASONALITY
+I-ACTIVITY
+I-FREQUENCY
+I-INDICATOR
+I-INVESTMENT
+I-PERIOD
+I-REGION
+I-SEASONALITY

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9e8b66c4ad817ab5a3c0651f984607c39c29c4c017a43a13bb7508f37254246
+size 1112997152

modeling_jointbert.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel, AutoModel
+from .module import CalcModeClassifier, ActivityClassifier, RegionClassifier, InvestmentClassifier, ReqFormClassifier, SlotClassifier
+try:
+    from torchcrf import CRF
+except ImportError:
+    CRF = None
+class JointBERT(PreTrainedModel):
+    def __init__(self, config, args, calc_mode_label_lst, activity_label_lst, region_label_lst, investment_label_lst, req_form_label_lst, slot_label_lst):
+        super(JointBERT, self).__init__(config)
+        self.args = args
+        self.num_calc_mode_labels = len(calc_mode_label_lst)
+        self.num_activity_labels = len(activity_label_lst)
+        self.num_region_labels = len(region_label_lst)
+        self.num_investment_labels = len(investment_label_lst)
+        self.num_req_form_labels = len(req_form_label_lst)
+        self.num_slot_labels = len(slot_label_lst)
+        # Usar AutoModel para soportar cualquier encoder transformer
+        self.encoder = AutoModel.from_pretrained(args.model_name_or_path, config=config)
+        self.calc_mode_classifier = CalcModeClassifier(config.hidden_size, self.num_calc_mode_labels, args.dropout_rate)
+        self.activity_classifier = ActivityClassifier(config.hidden_size, self.num_activity_labels, args.dropout_rate)
+        self.region_classifier = RegionClassifier(config.hidden_size, self.num_region_labels, args.dropout_rate)
+        self.investment_classifier = InvestmentClassifier(config.hidden_size, self.num_investment_labels, args.dropout_rate)
+        self.req_form_classifier = ReqFormClassifier(config.hidden_size, self.num_req_form_labels, args.dropout_rate)
+        self.slot_classifier = SlotClassifier(config.hidden_size, self.num_slot_labels, args.dropout_rate)
+        if args.use_crf:
+            if CRF is None:
+                raise ImportError("torchcrf no está instalado. Instala con: pip install pytorch-crf o ejecuta sin --use_crf")
+            crf_init_errors = []
+            for init_fn in (
+                lambda: CRF(self.num_slot_labels, pad_idx=None, use_gpu=False),
+                lambda: CRF(self.num_slot_labels, batch_first=True),
+                lambda: CRF(num_tags=self.num_slot_labels, batch_first=True),
+                lambda: CRF(self.num_slot_labels),
+                lambda: CRF(num_tags=self.num_slot_labels),
+            ):
+                try:
+                    self.crf = init_fn()
+                    break
+                except TypeError as e:
+                    crf_init_errors.append(str(e))
+            else:
+                raise TypeError("No se pudo inicializar CRF con las firmas conocidas: " + " | ".join(crf_init_errors))
+    def forward(self, input_ids, attention_mask, token_type_ids=None,
+        calc_mode_label_ids=None, activity_label_ids=None, region_label_ids=None, investment_label_ids=None, req_form_label_ids=None, slot_labels_ids=None):
+        outputs = self.encoder(input_ids, attention_mask=attention_mask,
+                              token_type_ids=token_type_ids)  # sequence_output, pooled_output, (hidden_states), (attentions)
+        sequence_output = outputs[0]
+        pooled_output = getattr(outputs, "pooler_output", None)
+        if pooled_output is None:
+            if len(outputs) > 1 and outputs[1] is not None and getattr(outputs[1], "dim", lambda: 0)() == 2:
+                pooled_output = outputs[1]
+            else:
+                pooled_output = sequence_output[:, 0]
+        calc_mode_logits = self.calc_mode_classifier(pooled_output)
+        activity_logits = self.activity_classifier(pooled_output)
+        region_logits = self.region_classifier(pooled_output)
+        investment_logits = self.investment_classifier(pooled_output)
+        req_form_logits = self.req_form_classifier(pooled_output)
+        slot_logits = self.slot_classifier(sequence_output)
+        total_loss = 0
+        def _get_weight(head_name):
+            """Retorna class weights registrados como buffer, o None."""
+            buf_name = f"{head_name}_class_weights"
+            w = getattr(self, buf_name, None)
+            return w
+        # 1. Calc Mode CrossEntropy
+        if calc_mode_label_ids is not None:
+            calc_mode_loss_fct = nn.CrossEntropyLoss(weight=_get_weight('calc_mode'))
+            calc_mode_loss = calc_mode_loss_fct(calc_mode_logits.view(-1, self.num_calc_mode_labels), calc_mode_label_ids.view(-1))
+            total_loss += calc_mode_loss
+        # 2. Activity CrossEntropy
+        if activity_label_ids is not None:
+            activity_loss_fct = nn.CrossEntropyLoss(weight=_get_weight('activity'))
+            activity_loss = activity_loss_fct(activity_logits.view(-1, self.num_activity_labels), activity_label_ids.view(-1))
+            total_loss += activity_loss
+        # 3. Region CrossEntropy
+        if region_label_ids is not None:
+            region_loss_fct = nn.CrossEntropyLoss(weight=_get_weight('region'))
+            region_loss = region_loss_fct(region_logits.view(-1, self.num_region_labels), region_label_ids.view(-1))
+            total_loss += region_loss
+        # 4. Investment CrossEntropy
+        if investment_label_ids is not None:
+            investment_loss_fct = nn.CrossEntropyLoss(weight=_get_weight('investment'))
+            investment_loss = investment_loss_fct(investment_logits.view(-1, self.num_investment_labels), investment_label_ids.view(-1))
+            total_loss += investment_loss
+        # 5. Req Form CrossEntropy
+        if req_form_label_ids is not None:
+            req_form_loss_fct = nn.CrossEntropyLoss(weight=_get_weight('req_form'))
+            req_form_loss = req_form_loss_fct(req_form_logits.view(-1, self.num_req_form_labels), req_form_label_ids.view(-1))
+            total_loss += req_form_loss
+        # 6. Slot Softmax
+        if slot_labels_ids is not None and self.args.slot_loss_coef != 0:
+            if self.args.use_crf:
+                # CRF doesn't handle ignore_index (-100), so we replace it with PAD (0)
+                slot_labels_ids_crf = slot_labels_ids.clone()
+                slot_labels_ids_crf[slot_labels_ids_crf == self.args.ignore_index] = 0
+                if hasattr(self.crf, 'viterbi_decode'):
+                    # TorchCRF API: forward returns log-likelihood per batch item
+                    slot_loss = -self.crf(slot_logits, slot_labels_ids_crf, attention_mask.bool()).mean()
+                else:
+                    # pytorch-crf API
+                    slot_loss = self.crf(slot_logits, slot_labels_ids_crf, mask=attention_mask.bool(), reduction='mean')
+                    slot_loss = -1 * slot_loss  # negative log-likelihood
+            else:
+                slot_loss_fct = nn.CrossEntropyLoss(ignore_index=self.args.ignore_index)
+                # Only keep active parts of the loss
+                if attention_mask is not None:
+                    active_loss = attention_mask.view(-1) == 1
+                    active_logits = slot_logits.view(-1, self.num_slot_labels)[active_loss]
+                    active_labels = slot_labels_ids.view(-1)[active_loss]
+                    slot_loss = slot_loss_fct(active_logits, active_labels)
+                else:
+                    slot_loss = slot_loss_fct(slot_logits.view(-1, self.num_slot_labels), slot_labels_ids.view(-1))
+            total_loss += self.args.slot_loss_coef * slot_loss
+        outputs = ((calc_mode_logits, activity_logits, region_logits, investment_logits, req_form_logits, slot_logits),) + outputs[2:]  # add hidden states and attention if they are here
+        outputs = (total_loss,) + outputs
+        return outputs  # (loss), logits, (hidden_states), (attentions) # Logits is a tuple of all classifier logits

module.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import torch.nn as nn
+class CalcModeClassifier(nn.Module):
+    def __init__(self, input_dim, num_calc_mode_labels, dropout_rate=0.):
+        super(CalcModeClassifier, self).__init__()
+        self.dropout = nn.Dropout(dropout_rate)
+        self.linear = nn.Linear(input_dim, num_calc_mode_labels)
+    def forward(self, x):
+        x = self.dropout(x)
+        return self.linear(x)
+class ActivityClassifier(nn.Module):
+    def __init__(self, input_dim, num_activity_labels, dropout_rate=0.):
+        super(ActivityClassifier, self).__init__()
+        self.dropout = nn.Dropout(dropout_rate)
+        self.linear = nn.Linear(input_dim, num_activity_labels)
+    def forward(self, x):
+        x = self.dropout(x)
+        return self.linear(x)
+class RegionClassifier(nn.Module):
+    def __init__(self, input_dim, num_region_labels, dropout_rate=0.):
+        super(RegionClassifier, self).__init__()
+        self.dropout = nn.Dropout(dropout_rate)
+        self.linear = nn.Linear(input_dim, num_region_labels)
+    def forward(self, x):
+        x = self.dropout(x)
+        return self.linear(x)
+class InvestmentClassifier(nn.Module):
+    def __init__(self, input_dim, num_investment_labels, dropout_rate=0.):
+        super(InvestmentClassifier, self).__init__()
+        self.dropout = nn.Dropout(dropout_rate)
+        self.linear = nn.Linear(input_dim, num_investment_labels)
+    def forward(self, x):
+        x = self.dropout(x)
+        return self.linear(x)
+class ReqFormClassifier(nn.Module):
+    def __init__(self, input_dim, num_req_form_labels, dropout_rate=0.):
+        super(ReqFormClassifier, self).__init__()
+        self.dropout = nn.Dropout(dropout_rate)
+        self.linear = nn.Linear(input_dim, num_req_form_labels)
+    def forward(self, x):
+        x = self.dropout(x)
+        return self.linear(x)
+class SlotClassifier(nn.Module):
+    def __init__(self, input_dim, num_slot_labels, dropout_rate=0.):
+        super(SlotClassifier, self).__init__()
+        self.dropout = nn.Dropout(dropout_rate)
+        self.linear = nn.Linear(input_dim, num_slot_labels)
+    def forward(self, x):
+        x = self.dropout(x)
+        return self.linear(x)

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,51 @@

+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

spm.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13c8d666d62a7bc4ac8f040aab68e942c861f93303156cc28f5c7e885d86d6e3
+size 4305025

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250101": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2Tokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a815201ec693183d7090277dcbe0b6ccbfd934137cdd1bfefabe2ee4cff99fe
+size 2040