lekhnathrijal committed on
Commit
6fc4883
·
verified ·
1 Parent(s): 2888042

Upload folder using huggingface_hub

Browse files
Files changed (8) hide show
  1. config.json +287 -0
  2. model.onnx +3 -0
  3. ort_config.json +33 -0
  4. pipeline.py +103 -0
  5. special_tokens_map.json +37 -0
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +63 -0
  8. vocab.txt +0 -0
config.json ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "models/bert-onnx-classifier",
4
+ "architectures": [
5
+ "BertForSequenceClassification"
6
+ ],
7
+ "attention_probs_dropout_prob": 0.1,
8
+ "classifier_dropout": null,
9
+ "custom_pipelines": {
10
+ "question-classifier": {
11
+ "impl": "pipeline.MultiTaskClassifierPipeline",
12
+ "pt": [
13
+ "AutoModelForSequenceClassification"
14
+ ]
15
+ }
16
+ },
17
+ "gradient_checkpointing": false,
18
+ "hidden_act": "gelu",
19
+ "hidden_dropout_prob": 0.1,
20
+ "hidden_size": 1024,
21
+ "id2label": {
22
+ "0": "type_d",
23
+ "1": "type_y",
24
+ "2": "type_c",
25
+ "3": "type_o",
26
+ "4": "category_self",
27
+ "5": "category_health",
28
+ "6": "category_accumulated_wealth",
29
+ "7": "category_family",
30
+ "8": "category_social_media",
31
+ "9": "category_short_travel",
32
+ "10": "category_sports",
33
+ "11": "category_property",
34
+ "12": "category_primary_education",
35
+ "13": "category_love",
36
+ "14": "category_romance",
37
+ "15": "category_children",
38
+ "16": "category_higher_education",
39
+ "17": "category_job",
40
+ "18": "category_diseases",
41
+ "19": "category_hard_times",
42
+ "20": "category_competitive_exam",
43
+ "21": "category_marriage",
44
+ "22": "category_business",
45
+ "23": "category_life_span",
46
+ "24": "category_unearned_wealth",
47
+ "25": "category_spirituality",
48
+ "26": "category_highest_education",
49
+ "27": "category_long_travel",
50
+ "28": "category_career",
51
+ "29": "category_income",
52
+ "30": "category_foreign",
53
+ "31": "category_expense",
54
+ "32": "time_based_y",
55
+ "33": "time_based_n",
56
+ "34": "perception_p",
57
+ "35": "perception_n"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "label2id": {
62
+ "category_accumulated_wealth": 6,
63
+ "category_business": 22,
64
+ "category_career": 28,
65
+ "category_children": 15,
66
+ "category_competitive_exam": 20,
67
+ "category_diseases": 18,
68
+ "category_expense": 31,
69
+ "category_family": 7,
70
+ "category_foreign": 30,
71
+ "category_hard_times": 19,
72
+ "category_health": 5,
73
+ "category_higher_education": 16,
74
+ "category_highest_education": 26,
75
+ "category_income": 29,
76
+ "category_job": 17,
77
+ "category_life_span": 23,
78
+ "category_long_travel": 27,
79
+ "category_love": 13,
80
+ "category_marriage": 21,
81
+ "category_primary_education": 12,
82
+ "category_property": 11,
83
+ "category_romance": 14,
84
+ "category_self": 4,
85
+ "category_short_travel": 9,
86
+ "category_social_media": 8,
87
+ "category_spirituality": 25,
88
+ "category_sports": 10,
89
+ "category_unearned_wealth": 24,
90
+ "perception_n": 35,
91
+ "perception_p": 34,
92
+ "time_based_n": 33,
93
+ "time_based_y": 32,
94
+ "type_c": 2,
95
+ "type_d": 0,
96
+ "type_o": 3,
97
+ "type_y": 1
98
+ },
99
+ "label_config": {
100
+ "multi_class": [
101
+ {
102
+ "column": "question type",
103
+ "labels": [
104
+ [
105
+ "d",
106
+ "Descriptive"
107
+ ],
108
+ [
109
+ "y",
110
+ "Yes/No"
111
+ ],
112
+ [
113
+ "c",
114
+ "Complex"
115
+ ],
116
+ [
117
+ "o",
118
+ "Options"
119
+ ]
120
+ ],
121
+ "loss_weight": 1,
122
+ "name": "type"
123
+ },
124
+ {
125
+ "column": "category",
126
+ "labels": [
127
+ [
128
+ "self",
129
+ "Self"
130
+ ],
131
+ [
132
+ "health",
133
+ "Health"
134
+ ],
135
+ [
136
+ "accumulated_wealth",
137
+ "Accumulated Wealth"
138
+ ],
139
+ [
140
+ "family",
141
+ "Family"
142
+ ],
143
+ [
144
+ "social_media",
145
+ "Social media"
146
+ ],
147
+ [
148
+ "short_travel",
149
+ "Short Travel"
150
+ ],
151
+ [
152
+ "sports",
153
+ "Sports"
154
+ ],
155
+ [
156
+ "property",
157
+ "Property"
158
+ ],
159
+ [
160
+ "primary_education",
161
+ "Primary Education"
162
+ ],
163
+ [
164
+ "love",
165
+ "Love"
166
+ ],
167
+ [
168
+ "romance",
169
+ "Romance"
170
+ ],
171
+ [
172
+ "children",
173
+ "Children"
174
+ ],
175
+ [
176
+ "higher_education",
177
+ "Higher Education"
178
+ ],
179
+ [
180
+ "job",
181
+ "Job"
182
+ ],
183
+ [
184
+ "diseases",
185
+ "Diseases"
186
+ ],
187
+ [
188
+ "hard_times",
189
+ "Hard Times"
190
+ ],
191
+ [
192
+ "competitive_exam",
193
+ "Competitive Exam"
194
+ ],
195
+ [
196
+ "marriage",
197
+ "Marriage"
198
+ ],
199
+ [
200
+ "business",
201
+ "Business"
202
+ ],
203
+ [
204
+ "life_span",
205
+ "Life Span"
206
+ ],
207
+ [
208
+ "unearned_wealth",
209
+ "Unearned Wealth"
210
+ ],
211
+ [
212
+ "spirituality",
213
+ "Spirituality"
214
+ ],
215
+ [
216
+ "highest_education",
217
+ "Highest Education"
218
+ ],
219
+ [
220
+ "long_travel",
221
+ "Long Travel"
222
+ ],
223
+ [
224
+ "career",
225
+ "Career"
226
+ ],
227
+ [
228
+ "income",
229
+ "Income"
230
+ ],
231
+ [
232
+ "foreign",
233
+ "Foreign"
234
+ ],
235
+ [
236
+ "expense",
237
+ "Expense"
238
+ ]
239
+ ],
240
+ "loss_weight": 1,
241
+ "name": "category"
242
+ },
243
+ {
244
+ "column": "time based",
245
+ "labels": [
246
+ [
247
+ "y",
248
+ "Time Based"
249
+ ],
250
+ [
251
+ "n",
252
+ "Non Time Based"
253
+ ]
254
+ ],
255
+ "loss_weight": 1,
256
+ "name": "time_based"
257
+ },
258
+ {
259
+ "column": "perception",
260
+ "labels": [
261
+ [
262
+ "p",
263
+ "Positive Perception"
264
+ ],
265
+ [
266
+ "n",
267
+ "Negative Perception"
268
+ ]
269
+ ],
270
+ "loss_weight": 1,
271
+ "name": "perception"
272
+ }
273
+ ]
274
+ },
275
+ "layer_norm_eps": 1e-12,
276
+ "max_position_embeddings": 512,
277
+ "model_type": "bert",
278
+ "num_attention_heads": 16,
279
+ "num_hidden_layers": 24,
280
+ "pad_token_id": 0,
281
+ "pipeline_tag": "question-classifier",
282
+ "position_embedding_type": "absolute",
283
+ "transformers_version": "4.48.3",
284
+ "type_vocab_size": 2,
285
+ "use_cache": true,
286
+ "vocab_size": 30522
287
+ }
model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae644e537fc6e41fe578fe0a0171f05e6c6daa0d1209ccfa5f7b1496f565795c
3
+ size 337062635
ort_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "one_external_file": true,
3
+ "opset": null,
4
+ "optimization": {},
5
+ "quantization": {
6
+ "activations_dtype": "QUInt8",
7
+ "activations_symmetric": false,
8
+ "format": "QOperator",
9
+ "is_static": false,
10
+ "mode": "IntegerOps",
11
+ "nodes_to_exclude": [],
12
+ "nodes_to_quantize": [],
13
+ "operators_to_quantize": [
14
+ "Conv",
15
+ "MatMul",
16
+ "Attention",
17
+ "LSTM",
18
+ "Gather",
19
+ "Transpose",
20
+ "EmbedLayerNormalization"
21
+ ],
22
+ "per_channel": false,
23
+ "qdq_add_pair_to_weight": false,
24
+ "qdq_dedicated_pair": false,
25
+ "qdq_op_type_per_channel_support_to_axis": {
26
+ "MatMul": 1
27
+ },
28
+ "reduce_range": false,
29
+ "weights_dtype": "QInt8",
30
+ "weights_symmetric": true
31
+ },
32
+ "use_external_data_format": false
33
+ }
pipeline.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ from transformers import AutoModelForSequenceClassification, Pipeline
4
+ from transformers.pipelines import PIPELINE_REGISTRY
5
+
6
+
7
class MultiTaskLabelEncoder:
    """Maps one concatenated logit vector onto per-task class predictions.

    ``config`` is the ``label_config`` dict taken from the model config; its
    ``multi_class`` entry lists one task description per classification head,
    each with a ``labels`` list of ``[key, display_value]`` pairs.
    """

    def __init__(self, config):
        # Keep the raw config; the per-task label lists drive all slicing.
        self.config = config
        tasks = config["multi_class"]
        self.num_tasks = len(tasks)
        self.label_sets = [task["labels"] for task in tasks]
        # offsets[i] = column index of the first logit belonging to task i.
        self.offsets = [0]
        for labels in self.label_sets[:-1]:
            self.offsets.append(self.offsets[-1] + len(labels))
        self.total_labels = sum(len(labels) for labels in self.label_sets)

    def preds_from_logits(self, logits):
        """Return the argmax class index for every task.

        ``logits`` has shape (batch_size, total_labels); the result has
        shape (batch_size, num_tasks).
        """
        per_task = []
        start = 0
        for labels in self.label_sets:
            stop = start + len(labels)
            # Slice out this task's logit block and take its argmax.
            per_task.append(np.argmax(logits[:, start:stop], axis=-1))
            start = stop
        return np.stack(per_task, axis=1)
33
+
34
+
35
class MultiTaskClassifierPipeline(Pipeline):
    """Text pipeline for a model whose several classification heads emit one
    concatenated logit vector.

    Requires ``model.config.label_config`` (consumed by
    ``MultiTaskLabelEncoder``); supports both PyTorch and ONNX Runtime model
    classes.
    """

    def __init__(self, model, tokenizer, device=-1, **kwargs):
        super().__init__(model=model, tokenizer=tokenizer, device=device)

        if not hasattr(model.config, "label_config"):
            raise ValueError("Your model config must contain 'label_config'.")

        self.label_config = model.config.label_config
        self.label_encoder = MultiTaskLabelEncoder(self.label_config)
        # ONNX Runtime wrappers live in an "onnxruntime" module; they want
        # NumPy inputs rather than torch tensors (see _forward).
        self.is_onnx = "onnxruntime" in model.__class__.__module__.lower()

    def _sanitize_parameters(self, **kwargs):
        # No stage (preprocess/forward/postprocess) accepts extra parameters.
        return {}, {}, {}

    def preprocess(self, inputs):
        # Tokenize to torch tensors; _forward converts them for ONNX if needed.
        return self.tokenizer(inputs, return_tensors="pt", truncation=True, padding=True)

    def _forward(self, model_inputs):
        if self.is_onnx:
            # ONNX Runtime consumes NumPy arrays on the CPU.
            feed = {name: tensor.cpu().numpy() for name, tensor in model_inputs.items()}
            outputs = self.model(**feed)
            logits = outputs.logits if isinstance(outputs, dict) else outputs[0]
        else:
            # PyTorch: move inputs onto the model's device, run without grads.
            feed = {name: tensor.to(self.model.device) for name, tensor in model_inputs.items()}
            with torch.no_grad():
                outputs = self.model(**feed)
            logits = outputs.logits

        return {"logits": logits}

    def postprocess(self, model_outputs):
        """Turn raw logits into one result dict per input row.

        Each result has "type" and "category" entries of the form
        {"key": ..., "value": ...}, plus an "attributes" list that collects
        the display values of positive time_based ("y") / perception ("p")
        predictions.
        """
        preds = self.label_encoder.preds_from_logits(model_outputs["logits"])

        results = []
        for row in preds:
            entry = {
                "type": {},
                "category": {},
                "attributes": []
            }

            for idx, task in enumerate(self.label_config["multi_class"]):
                key, value = task["labels"][row[idx]]
                task_name = task["name"]

                if task_name in ["type", "category"]:
                    entry[task_name] = {"key": key, "value": value}
                elif task_name in ["time_based", "perception"] and key in ["y", "p"]:
                    entry["attributes"].append(value)

            results.append(entry)

        return results
91
+
92
+
93
def register_classifier_pipeline():
    """Register the custom "question-classifier" task with Hugging Face's
    pipeline registry so `pipeline("question-classifier", ...)` resolves to
    MultiTaskClassifierPipeline.
    """
    registration = {
        "task": "question-classifier",
        "pipeline_class": MultiTaskClassifierPipeline,
        "pt_model": AutoModelForSequenceClassification,
        "type": "text",
    }
    PIPELINE_REGISTRY.register_pipeline(**registration)
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "max_length": 512,
50
+ "model_max_length": 512,
51
+ "pad_to_multiple_of": null,
52
+ "pad_token": "[PAD]",
53
+ "pad_token_type_id": 0,
54
+ "padding_side": "right",
55
+ "sep_token": "[SEP]",
56
+ "stride": 0,
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "BertTokenizer",
60
+ "truncation_side": "right",
61
+ "truncation_strategy": "longest_first",
62
+ "unk_token": "[UNK]"
63
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff