Add new file
- README.md +108 -0
- added_tokens.json +3 -0
- config.json +48 -0
- modeling_bert_classifier.py +84 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- tokenizer_config.json +15 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,108 @@
# BertForStorySkillClassification

## Model Overview

`BertForStorySkillClassification` is a BERT-based text classification model designed to categorize story-related questions into one of the following 7 classes:

1. **Character**
2. **Setting**
3. **Feeling**
4. **Action**
5. **Causal Relationship**
6. **Outcome Resolution**
7. **Prediction**

This model is suitable for applications in education, literary analysis, and story comprehension.

---

## Model Architecture

- **Base Model**: `bert-base-uncased`
- **Classification Layer**: A fully connected layer on top of BERT for 7-class classification.
- **Input**: Question text, either alone or followed by an answer after a `<SEP>` marker (e.g., "Who is the main character in the story?" or "why could n't alice get a doll as a child ? \<SEP> because her family was very poor ")
- **Output**: Predicted label and confidence score.

---

## Quick Start

### Install Dependencies

Ensure you have the `transformers` library installed:

```bash
pip install transformers
```

### Load Model and Tokenizer

The model class is defined in `modeling_bert_classifier.py` and registered via `auto_map` in `config.json`, so pass `trust_remote_code=True` when loading:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained(
    "curious008/BertForStorySkillClassification", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("curious008/BertForStorySkillClassification")
```

### Use the `predict` Method for Inference

```python
# Single-text prediction
result = model.predict(
    texts="Where does this story take place?",
    tokenizer=tokenizer,
    return_probabilities=True,
)
print(result)
# Output: [{'text': 'Where does this story take place?', 'label': 'setting', 'score': 0.93178}]

# Batch prediction
results = model.predict(
    texts=[
        "Why is the character sad?",
        "How does the story end?",
        "why could n't alice get a doll as a child ? <SEP> because her family was very poor ",
    ],
    tokenizer=tokenizer,
    batch_size=16,
    device="cuda",  # assumes the model is already on the GPU, e.g. via model.to("cuda")
)
print(results)
"""
Output:
[{'text': 'Why is the character sad?', 'label': 'causal relationship'},
 {'text': 'How does the story end?', 'label': 'action'},
 {'text': "why could n't alice get a doll as a child ? <SEP> because her family was very poor ",
  'label': 'causal relationship'}]
"""
```

## Training Details

### Dataset

Source: [FairytaleQAData](https://github.com/uci-soe/FairytaleQAData)

### Training Parameters

- Learning Rate: 2e-5
- Batch Size: 32
- Epochs: 3
- Optimizer: AdamW
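
For orientation, here is a minimal sketch of how these hyperparameters could be wired into a fine-tuning loop. It is not the repository's training script; `train_texts` and `train_labels` (integer label ids per `label2id` in `config.json`) are assumed to already be prepared from FairytaleQAData.

```python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader

# Assumed: train_texts is a list of question strings, train_labels a list of ints (0-6).
loader = DataLoader(list(zip(train_texts, train_labels)), batch_size=32, shuffle=True)

optimizer = AdamW(model.parameters(), lr=2e-5)
model.train()
for epoch in range(3):
    for texts, labels in loader:
        inputs = tokenizer(list(texts), padding=True, truncation=True,
                           max_length=512, return_tensors="pt")
        loss = model(**inputs, labels=labels)  # this forward returns the loss when labels are given
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```
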
### Performance Metrics

- Accuracy: 97.3%
- Recall: 96.59%
- F1 Score: 96.96%

## Notes

1. **Input Length**: The model supports a maximum input length of 512 tokens. Longer texts are truncated.
2. **Device Support**: The model supports both CPU and GPU inference. GPU is recommended for faster performance.
3. **Tokenizer**: Always use the matching tokenizer (via `AutoTokenizer`) with the model.

## Citation

If you use this model, please cite the following:

```
@misc{BertForStorySkillClassification,
  author       = {curious},
  title        = {BertForStorySkillClassification: A BERT-based Model for Story Question Classification},
  year         = {2025},
  publisher    = {Hugging Face},
  howpublished = {\url{https://huggingface.co/curious008/BertForStorySkillClassification}}
}
```

## License

This model is open-sourced under the Apache 2.0 License. For more details, see the [LICENSE](https://www.apache.org/licenses/LICENSE-2.0) file.

added_tokens.json
ADDED
@@ -0,0 +1,3 @@

{
  "<sep>": 30522
}
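
`<sep>` is a single extra token appended after BERT's base vocabulary (ids 0-30521), which is why `vocab_size` in `config.json` is 30523. Note that it is lowercase: with `do_lower_case: true` in `tokenizer_config.json`, a raw `<SEP>` marker is lowercased before lookup and so matches this token. A plausible sketch of how such a token gets registered (not taken from this repo):

```python
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["<sep>"])                # appended after the base vocab, landing at id 30522
model = BertModel.from_pretrained("bert-base-uncased")
model.resize_token_embeddings(len(tokenizer))  # grows the embedding matrix to 30523 rows
```
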
config.json
ADDED
@@ -0,0 +1,48 @@

{
  "revision": "1.0",
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForStorySkillClassification"
  ],
  "auto_map": {
    "AutoModelForSequenceClassification": "modeling_bert_classifier.BertForStorySkillClassification"
  },
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "character",
    "1": "setting",
    "2": "feeling",
    "3": "action",
    "4": "causal relationship",
    "5": "outcome resolution",
    "6": "prediction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "action": 3,
    "causal relationship": 4,
    "character": 0,
    "feeling": 2,
    "outcome resolution": 5,
    "prediction": 6,
    "setting": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30523
}
modeling_bert_classifier.py
ADDED
@@ -0,0 +1,84 @@

from typing import Dict, List, Union

import torch
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel, PreTrainedTokenizer


class BertForStorySkillClassification(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.post_init()

    def forward(self, input_ids, attention_mask=None, labels=None, **kwargs):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # last_hidden_state: [batch_size, seq_len, hidden_size]; take the [CLS] vector.
        cls_hidden_state = outputs.last_hidden_state[:, 0, :]
        logits = self.classifier(cls_hidden_state)  # [batch_size, num_labels]
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        return logits

    def predict(
        self,
        texts: Union[str, List[str]],
        tokenizer: PreTrainedTokenizer,
        batch_size: int = 32,
        return_probabilities: bool = False,
        device: Union[str, torch.device, None] = None,
    ) -> List[Dict]:
        """
        Run classification over the input text(s).

        Args:
            texts: A single text or a list of texts, e.g. "Who is the character in the story?"
                or ["question 1", "question 2"].
            tokenizer: A tokenizer instance (must match the model).
            batch_size: Batch size (improves inference speed).
            return_probabilities: Whether to also return the confidence score (labels only by default).
            device: Target device (e.g. "cuda" or "cpu"); defaults to the model's current device.

        Returns:
            A list of predictions, formatted as:
            [{"text": input text, "label": predicted label, "score": confidence}, ...]
        """
        # Fall back to the device the model is already on.
        if device is None:
            device = self.device

        # Normalize the input to a list.
        if isinstance(texts, str):
            texts = [texts]

        predictions = []

        # Batched inference.
        with torch.no_grad():
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i : i + batch_size]

                # Tokenize and convert to tensors.
                inputs = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    return_tensors="pt",
                    max_length=512,  # BERT's maximum input length
                ).to(device)

                # Model inference.
                logits = self(**inputs)
                probs = torch.softmax(logits, dim=-1)
                scores, class_ids = torch.max(probs, dim=-1)

                # Map class ids to labels (and scores, if requested).
                for text, class_id, score in zip(batch_texts, class_ids, scores):
                    label = self.config.id2label[class_id.item()]
                    result = {"text": text, "label": label}
                    if return_probabilities:
                        result["score"] = score.item()
                    predictions.append(result)

        return predictions
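
Note that this custom `forward` returns raw logits (or a bare loss when labels are given), not a `SequenceClassifierOutput`. If you want to bypass `predict`, here is a minimal manual decoding sketch, assuming `model` and `tokenizer` were loaded as in the README:

```python
import torch

inputs = tokenizer("Who is the main character in the story?", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)               # raw logits, shape [1, 7]
probs = logits.softmax(dim=-1)
pred_id = probs.argmax(dim=-1).item()
print(model.config.id2label[pred_id], probs.max().item())  # e.g. "character" plus a confidence
```
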
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@

version https://git-lfs.github.com/spec/v1
oid sha256:4a93fb540b17ffb5e5f40dae50d13001f35fdc30f25e1029b570209349b00d0d
size 438021741
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@

{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenizer_config.json
ADDED
@@ -0,0 +1,15 @@

{
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "mask_token": "[MASK]",
  "name_or_path": "/remote-home/CS_IMIAPD_chensong22/python/weights/google-bert/bert-base-uncased",
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "special_tokens_map_file": null,
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
The diff for this file is too large to render.