Spaces:

garrulus21yyx
/

TextClassification

Sleeping

App Files Files Community

Garrulus21yyx committed on Mar 14

Commit

c772fc0

1 Parent(s): 2a07c48

Add minimal Gradio Space files

Browse files

Files changed (9) hide show

app.py +109 -0
experiments/config.json +26 -0
experiments/model.safetensors +3 -0
experiments/special_tokens_map.json +7 -0
experiments/tokenizer.json +0 -0
experiments/tokenizer_config.json +56 -0
experiments/vocab.txt +0 -0
modeling_bert.py +101 -0
requirements.txt +5 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import os
+import gradio as gr
+import torch
+from transformers import AutoTokenizer
+from modeling_bert import BertForSequenceClassification
+# Directory that contains this app.py
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Directory where the model was saved after training finished
+MODEL_DIR = os.path.join(BASE_DIR, "experiments")
+# Use the GPU if the Space provides one; otherwise fall back to CPU automatically
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Mapping from class id to text label
+ID2LABEL = {
+    0: "not_disaster",
+    1: "disaster",
+}
+# Load the tokenizer from the local model directory
+tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
+# Load the trained classification model from the same directory
+model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
+model.to(DEVICE)
+model.eval()
+def inference(input_text):
+    # Classify one text; None or whitespace-only input returns a prompt message without calling the model
+    input_text = (input_text or "").strip()
+    if not input_text:
+        return "Please input a sentence."
+    # Encode the text into the input format the model accepts (truncated/padded to 128 tokens),
+    # including input_ids and attention_mask
+    inputs = tokenizer(
+        input_text,
+        max_length=128,
+        truncation=True,
+        padding="max_length",
+        return_tensors="pt",
+    )
+    # Move the input tensors to the same device as the model
+    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+    # Gradients are not needed at inference time
+    with torch.no_grad():
+        logits = model(**inputs).logits
+    # Take the highest-scoring class as the final prediction and return its ID2LABEL text
+    predicted_class_id = logits.argmax(dim=-1).item()
+    output = ID2LABEL[predicted_class_id]
+    return output
+# Build a simple web interface with Gradio Blocks
+with gr.Blocks(css="""
+.message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
+#component-2 > div.wrap.svelte-w6rprc {height: 600px;}
+""") as demo:
+    gr.Markdown("# Disaster Tweet Classifier")
+    gr.Markdown("Input a sentence or tweet, and the model will predict whether it describes a real disaster.")
+    # A single-row layout holding one input column
+    with gr.Row():
+        with gr.Column():
+            # Text entered by the user
+            input_text = gr.Textbox(
+                placeholder="Insert your text here...",
+                label="Input Text",
+                lines=4,
+            )
+            # Shows the model's prediction
+            answer = gr.Textbox(label="Prediction")
+            # Clicking this button triggers inference
+            generate_bt = gr.Button("Generate")
+    # Wire the button, input box, output box and inference function together
+    generate_bt.click(
+        fn=inference,
+        inputs=[input_text],
+        outputs=[answer],
+        show_progress=True,
+    )
+    # Provide a few examples so the demo is easy to try online
+    gr.Examples(
+        examples=[
+            ["Forest fire near La Ronge Sask. Canada"],
+            ["I love fruits and summer weather."],
+            ["There is an emergency evacuation happening now in the building across the street."],
+        ],
+        inputs=input_text,
+        outputs=answer,
+        fn=inference,
+        cache_examples=False,
+    )
+# Start the Gradio server
+demo.launch()

experiments/config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "dtype": "float32",
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "transformers_version": "4.57.6",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

experiments/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f9bd5192084f1fc6013bc7e52e4ee9ead272cc3d1c15593c451f5620946a5d8
+size 437958648

experiments/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

experiments/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

experiments/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

experiments/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_bert.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from typing import List, Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers import BertPreTrainedModel, BertModel
+from transformers.modeling_outputs import SequenceClassifierOutput
+class BertForSequenceClassification(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        # The backbone is still a standard BERT, used to extract a whole-sentence semantic representation.
+        self.bert = BertModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        # The classification head is very simple: dropout + a fully connected layer.
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # First run the BERT encoder to obtain the token-level representations and pooled_output.
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        # outputs[1] is the sentence-level representation for [CLS], commonly used for classification tasks.
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        loss = None
+        if labels is not None:
+            # Pick the loss automatically according to the problem type.
+            # The current dataset is binary classification, so in practice this takes the single_label_classification + CrossEntropyLoss path.
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio
+transformers>=4.30.2
+torch>=2.0.0
+safetensors
+sentencepiece!=0.1.92