Spaces:

yuu1234
/

offensive-detection-bert

Sleeping

App Files Files Community

yuu1234 committed on Dec 14, 2025

Commit

c7483ea

1 Parent(s): f4cf167

Add

Browse files

Files changed (9) hide show

Dockerfile +23 -0
app.py +67 -0
best_model.pt +3 -0
model_save/config.json +36 -0
model_save/model.safetensors +3 -0
model_save/special_tokens_map.json +7 -0
model_save/tokenizer_config.json +58 -0
model_save/vocab.txt +0 -0
requirements.txt +6 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,23 @@

+# Base image
+FROM python:3.10-slim
+# Set working directory
+WORKDIR /app
+# Copy requirements and install
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy app code and model
+COPY app.py .
+COPY model_save/ ./model_save/
+COPY best_model.pt .
+# Expose ports.
+# Dockerfile comments must sit on their own line: a "#" that follows an
+# instruction is passed to that instruction as an argument, so
+# "EXPOSE 7860  # Gradio UI" is rejected as an invalid port specification.
+# Gradio UI
+EXPOSE 7860
+# Flask API
+EXPOSE 5000
+# HF Spaces runs this CMD.
+# Gunicorn serves the Flask API on port 5000.
+# The Gradio UI runs alongside it in the same container.
+# NOTE(review): app.py loads the model at import time, so the "python app.py"
+# process and each of the 4 Gunicorn workers hold their own copy of the model;
+# confirm the container has enough memory for that.
+CMD bash -c "python app.py & gunicorn -w 4 -b 0.0.0.0:5000 app:app"

app.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import gradio as gr
+from flask import Flask, request, jsonify
+import torch
+from transformers import BertTokenizer, BertForSequenceClassification
+import threading
+# ------------------- Device -------------------
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# ------------------- Load model -------------------
+tokenizer = BertTokenizer.from_pretrained("./model_save")
+model = BertForSequenceClassification.from_pretrained("./model_save")
+model.load_state_dict(torch.load("best_model.pt", map_location=device))
+model.to(device)
+model.eval()
+# ------------------- Prediction function -------------------
+def predict_offensive(text):
+    encoded = tokenizer(
+        text,
+        return_tensors="pt",
+        truncation=True,
+        padding="max_length",
+        max_length=128
+    )
+    input_ids = encoded["input_ids"].to(device)
+    attention_mask = encoded["attention_mask"].to(device)
+    with torch.no_grad():
+        logits = model(input_ids, attention_mask=attention_mask).logits
+    pred = torch.argmax(logits, dim=1).item()
+    return "Offensive" if pred == 1 else "Not Offensive"
+# ------------------- Flask API -------------------
+app = Flask(__name__)
+@app.route("/predict", methods=["POST"])
+def api_predict():
+    data = request.json
+    if not data or "text" not in data:
+        return jsonify({"error": "Missing 'text' field"}), 400
+    text = data["text"]
+    prediction = predict_offensive(text)
+    return jsonify({"prediction": prediction})
+# ------------------- Gradio UI -------------------
+iface = gr.Interface(
+    fn=predict_offensive,
+    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
+    outputs="text",
+    title="Offensive Language Detector",
+    description="Enter a sentence and the model predicts if it contains offensive language."
+)
+def run_gradio():
+    iface.launch(server_name="0.0.0.0", server_port=7860, share=False, prevent_thread_lock=True)
+# ------------------- Main -------------------
+if __name__ == "__main__":
+    # Start Gradio in a separate thread
+    threading.Thread(target=run_gradio).start()
+    # Flask API will be served by Gunicorn (HF Spaces sẽ build và chạy)
+    # gunicorn -w 4 -b 0.0.0.0:5000 app:app
+    print("Flask API ready. Use Gunicorn to serve for concurrent requests.")

best_model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:275976b05aebfa9922f12e79f5b0ba1fcb7074adf5d15a38d3058564f95a8e0a
+size 438020423

model_save/config.json ADDED Viewed

	@@ -0,0 +1,36 @@

+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.4",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

model_save/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:40ea5cf1edbb4e292ac44e02f69aa3c3fb4437da2ac453ced2a6a6e31c23f8e8
+size 437961724

model_save/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

model_save/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

model_save/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+torch
+transformers
+flask
+gunicorn
+gradio
+numpy