Jet-12138 committed on
Commit
0749e03
·
verified ·
1 Parent(s): e9007ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -53
app.py CHANGED
@@ -1,12 +1,11 @@
1
- import torch
2
- import torch.nn.functional as F
3
  from transformers import BertTokenizer
4
  import gradio as gr
5
- import json
6
 
7
- from model import CommentMTLModel
8
 
9
- # Set device, including MPS
10
  if torch.backends.mps.is_available():
11
  device = torch.device("mps")
12
  elif torch.cuda.is_available():
@@ -14,67 +13,92 @@ elif torch.cuda.is_available():
14
  else:
15
  device = torch.device("cpu")
16
 
17
- # Load tokenizer
18
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
19
 
20
- # Load config values manually
21
- with open("config.json", "r") as f:
22
- config_data = json.load(f)
23
 
24
- # Create model
25
  model = CommentMTLModel(
26
  model_name="bert-base-uncased",
27
- num_sentiment_labels=config_data["num_sentiment_labels"],
28
- num_toxicity_labels=config_data["num_toxicity_labels"],
29
- dropout_prob=config_data.get("dropout_prob", 0.1)
30
  )
31
  model.load_state_dict(torch.load("pytorch_model.bin", map_location=device))
32
- model.to(device)
33
- model.eval()
34
 
35
- # Define labels
36
  sentiment_labels = ["Negative", "Neutral", "Positive"]
37
- toxicity_labels = ["Toxic", "Severe Toxic", "Obscene", "Threat", "Insult", "Identity Hate"]
38
-
39
- # Define the prediction function
40
- def analyse_comment(comment):
41
- inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
42
- inputs = {k: v.to(device) for k, v in inputs.items() if k in ['input_ids', 'attention_mask', 'token_type_ids']}
43
-
44
- with torch.no_grad():
45
- outputs = model(**inputs)
46
-
47
- sentiment_logits = outputs["sentiment_logits"]
48
- toxicity_logits = outputs["toxicity_logits"]
49
-
50
- # Process sentiment (multi-class classification)
51
- sentiment_probs = F.softmax(sentiment_logits, dim=1).squeeze(0) # shape: (3,)
52
- sentiment_predictions = {}
53
-
54
- for idx, label in enumerate(sentiment_labels):
55
- prob = sentiment_probs[idx].item()
56
- sentiment_predictions[label] = round(prob, 4)
57
-
58
- # Process toxicity (multi-label classification)
59
- toxicity_probs = torch.sigmoid(toxicity_logits).squeeze(0) # shape: (6,)
60
- toxicity_predictions = {}
61
-
62
- for idx, label in enumerate(toxicity_labels):
63
- prob = toxicity_probs[idx].item()
64
- toxicity_predictions[label] = round(prob, 4)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  return {
67
- "Sentiment Probabilities": sentiment_predictions,
68
- "Toxicity Probabilities": toxicity_predictions
 
69
  }
70
 
71
- # Create Gradio interface
72
  iface = gr.Interface(
73
- fn=analyse_comment,
74
- inputs=gr.Textbox(lines=3, placeholder="Please enter a comment for analysis..."),
75
- outputs=gr.JSON(label="Prediction Results"),
76
- title="Comment Sentiment and Toxicity Classifier",
77
- description="This tool classifies the sentiment and the most probable type of toxicity in a given comment. It utilises a custom multi-task learning BERT model. Developed for academic demonstration purposes in Australia."
 
 
 
 
78
  )
79
 
80
- iface.launch()
 
 
1
# Standard library
import json
import math
from typing import Dict, List

# Third-party
import torch
import torch.nn.functional as F
import gradio as gr
from transformers import BertTokenizer

# Local: project-defined multi-task BERT wrapper (sentiment + toxicity heads)
from model import CommentMTLModel
8
# ------------ Device -----------------------------------------------------------------
# Prefer Apple-Silicon MPS, then CUDA, then fall back to CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
15
 
16
# ------------ Model / tokenizer ------------------------------------------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Hyper-parameters live in a plain JSON file shipped next to the weights.
with open("config.json", encoding="utf-8") as f:
    cfg = json.load(f)

model = CommentMTLModel(
    model_name="bert-base-uncased",
    num_sentiment_labels=cfg["num_sentiment_labels"],
    num_toxicity_labels=cfg["num_toxicity_labels"],
    dropout_prob=cfg.get("dropout_prob", 0.1),
)
# NOTE(review): torch.load unpickles arbitrary objects — acceptable for a trusted
# local checkpoint; consider weights_only=True (torch>=2.0) if the file could be
# untrusted.
model.load_state_dict(torch.load("pytorch_model.bin", map_location=device))
model.to(device).eval()  # inference only: disables dropout
 
30
 
 
31
# Output-head label names. Order matters: index i of each list corresponds to
# logit column i of the matching model head — presumably fixed at training time
# (verify against the training config if labels are ever reordered).
sentiment_labels = list(("Negative", "Neutral", "Positive"))
toxicity_labels = list(
    ("Toxic", "Severe Toxic", "Obscene", "Threat", "Insult", "Identity Hate")
)
34
# ------------ Core inference function ------------------------------------------------
@torch.inference_mode()
def analyse_batch(comments: List[str], tox_threshold: float = 0.30) -> Dict:
    """Classify a batch of raw comment strings and return aggregated statistics.

    Args:
        comments: list of raw comment strings. The UI advertises <=100, but any
            length is handled — the forward pass is chunked into mini-batches.
        tox_threshold: per-label probability above which a toxicity label is
            counted (default 0.30, matching the UI description).

    Returns:
        dict with:
            "sentiment_counts":           {label: number of comments predicted as label}
            "toxicity_counts":            {label: number of comments with prob > threshold}
            "comments_with_any_toxicity": number of comments with at least one
                                          toxicity label over the threshold
    """
    sent_counts = {lab: 0 for lab in sentiment_labels}
    tox_counts = {lab: 0 for lab in toxicity_labels}
    comments_with_any_tox = 0

    # Empty input: the tokenizer raises on an empty batch, so short-circuit
    # with all-zero statistics instead of crashing.
    if not comments:
        return {
            "sentiment_counts": sent_counts,
            "toxicity_counts": tox_counts,
            "comments_with_any_toxicity": 0,
        }

    # ---- encode all comments as one padded batch -------------------------------
    enc = tokenizer(
        list(comments),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    enc = {k: v.to(device) for k, v in enc.items()}
    token_type_ids = enc.get("token_type_ids")  # may be absent; slice only if present

    # ---- forward pass, chunked so a large batch still fits in memory -----------
    batch_size = 32
    n = enc["input_ids"].shape[0]
    for i in range(0, n, batch_size):
        sl = slice(i, i + batch_size)
        out = model(
            input_ids=enc["input_ids"][sl],
            attention_mask=enc["attention_mask"][sl],
            token_type_ids=None if token_type_ids is None else token_type_ids[sl],
        )

        # Sentiment: multi-class. argmax directly on logits — softmax is
        # monotonic, so applying it first cannot change the argmax.
        sent_pred = out["sentiment_logits"].argmax(dim=1)  # (b,)
        for idx in sent_pred.tolist():
            sent_counts[sentiment_labels[idx]] += 1

        # Toxicity: multi-label — independent sigmoid per label.
        tox_probs = out["toxicity_logits"].sigmoid()       # (b, num_toxicity_labels)
        toxic_mask = tox_probs > tox_threshold
        comments_with_any_tox += toxic_mask.any(dim=1).sum().item()
        for lab_idx, lab in enumerate(toxicity_labels):
            tox_counts[lab] += toxic_mask[:, lab_idx].sum().item()

    return {
        "sentiment_counts": sent_counts,
        "toxicity_counts": tox_counts,
        "comments_with_any_toxicity": int(comments_with_any_tox),
    }
89
 
90
# ------------ Gradio interface -------------------------------------------------------
# Components are built first, then wired into the Interface, so each piece can
# be tweaked independently.
_in_component = gr.JSON(label="List of comments (max 100)")
_out_component = gr.JSON(label="Aggregated statistics")
_description = (
    "Send up to 100 raw comment strings and receive counts of Positive/Neutral/Negative "
    "comments plus counts of toxicity labels where probability > 0.30."
)

iface = gr.Interface(
    fn=analyse_batch,
    inputs=_in_component,
    outputs=_out_component,
    title="YouTube Comment Sentiment & Toxicity Batch API",
    description=_description,
    allow_flagging="never",
)

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    iface.launch()