Spaces:

DanTan05
/

sentiment-analyser

Sleeping

App Files Files Community

sentiment-analyser / models /bert_model.py

DanTan05

Upload models/bert_model.py with huggingface_hub

7a5dfe2 verified about 1 month ago

raw

history blame contribute delete

7.01 kB

	"""
	models/bert_model.py

	DistilBERT fine-tuned sentiment classifier.

	Training is done on Google Colab (GPU required) — see notebooks/colab_train.py.
	This file handles inference only, loading the saved checkpoint from disk.

	Public API (used by app.py):
	predict(text) -> {"label": str, "score": float, "keywords": list[str]}
	"""

	import os
	import sys
	import numpy as np
	import torch
	from transformers import (
	DistilBertTokenizerFast,
	DistilBertForSequenceClassification,
	)

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
	from data.labels import LABEL_NAMES # {0: "Negative", 1: "Positive", 2: "Neutral"}

	# ── Paths ─────────────────────────────────────────────────────────────────────

	# Directory of the locally saved checkpoint, resolved relative to this file.
	SAVE_DIR = os.path.join(
	    os.path.dirname(__file__),
	    "saved",
	    "bert",
	    "bert_sentiment",
	)
	# Hub repository id used as a fallback when SAVE_DIR does not exist on disk.
	HUB_MODEL = "DanTan05/bert-sentiment"

	# ── Module-level cache ────────────────────────────────────────────────────────
	# Filled in lazily by _load_models() on the first predict() call and reused
	# afterwards, so the checkpoint is only loaded once per process.

	_tokenizer = _model = _device = None


	def _load_models():
	    """
	    Load the tokenizer and model into the module-level cache on first use.

	    Does nothing when ``_model`` is already set. Otherwise the checkpoint is
	    read from SAVE_DIR if that path exists, else from the HUB_MODEL repo id,
	    moved to CUDA when available (CPU otherwise) and switched to eval mode.

	    The globals are only assigned after every step has succeeded. If loading
	    or the device transfer raises, the cache stays empty and the next call
	    retries from scratch instead of reusing a half-initialised model.
	    """
	    global _tokenizer, _model, _device

	    if _model is not None:
	        return

	    # Use local checkpoint if present (dev), otherwise download from Hub (Spaces).
	    source = SAVE_DIR if os.path.exists(SAVE_DIR) else HUB_MODEL

	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    print(f"Loading BERT model from '{source}' on {device}...")

	    tokenizer = DistilBertTokenizerFast.from_pretrained(source)
	    # Eager attention is requested explicitly; predict() calls the model with
	    # output_attentions=True and reads the returned attention weights.
	    model = DistilBertForSequenceClassification.from_pretrained(
	        source, attn_implementation="eager"
	    )
	    model.to(device)
	    model.eval()  # disables dropout — important for deterministic inference

	    # Publish last, with _model (the "already loaded" sentinel checked above)
	    # assigned after the values predict() uses alongside it.
	    _tokenizer, _device, _model = tokenizer, device, model


	# ── Inference ─────────────────────────────────────────────────────────────────

	def predict(text: str) -> dict:
	    """
	    Classify the sentiment of ``text`` with the cached DistilBERT model.

	    Returns a dict with three keys:
	        "label"    -- label string for the chosen class (from LABEL_NAMES,
	                      or the model config's id2label when LABEL_NAMES has
	                      no entry for that class index)
	        "score"    -- float, softmax probability of the chosen class (0–1)
	        "keywords" -- list of up to 10 tokens chosen by
	                      _extract_keywords_from_attention()

	    The forward pass runs with output_attentions=True so that the attention
	    tensors are available for keyword extraction; they are used there as a
	    proxy for token importance.
	    """
	    _load_models()

	    # Encode the text as PyTorch tensors, truncated to at most 512 tokens,
	    # and move every tensor to the device the model lives on.
	    encoded = _tokenizer(
	        text,
	        return_tensors="pt",
	        truncation=True,
	        max_length=512,
	        padding=True,
	    )
	    inputs = {name: tensor.to(_device) for name, tensor in encoded.items()}

	    # Inference only: skip gradient tracking for the forward pass.
	    with torch.no_grad():
	        result = _model(**inputs, output_attentions=True)

	    # Softmax over the class dimension, first (only) row of the batch.
	    probs = torch.softmax(result.logits, dim=-1)[0].cpu().numpy()
	    winner = int(np.argmax(probs))

	    # Neutral override: the neutral class was trained on Twitter data that was
	    # actually pos/neg, so the model over-predicts neutral for short
	    # opinionated text. When neutral (index 2) wins with probability below
	    # 0.60, use the stronger of Negative (0) / Positive (1) instead.
	    neutral_idx, neutral_floor = 2, 0.60
	    if winner == neutral_idx and float(probs[winner]) < neutral_floor:
	        winner = int(np.argmax(probs[:2]))
	    confidence = float(probs[winner])

	    # LABEL_NAMES takes precedence; the model's own id2label entry is only
	    # used when LABEL_NAMES has no mapping for this class index.
	    config_label = _model.config.id2label[winner]
	    label = LABEL_NAMES.get(winner, config_label)

	    top_tokens = _extract_keywords_from_attention(
	        result.attentions, inputs, top_n=10
	    )

	    return {"label": label, "score": confidence, "keywords": top_tokens}


	def _extract_keywords_from_attention(attentions, inputs, top_n: int = 10) -> list:
	    """
	    Pick the most-attended tokens using the last layer's attention weights.

	    Args:
	        attentions: sequence of per-layer attention tensors; only the last
	            one is used. Expected shape (1, n_heads, seq_len, seq_len).
	        inputs: mapping that contains "input_ids"; row 0 is decoded.
	        top_n: maximum number of keywords to return. Values <= 0 yield an
	            empty list.

	    Returns:
	        Up to ``top_n`` distinct token strings, highest score first.

	    Steps:
	        1. Take the last layer's attention for the first batch item.
	        2. Average across heads -> (seq_len, seq_len).
	        3. Sum over dim 0 (column sum) to get one score per token; with rows
	           as queries this is the attention each token received.
	        4. Convert token IDs back to strings, dropping the tokenizer's
	           special tokens and anything shorter than two characters.
	        5. Sort by score, de-duplicate, keep the first ``top_n``.

	    Caveat (worth knowing):
	        Attention weights ≠ explanation. Research (Jain & Wallace 2019) shows
	        attention doesn't always correlate with feature importance.
	        For a demo this is fine; for production use SHAP or integrated gradients.

	    NOTE(review): WordPiece continuation pieces ("##xyz") are scored on their
	    own, so a returned keyword can be a word fragment rather than a full word.
	    """
	    # Nothing was asked for. Without this guard the limit would only be
	    # checked after the first append, so top_n <= 0 would return one keyword.
	    if top_n <= 0:
	        return []

	    # attentions holds one tensor per layer — we want the last one
	    last_layer_attn = attentions[-1]                  # (1, heads, seq, seq)
	    avg_attn = last_layer_attn[0].mean(dim=0)         # (seq, seq)
	    token_scores = avg_attn.sum(dim=0).cpu().numpy()  # (seq,)

	    # Decode each token ID back to its string
	    input_ids = inputs["input_ids"][0].cpu().numpy()
	    special_ids = set(_tokenizer.all_special_ids)
	    tokens = _tokenizer.convert_ids_to_tokens(input_ids)

	    # Pair each usable token with its score
	    scored = []
	    for token, score, tid in zip(tokens, token_scores, input_ids):
	        if tid in special_ids:
	            continue
	        # WordPiece subword tokens carry a "##" marker — remove it
	        clean = token.replace("##", "")
	        if len(clean) < 2:  # skip single characters
	            continue
	        scored.append((clean, float(score)))

	    # Highest score first; the sort is stable, so ties keep sequence order.
	    scored.sort(key=lambda pair: pair[1], reverse=True)

	    # dict.fromkeys keeps the first (highest-scored) occurrence of each word.
	    unique_words = list(dict.fromkeys(word for word, _ in scored))
	    return unique_words[:top_n]