# Spam Detection using DistilBERT and Quantization
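
This walkthrough fine-tunes `distilbert-base-uncased` for binary text classification and then applies dynamic quantization to shrink the trained model for CPU inference. The demo data is the `yelp_polarity` sentiment dataset; to target actual spam, swap in a spam/ham corpus at Step 1 (see the sketch after that step) and the rest of the pipeline carries over unchanged.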

## Install Dependencies

```bash
pip install transformers datasets evaluate scikit-learn torch
```

## Step 1: Load and Reduce Dataset

```python
from datasets import load_dataset

# Binary-label dataset; subsample so fine-tuning finishes in reasonable time.
dataset = load_dataset("yelp_polarity")
dataset["train"] = dataset["train"].shuffle(seed=42).select(range(50000))
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(10000))
```
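
Since the pipeline is ultimately aimed at spam detection, this step is the only one that has to change for a real spam corpus. A minimal sketch, assuming the Hugging Face `sms_spam` dataset (my assumption: it ships a single `train` split with `sms` and `label` columns — verify against the dataset card before relying on it):

```python
from datasets import load_dataset

# Assumed dataset and column names; check the dataset card before use.
spam = load_dataset("sms_spam", split="train")
spam = spam.rename_column("sms", "text")  # match the "text" column used below
dataset = spam.train_test_split(test_size=0.2, seed=42)  # DatasetDict with "train"/"test"
```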

## Step 2: Tokenization

```python
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    # Pad to the model's max length and truncate anything longer.
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
```
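
Padding every example to the full model length burns compute on short texts. An optional alternative, sketched here, is to truncate at tokenization time and let a collator pad each batch only to its longest member:

```python
from transformers import DataCollatorWithPadding

def tokenize_function(example):
    # No fixed padding here; the collator below pads per batch instead.
    return tokenizer(example["text"], truncation=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Pass data_collator=data_collator to the Trainer in Step 6 to use this.
```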

## Step 3: Rename 'label' to 'labels' and Set Format

```python
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
```
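
The rename matters because `Trainer` passes each batch to the model as keyword arguments, and Hugging Face models compute a loss only when they receive `labels`; `set_format` then makes the selected columns come back as PyTorch tensors.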

## Step 4: Load Model

```python
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
```
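
`num_labels=2` attaches a freshly initialized two-way classification head, which is why transformers warns about newly initialized weights here. Optionally, you can bake readable class names into the config; the mapping below follows yelp_polarity's convention (0 = negative, 1 = positive):

```python
# Optional variant with explicit label names stored in the config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "Negative", 1: "Positive"},
    label2id={"Negative": 0, "Positive": 1},
)
```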

## Step 5: Define Metrics

```python
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}
```
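
The install step also pulls in the `evaluate` library, which the sklearn version above never touches. If you prefer it, a roughly equivalent metric function looks like this:

```python
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        **accuracy.compute(predictions=preds, references=labels),
        **f1.compute(predictions=preds, references=labels),
    }
```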

## Step 6: Training Setup

```python
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
```
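
One version caveat: `eval_strategy` is the current name of this argument; older transformers releases spell it `evaluation_strategy`, so use whichever your installed version accepts.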

## Step 7: Train

```python
trainer.train()
trainer.save_model("./results")
tokenizer.save_pretrained("./results")
```
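
With `eval_strategy="epoch"` you already get metrics during training; for a final score on the held-out split after training finishes:

```python
metrics = trainer.evaluate()
print(metrics)  # accuracy, precision, recall, f1 from compute_metrics
```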

## Step 8: Inference on Sample Texts

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the fine-tuned checkpoint for standalone inference.
model = AutoModelForSequenceClassification.from_pretrained("./results")
tokenizer = AutoTokenizer.from_pretrained("./results")
model.eval()

sample_texts = [
    "The food was absolutely wonderful!",
    "Terrible experience. I will never come back.",
    "Average service, but the food was decent.",
    "I loved the ambiance and the staff was super friendly!",
    "Worst food I've had in a long time.",
    "Highly recommend this place for a date night.",
    "The waiter was rude and the food was cold.",
    "Amazing pizza, will order again!",
    "They took too long to serve and it was overpriced.",
    "Best customer service and delicious desserts!"
]

for text in sample_texts:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    sentiment = "Positive" if prediction == 1 else "Negative"
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")
```
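
For ad-hoc predictions, the `pipeline` helper wraps the tokenize/forward/argmax loop above into a single call. A minimal sketch (labels come back as `LABEL_0`/`LABEL_1` unless `id2label` was set as in Step 4):

```python
from transformers import pipeline

classifier = pipeline("text-classification", model="./results", tokenizer="./results")
result = classifier("Amazing pizza, will order again!")
print(result)  # list of {"label": ..., "score": ...} dicts
```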

## Step 9: Quantize the Model

```python
import os
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("./results")

# Dynamic quantization converts the Linear layers' weights to int8,
# shrinking the model and speeding up CPU inference.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

quantized_model_path = "./results/quantized_model"
os.makedirs(quantized_model_path, exist_ok=True)

# Save the quantized weights plus the config and tokenizer alongside them.
torch.save(quantized_model.state_dict(), f"{quantized_model_path}/pytorch_model.bin")
model.config.save_pretrained(quantized_model_path)
tokenizer = AutoTokenizer.from_pretrained("./results")
tokenizer.save_pretrained(quantized_model_path)

print("Quantized model saved at:", quantized_model_path)
```
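
Note that a dynamically quantized checkpoint cannot be reloaded with plain `from_pretrained`: the saved tensors are packed int8 weights that no longer match the float architecture. A sketch of the reload path, assuming the files saved above: rebuild the float model, re-apply the identical quantization transform, then load the saved weights.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Rebuild the float architecture from the fine-tuned checkpoint...
model = AutoModelForSequenceClassification.from_pretrained("./results")
# ...re-apply the same dynamic quantization so module layouts match...
quantized_model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
# ...then load the quantized weights saved in Step 9.
state_dict = torch.load("./results/quantized_model/pytorch_model.bin")
quantized_model.load_state_dict(state_dict)
quantized_model.eval()

tokenizer = AutoTokenizer.from_pretrained("./results/quantized_model")
inputs = tokenizer("Best customer service and delicious desserts!", return_tensors="pt")
with torch.no_grad():
    logits = quantized_model(**inputs).logits
print("Positive" if logits.argmax(dim=-1).item() == 1 else "Negative")
```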