mca_comment_analyzer / mca_comment_analyzer.py
Harshb11's picture
Update mca_comment_analyzer.py
056252e verified
# mca_comment_analyzer.py
import os
import pandas as pd
import torch
from transformers import pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import nltk
import random
from datetime import datetime, timedelta
# -----------------------------
# Configs
# -----------------------------
# Point matplotlib and NLTK at writable locations under /tmp.
# NOTE(review): matplotlib.pyplot and nltk are already imported above this
# point; matplotlib appears to resolve MPLCONFIGDIR during import, so this
# assignment may need to move above the imports to take effect -- confirm.
os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"
# NLTK Stopwords
nltk.download('stopwords', download_dir="/tmp/nltk_data", quiet=True)
# nltk builds its search path (nltk.data.path) from NLTK_DATA when the package
# is imported, which has already happened, so the variable set above is not
# picked up by this process. Register the download directory explicitly so
# the corpus loader can find what was just downloaded.
if "/tmp/nltk_data" not in nltk.data.path:
    nltk.data.path.append("/tmp/nltk_data")
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
# -----------------------------
# MCA Comment Analyzer
# -----------------------------
class MCACommentAnalyzer:
    """Label, summarize and keyword-tag free-text comments.

    Each comment gets a sentiment label (keyword rules first, a
    sentiment-analysis pipeline as fallback), a short summary and a list of
    keywords. Results for a batch are returned as pandas DataFrames, and the
    keyword table can be rendered as a word cloud.
    """

    # Keyword rules in strict priority order:
    # Negative -> Violation -> Suggestion -> Positive.
    # Matching is by substring on the lower-cased comment, and the first
    # group with a hit wins; this is why "unclear" (Negative) is caught
    # before its substring "clear" (Positive) can match.
    _KEYWORD_RULES = (
        ("Negative", ("confusing", "unclear", "bad", "problem")),
        ("Violation", ("violation", "violates", "illegal", "non-compliant")),
        ("Suggestion", ("should", "recommend", "suggest", "advise", "better if")),
        ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
    )

    # Punctuation stripped from both ends of a token before keyword filtering.
    _STRIP_CHARS = ".,;:!?\"'()[]{}"

    def __init__(self):
        """Load the sentiment and summarization pipelines.

        Uses GPU 0 when CUDA is available, otherwise the CPU, and prints
        which one was selected.
        """
        device = 0 if torch.cuda.is_available() else -1
        print("Using device:", "GPU" if device == 0 else "CPU")
        # Sentiment model
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device,
        )
        # Summarizer
        self.summarizer = pipeline(
            "summarization",
            model="t5-small",
            device=device,
        )
        self.stop_words = STOPWORDS

    def map_sentiment(self, pred, text):
        """Return the category label for a comment.

        Args:
            pred: Mapping with a ``'label'`` entry, as produced by the
                sentiment pipeline; consulted only when no keyword matches.
            text: The raw comment text.

        Returns:
            ``"Negative"``, ``"Violation"``, ``"Suggestion"`` or
            ``"Positive"`` when a keyword rule matches; otherwise
            ``"Positive"``, ``"Negative"`` or ``"Neutral"`` derived from the
            model label.
        """
        text_lower = text.lower()
        for category, keywords in self._KEYWORD_RULES:
            if any(keyword in text_lower for keyword in keywords):
                return category

        # Fallback to the sentiment model's own label.
        model_label = pred["label"].upper()
        if model_label == "POSITIVE":
            return "Positive"
        if model_label == "NEGATIVE":
            return "Negative"
        return "Neutral"

    def process_comment(self, comment):
        """Analyze a single comment.

        Args:
            comment: The comment text (a ``str``).

        Returns:
            A tuple ``(sentiment, summary_text, keywords, top_keywords)``:
            the category label, a short summary, the distinct keywords in
            order of first appearance, and the three most frequent keywords
            joined with ``", "``.
        """
        # truncation=True keeps an over-long comment from exceeding the
        # model's input limit; this call is not inside a try block.
        pred = self.sentiment_model(comment, truncation=True)[0]
        sentiment = self.map_sentiment(pred, comment)

        tokens = comment.split()
        if len(tokens) < 10:
            # Too short to be worth summarizing: use the comment itself,
            # whitespace-normalized.
            summary_text = " ".join(tokens)
        else:
            try:
                summary_text = self.summarizer(
                    comment, max_length=30, min_length=5, do_sample=False
                )[0]["summary_text"]
            except Exception:
                # Best effort: fall back to the full comment if the
                # summarizer fails for any reason.
                summary_text = comment

        # Strip surrounding punctuation before the isalpha() filter so that
        # words such as "unclear." or "good," are not discarded.
        words = []
        for raw_token in comment.lower().split():
            token = raw_token.strip(self._STRIP_CHARS)
            if token.isalpha() and token not in self.stop_words:
                words.append(token)

        counts = Counter(words)
        keywords = list(counts)
        # most_common() keeps first-appearance order among equal counts.
        top_keywords = ", ".join(word for word, _ in counts.most_common(3))
        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyze a batch of comments.

        Args:
            comments_list: Iterable of comment strings.

        Returns:
            A tuple ``(df, keyword_freq)``. ``df`` has the columns
            ``Timestamp``, ``Comment``, ``Summary``, ``Sentiment`` and
            ``Top Keywords``, sorted by ``Timestamp`` ascending.
            ``keyword_freq`` has the columns ``Keyword`` and ``Frequency``,
            sorted by ``Frequency`` descending; since each comment
            contributes its distinct keywords once, the frequency is the
            number of comments that contain the keyword.

        Note:
            Timestamps are not read from the input. Each row is assigned a
            random day offset (0-30) from thirty days before now.
        """
        # Materialize once so generators work and a pandas Series is not
        # index-aligned against the plain result lists below.
        comments = list(comments_list)
        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        start_date = datetime.now() - timedelta(days=30)

        for comment in comments:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list,
        })
        df.sort_values(by='Timestamp', inplace=True, ascending=True)

        keyword_freq = pd.DataFrame(
            list(Counter(all_keywords).items()),
            columns=['Keyword', 'Frequency'],
        ).sort_values(by='Frequency', ascending=False)
        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Draw a word cloud of the keyword frequencies on a new figure.

        Args:
            keyword_freq: DataFrame with ``Keyword`` and ``Frequency``
                columns.
            filename: Optional path; when given, the figure is also saved
                there.

        Returns:
            The ``matplotlib.pyplot`` module, with the new figure current.
        """
        frequencies = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
        plt.figure(figsize=(10, 5))
        # WordCloud refuses an empty frequency table, so an empty input
        # yields a blank figure instead of an exception.
        if frequencies:
            cloud = WordCloud(
                width=800, height=400, background_color="white"
            ).generate_from_frequencies(frequencies)
            plt.imshow(cloud, interpolation="bilinear")
        plt.axis("off")
        if filename:
            plt.savefig(filename, bbox_inches='tight')
        return plt