Spaces:

Harshb11
/

mca_comment_analyzer

Sleeping

App Files Files Community

mca_comment_analyzer / mca_comment_analyzer.py

Harshb11

Update mca_comment_analyzer.py

056252e verified 5 months ago

raw

history blame contribute delete

4.68 kB

	# mca_comment_analyzer.py

	import os
	import pandas as pd
	import torch
	from transformers import pipeline
	from wordcloud import WordCloud
	import matplotlib.pyplot as plt
	from collections import Counter
	import nltk
	import random
	from datetime import datetime, timedelta

	# -----------------------------
	# Configs
	# -----------------------------
	# Point matplotlib and NLTK at writable locations under /tmp.
	# NOTE(review): both libraries are imported before these assignments, so
	# they presumably only take effect for code that reads the variables later
	# (e.g. child processes) — confirm, or move them above the imports.
	os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
	os.environ["NLTK_DATA"] = "/tmp/nltk_data"

	# NLTK Stopwords
	# nltk assembles its data search path (nltk.data.path) when it is imported,
	# which happens before NLTK_DATA is set above. Register the download
	# directory explicitly so the corpus fetched below can actually be found.
	if "/tmp/nltk_data" not in nltk.data.path:
	    nltk.data.path.append("/tmp/nltk_data")
	nltk.download('stopwords', download_dir="/tmp/nltk_data", quiet=True)
	from nltk.corpus import stopwords
	# English stop words, materialised once as a set for O(1) membership tests.
	STOPWORDS = set(stopwords.words('english'))

	# -----------------------------
	# MCA Comment Analyzer
	# -----------------------------
	class MCACommentAnalyzer:
	    """Label, summarise and keyword-tag free-text comments.

	    Each comment gets a category (keyword rules first, sentiment model as a
	    fallback), a short summary and a list of keywords. Batches are returned
	    as pandas DataFrames and can be rendered as a word cloud.
	    """

	    # Keyword groups checked by map_sentiment in this exact order; the first
	    # group with a substring hit wins (Negative → Violation → Suggestion →
	    # Positive). Order matters: "unclear" must be caught before "clear".
	    _KEYWORD_RULES = (
	        ("Negative", ("confusing", "unclear", "bad", "problem")),
	        ("Violation", ("violation", "violates", "illegal", "non-compliant")),
	        ("Suggestion", ("should", "recommend", "suggest", "advise", "better if")),
	        ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
	    )

	    # Upper-cased model labels and the category each one maps to.
	    _MODEL_LABELS = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}

	    # Characters stripped from both ends of a token before keyword filtering.
	    _EDGE_PUNCTUATION = ".,;:!?\"'()[]{}"

	    def __init__(self):
	        """Load the sentiment and summarisation pipelines.

	        Uses CUDA device 0 when torch reports CUDA as available, otherwise
	        the CPU (device -1), and prints which one was chosen.
	        """
	        device = 0 if torch.cuda.is_available() else -1
	        print("Using device:", "GPU" if device == 0 else "CPU")

	        # Sentiment model
	        self.sentiment_model = pipeline(
	            "sentiment-analysis",
	            model="distilbert-base-uncased-finetuned-sst-2-english",
	            device=device,
	        )

	        # Summarizer
	        self.summarizer = pipeline(
	            "summarization",
	            model="t5-small",
	            device=device,
	        )

	        self.stop_words = STOPWORDS

	    def map_sentiment(self, pred, text):
	        """Return the category label for *text*.

	        Keyword rules are applied first, as case-insensitive substring
	        matches in strict priority order. Only when no keyword matches is
	        the model prediction consulted.

	        Args:
	            pred: Mapping with a 'label' entry, as produced by the sentiment
	                model; read only when no keyword matches.
	            text: The comment text.

	        Returns:
	            One of "Negative", "Violation", "Suggestion", "Positive" or
	            "Neutral" ("Neutral" for any model label other than
	            POSITIVE/NEGATIVE).
	        """
	        text_lower = text.lower()
	        for category, needles in self._KEYWORD_RULES:
	            if any(needle in text_lower for needle in needles):
	                return category

	        # fallback to sentiment model
	        return self._MODEL_LABELS.get(pred['label'].upper(), "Neutral")

	    def process_comment(self, comment):
	        """Analyse a single comment.

	        Args:
	            comment: The comment text.

	        Returns:
	            A tuple ``(sentiment, summary_text, keywords, top_keywords)``:
	            the category from map_sentiment; the summary (the
	            whitespace-normalised comment itself when it has fewer than 10
	            words, otherwise the summariser output, or the raw comment if
	            summarisation fails); the distinct keywords in first-seen order;
	            and the three most frequent keywords joined with ", ".
	        """
	        # Truncate so that comments longer than the model's input limit are
	        # classified on their leading part instead of raising.
	        pred = self.sentiment_model(comment, truncation=True)[0]
	        sentiment = self.map_sentiment(pred, comment)

	        tokens = comment.split()
	        if len(tokens) < 10:
	            # Too short to be worth summarising.
	            summary_text = " ".join(tokens)
	        else:
	            try:
	                summary_text = self.summarizer(
	                    comment, max_length=30, min_length=5, do_sample=False
	                )[0]['summary_text']
	            except Exception:
	                # Best effort: fall back to the full comment, but let
	                # KeyboardInterrupt / SystemExit propagate.
	                summary_text = comment

	        # Strip edge punctuation first so that "slow," or "unclear." still
	        # count as words instead of being rejected by isalpha().
	        words = []
	        for raw in comment.lower().split():
	            word = raw.strip(self._EDGE_PUNCTUATION)
	            if word.isalpha() and word not in self.stop_words:
	                words.append(word)

	        counts = Counter(words)
	        keywords = list(counts)
	        # Rank by frequency; ties keep first-seen order.
	        top_keywords = ", ".join(word for word, _ in counts.most_common(3))
	        return sentiment, summary_text, keywords, top_keywords

	    def process_comments(self, comments_list):
	        """Analyse a batch of comments.

	        Args:
	            comments_list: Iterable of comment strings (one-shot iterables
	                such as generators are accepted).

	        Returns:
	            A tuple ``(df, keyword_freq)``. ``df`` has the columns
	            Timestamp, Comment, Summary, Sentiment and Top Keywords, sorted
	            by Timestamp ascending. ``keyword_freq`` has the columns Keyword
	            and Frequency, sorted by Frequency descending; each comment
	            contributes a keyword at most once, so Frequency is the number
	            of comments containing it.
	        """
	        # Materialise once: the comments are iterated here and reused below
	        # as a DataFrame column.
	        comments = list(comments_list)

	        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
	        start_date = datetime.now() - timedelta(days=30)

	        for comment in comments:
	            sentiment, summary, keywords, top_kw = self.process_comment(comment)
	            sentiments.append(sentiment)
	            summaries.append(summary)
	            all_keywords.extend(keywords)
	            top_keywords_list.append(top_kw)
	            # Timestamps are synthetic: a random day within the last 30 days.
	            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

	        df = pd.DataFrame({
	            "Timestamp": timestamps,
	            "Comment": comments,
	            "Summary": summaries,
	            "Sentiment": sentiments,
	            "Top Keywords": top_keywords_list,
	        })
	        df.sort_values(by='Timestamp', inplace=True, ascending=True)

	        keyword_freq = pd.DataFrame(
	            list(Counter(all_keywords).items()),
	            columns=['Keyword', 'Frequency'],
	        ).sort_values(by='Frequency', ascending=False)

	        return df, keyword_freq

	    def generate_wordcloud(self, keyword_freq, filename=None):
	        """Draw a word cloud of keyword frequencies on a new pyplot figure.

	        Args:
	            keyword_freq: Table with 'Keyword' and 'Frequency' columns, as
	                returned by process_comments.
	            filename: Optional path; when truthy the figure is also saved
	                there.

	        Returns:
	            The ``matplotlib.pyplot`` module, with the new figure current.
	        """
	        # NOTE(review): an empty keyword_freq presumably makes WordCloud
	        # raise — confirm and decide how callers should handle it.
	        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
	        wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
	        plt.figure(figsize=(10, 5))
	        plt.imshow(wc, interpolation="bilinear")
	        plt.axis("off")
	        if filename:
	            plt.savefig(filename, bbox_inches='tight')
	        return plt