Spaces:
Sleeping
Sleeping
File size: 4,675 Bytes
03c0ebe 79a013c 2aabb82 79a013c 2aabb82 03c0ebe 2d8c6ff 03c0ebe 79a013c 03c0ebe 79a013c 2aabb82 03c0ebe 79a013c 2aabb82 79a013c 03c0ebe 2aabb82 79a013c 2aabb82 03c0ebe 2aabb82 2d8c6ff 79a013c 2aabb82 03c0ebe 79a013c 2aabb82 056252e 2d8c6ff 5937b4b 056252e 2aabb82 056252e 2aabb82 79a013c 2aabb82 79a013c 2aabb82 2d8c6ff 2aabb82 2d8c6ff 2aabb82 2d8c6ff 2aabb82 2d8c6ff 2aabb82 2d8c6ff 2aabb82 2d8c6ff 2aabb82 79a013c 2aabb82 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
# mca_comment_analyzer.py
import os
import pandas as pd
import torch
from transformers import pipeline
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import nltk
import random
from datetime import datetime, timedelta
# -----------------------------
# Configs
# -----------------------------
# Redirect matplotlib's config/cache directory and NLTK's data directory to /tmp.
# NOTE(review): matplotlib.pyplot is already imported above, so MPLCONFIGDIR is
# probably set too late to influence it — consider moving this assignment above
# the imports; confirm against the deployment environment.
os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
os.environ["NLTK_DATA"] = "/tmp/nltk_data"
# nltk was imported before NLTK_DATA was set, and it builds its search path at
# import time. Register the directory explicitly so the corpus lookup below can
# actually find what gets downloaded into it.
if "/tmp/nltk_data" not in nltk.data.path:
    nltk.data.path.append("/tmp/nltk_data")
# NLTK Stopwords
nltk.download('stopwords', download_dir="/tmp/nltk_data", quiet=True)
from nltk.corpus import stopwords
# English stop words, shared by every analyzer instance for keyword filtering.
STOPWORDS = set(stopwords.words('english'))
# -----------------------------
# MCA Comment Analyzer
# -----------------------------
class MCACommentAnalyzer:
    """Classify, summarize and keyword-mine free-text comments.

    Each comment gets a category label (keyword rules first, sentiment model
    as fallback), a short summary, and a list of keywords. Batch results are
    returned as pandas DataFrames, and keyword frequencies can be rendered
    as a word cloud.
    """

    # Keyword rules in strict priority order: the first category with a
    # matching keyword wins (Negative -> Violation -> Suggestion -> Positive).
    # Matching is by substring on the lower-cased text, so "problems" matches
    # "problem" and "unclear" is caught as Negative before "clear" is tried.
    _KEYWORD_RULES = (
        ("Negative", ("confusing", "unclear", "bad", "problem")),
        ("Violation", ("violation", "violates", "illegal", "non-compliant")),
        ("Suggestion", ("should", "recommend", "suggest", "advise", "better if")),
        ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
    )

    def __init__(self):
        """Load the sentiment and summarization pipelines.

        Uses the first CUDA device when one is available, otherwise the CPU.
        """
        device = 0 if torch.cuda.is_available() else -1
        print("Using device:", "GPU" if device == 0 else "CPU")
        # Sentiment model
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device
        )
        # Summarizer
        self.summarizer = pipeline(
            "summarization",
            model="t5-small",
            device=device
        )
        self.stop_words = STOPWORDS

    def map_sentiment(self, pred, text):
        """Map a comment to one of the analyzer's category labels.

        Args:
            pred: Sentiment-model prediction; only ``pred['label']`` is read.
            text: The raw comment text.

        Returns:
            "Negative", "Violation", "Suggestion" or "Positive" when a keyword
            rule matches; otherwise the model label mapped to "Positive",
            "Negative" or "Neutral".
        """
        text_lower = text.lower()
        for category, keywords in self._KEYWORD_RULES:
            if any(keyword in text_lower for keyword in keywords):
                return category

        # fallback to sentiment model
        label = pred['label'].upper()
        if label == "POSITIVE":
            return "Positive"
        if label == "NEGATIVE":
            return "Negative"
        return "Neutral"

    def process_comment(self, comment):
        """Analyze a single comment.

        Args:
            comment: The comment text.

        Returns:
            A tuple ``(sentiment, summary_text, keywords, top_keywords)``:
            the category label, the summary, the unique keywords in
            first-seen order, and a comma-separated string of the three most
            frequent keywords.
        """
        import string  # local import keeps this block self-contained

        # truncation=True keeps over-long comments within the model's input
        # limit instead of letting the pipeline fail on them.
        pred = self.sentiment_model(comment, truncation=True)[0]
        sentiment = self.map_sentiment(pred, comment)

        tokens = comment.split()
        if len(tokens) < 10:
            # Too short to summarize: use the whitespace-normalized comment.
            summary_text = " ".join(tokens)
        else:
            try:
                summary_text = self.summarizer(
                    comment, max_length=30, min_length=5, do_sample=False
                )[0]['summary_text']
            except Exception:
                # Best effort: if summarization fails, show the full comment.
                summary_text = comment

        # Strip surrounding punctuation so "slow," or "crashes." still count
        # as words; tokens that are not purely alphabetic afterwards are
        # skipped.
        words = []
        for raw in comment.lower().split():
            word = raw.strip(string.punctuation)
            if word.isalpha() and word not in self.stop_words:
                words.append(word)

        counts = Counter(words)
        keywords = list(counts)
        # most_common ranks by frequency; ties keep first-seen order.
        top_keywords = ", ".join(word for word, _ in counts.most_common(3))
        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyze a batch of comments.

        Args:
            comments_list: Iterable of comment strings (lists and one-shot
                iterators such as generators are both accepted).

        Returns:
            A tuple ``(df, keyword_freq)``. ``df`` has the columns
            "Timestamp", "Comment", "Summary", "Sentiment" and "Top Keywords",
            sorted by timestamp. ``keyword_freq`` has the columns "Keyword"
            and "Frequency" (number of comments containing the keyword),
            sorted by descending frequency.

        Note:
            Timestamps are synthetic: each comment is assigned a random day
            offset within the 30 days before the call.
        """
        # Materialize once so a generator is not exhausted before the
        # DataFrame is built from the same comments.
        comments = list(comments_list)

        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        start_date = datetime.now() - timedelta(days=30)
        for comment in comments:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list
        })
        df.sort_values(by='Timestamp', inplace=True, ascending=True)

        keyword_freq = pd.DataFrame(
            list(Counter(all_keywords).items()),
            columns=['Keyword', 'Frequency']
        ).sort_values(by='Frequency', ascending=False)
        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Draw a word cloud from a keyword-frequency table.

        Args:
            keyword_freq: DataFrame with "Keyword" and "Frequency" columns.
            filename: Optional path; when given, the figure is also saved
                there.

        Returns:
            The ``matplotlib.pyplot`` module, with the new figure current.
            The figure is left open for the caller to show or close.
        """
        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
        wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        if filename:
            plt.savefig(filename, bbox_inches='tight')
        return plt
|