Harshb11 committed on
Commit
2aabb82
·
verified ·
1 Parent(s): 9434cc2

Create mca_comment_analyzer.py

Browse files
Files changed (1) hide show
  1. mca_comment_analyzer.py +125 -0
mca_comment_analyzer.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from transformers import pipeline
3
+ from wordcloud import WordCloud
4
+ import matplotlib.pyplot as plt
5
+ from collections import Counter
6
+ import nltk
7
+ from nltk.corpus import stopwords
8
+ import random
9
+ from datetime import datetime, timedelta
10
+ from langdetect import detect
11
+ from deep_translator import GoogleTranslator
12
+
13
# Fetch the NLTK stop-word corpus only when it is not already installed, so
# importing this module does not hit the network (and print download logs)
# on every run. MCACommentAnalyzer.__init__ reads stopwords.words('english').
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
14
+
15
class MCACommentAnalyzer:
    """Classify, summarise and keyword-tag free-text comments.

    Each comment is (best-effort) translated to English, labelled with one of
    ``Violation`` / ``Suggestion`` / ``Positive`` / ``Negative`` / ``Neutral``,
    summarised, and reduced to a list of keywords. Batch results are returned
    as pandas DataFrames and can be rendered as a word cloud.
    """

    # Keyword rules are evaluated in this order; the first group with a hit
    # decides the label, so "Violation" outranks "Suggestion", and so on.
    _KEYWORD_RULES = (
        ("Violation", ("violation", "violates", "illegal", "non-compliant",
                       "breach", "unlawful", "risk", "penalty")),
        ("Suggestion", ("should", "recommend", "suggest", "advise",
                        "better if", "could", "need to")),
        ("Positive", ("clear", "helpful", "good", "appreciate", "support")),
        ("Negative", ("confusing", "unclear", "bad", "problem",
                      "needs clarification")),
    )

    # Mapping from the sentiment model's (upper-cased) label to our label.
    _MODEL_LABELS = {"POSITIVE": "Positive", "NEGATIVE": "Negative"}

    # Characters stripped from both ends of a token before keyword extraction.
    _PUNCTUATION = ".,;:!?\"'()[]{}"

    # Comments with fewer words than this are used verbatim as their summary.
    _SHORT_COMMENT_WORDS = 10

    def __init__(self,
                 sentiment_model_name="distilbert-base-uncased-finetuned-sst-2-english",
                 summarizer_model_name="sshleifer/distilbart-cnn-12-6"):
        """Load the sentiment and summarisation pipelines and the stop-word set.

        Args:
            sentiment_model_name: Model id passed to the "sentiment-analysis"
                pipeline.
            summarizer_model_name: Model id passed to the "summarization"
                pipeline.
        """
        self.sentiment_model = pipeline(
            "sentiment-analysis",
            model=sentiment_model_name,
        )
        self.summarizer = pipeline(
            "summarization",
            model=summarizer_model_name,
        )
        self.stop_words = set(stopwords.words('english'))

    def translate_to_english(self, text):
        """Return *text* translated to English, or *text* itself on any failure.

        Translation is best-effort: if language detection or the translation
        call raises, the original text is returned unchanged.
        """
        # Nothing to detect or translate in a blank / non-string value.
        if not isinstance(text, str) or not text.strip():
            return text
        try:
            if detect(text) == "en":
                return text
            translated = GoogleTranslator(source='auto', target='en').translate(text)
            # Defensive: fall back to the input if the translator yields nothing.
            return translated if translated else text
        except Exception:
            # Deliberate best-effort; KeyboardInterrupt/SystemExit still propagate.
            return text

    @staticmethod
    def _has_keyword(text_lower, keywords):
        """Return True if any keyword occurs in *text_lower* at the start of a word.

        Matching at a word start keeps inflections ("violations",
        "recommended") while rejecting embedded hits such as "clear" inside
        "unclear" or "risk" inside "asterisk".
        """
        for keyword in keywords:
            pos = text_lower.find(keyword)
            while pos != -1:
                if pos == 0 or not text_lower[pos - 1].isalnum():
                    return True
                pos = text_lower.find(keyword, pos + 1)
        return False

    def map_sentiment(self, pred, text):
        """Map a comment to a label using keyword rules, then the model output.

        Args:
            pred: Mapping with a ``'label'`` entry from the sentiment model.
            text: The (English) comment text.

        Returns:
            "Violation", "Suggestion", "Positive" or "Negative" when a keyword
            rule matches; otherwise "Positive"/"Negative" from the model label,
            or "Neutral" for any other label.
        """
        text_lower = text.lower()
        for label, keywords in self._KEYWORD_RULES:
            if self._has_keyword(text_lower, keywords):
                return label
        return self._MODEL_LABELS.get(pred['label'].upper(), "Neutral")

    def process_comment(self, comment):
        """Analyse one comment.

        Args:
            comment: The raw comment text; ``None`` is treated as empty.

        Returns:
            A tuple ``(sentiment, summary_text, keywords, top_keywords)`` where
            ``keywords`` is the list of distinct keywords ordered by frequency
            (ties in first-seen order) and ``top_keywords`` is the first three
            joined by ", ". A blank comment yields ``("Neutral", "", [], "")``
            without calling the models.
        """
        if comment is None:
            comment = ""
        translated_comment = self.translate_to_english(comment)
        tokens = translated_comment.split()
        if not tokens:
            return "Neutral", "", [], ""

        # truncation=True keeps over-long comments within the model's input limit.
        pred = self.sentiment_model(translated_comment, truncation=True)[0]
        sentiment = self.map_sentiment(pred, translated_comment)

        # Summary: short comments are their own summary (whitespace-normalised).
        if len(tokens) < self._SHORT_COMMENT_WORDS:
            summary_text = " ".join(tokens)
        else:
            try:
                summary_text = self.summarizer(
                    translated_comment,
                    max_length=30,
                    min_length=5,
                    do_sample=False,
                    truncation=True,
                )[0]['summary_text']
            except Exception:
                # Best-effort: fall back to the full text if summarisation fails.
                summary_text = translated_comment

        # Keywords: strip surrounding punctuation so "fees," and "apply." count.
        words = []
        for raw_token in translated_comment.lower().split():
            word = raw_token.strip(self._PUNCTUATION)
            if word.isalpha() and word not in self.stop_words:
                words.append(word)
        keywords = [word for word, _ in Counter(words).most_common()]
        top_keywords = ", ".join(keywords[:3])

        return sentiment, summary_text, keywords, top_keywords

    def process_comments(self, comments_list):
        """Analyse a batch of comments.

        Args:
            comments_list: Iterable of comment strings (lists and generators
                are both accepted).

        Returns:
            ``(df, keyword_freq)``. ``df`` has the columns "Timestamp",
            "Comment", "Summary", "Sentiment" and "Top Keywords", sorted by
            timestamp. ``keyword_freq`` has the columns "Keyword" and
            "Frequency" (number of comments containing the keyword), sorted
            by descending frequency.

        Note:
            Timestamps are synthetic: each comment gets a random day within
            the 30 days before now. NOTE(review): replace with real submission
            times if the caller has them.
        """
        # Materialise once: the input is iterated here and reused as a column.
        comments = list(comments_list)
        sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
        start_date = datetime.now() - timedelta(days=30)

        for comment in comments:
            sentiment, summary, keywords, top_kw = self.process_comment(comment)
            sentiments.append(sentiment)
            summaries.append(summary)
            all_keywords.extend(keywords)
            top_keywords_list.append(top_kw)
            timestamps.append(start_date + timedelta(days=random.randint(0, 30)))

        df = pd.DataFrame({
            "Timestamp": timestamps,
            "Comment": comments,
            "Summary": summaries,
            "Sentiment": sentiments,
            "Top Keywords": top_keywords_list,
        })

        # Stable sort: comments that share a (day-granular) timestamp keep
        # their input order.
        df.sort_values(by='Timestamp', inplace=True, ascending=True, kind="mergesort")

        keyword_freq = pd.DataFrame(
            list(Counter(all_keywords).items()),
            columns=['Keyword', 'Frequency'],
        ).sort_values(by='Frequency', ascending=False, kind="mergesort")

        return df, keyword_freq

    def generate_wordcloud(self, keyword_freq, filename=None):
        """Draw a word cloud from a keyword-frequency table.

        Args:
            keyword_freq: DataFrame with "Keyword" and "Frequency" columns.
            filename: Optional path; when given, the figure is also saved there.

        Returns:
            The ``matplotlib.pyplot`` module with the new figure current, so
            the caller can show or close it.

        Raises:
            ValueError: If *keyword_freq* contains no keywords.
        """
        wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
        if not wc_dict:
            raise ValueError(
                "keyword_freq is empty; at least one keyword is required to draw a word cloud"
            )
        wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
        plt.figure(figsize=(10, 5))
        plt.imshow(wc, interpolation="bilinear")
        plt.axis("off")
        if filename:
            plt.savefig(filename, bbox_inches='tight')
        return plt