Harshb11 committed on
Commit
5937b4b
·
verified ·
1 Parent(s): f73b119

Update mca_comment_analyzer.py

Browse files
Files changed (1) hide show
  1. mca_comment_analyzer.py +15 -58
mca_comment_analyzer.py CHANGED
@@ -1,6 +1,3 @@
1
- # -----------------------------
2
- # MCACommentAnalyzerLight.py
3
- # -----------------------------
4
  import pandas as pd
5
  from transformers import pipeline
6
  from wordcloud import WordCloud
@@ -13,26 +10,22 @@ from datetime import datetime, timedelta
13
  from langdetect import detect
14
  from deep_translator import GoogleTranslator
15
 
16
- # Download stopwords once
17
- nltk.download('stopwords')
18
 
19
  class MCACommentAnalyzerLight:
20
  def __init__(self):
21
- # Lightweight sentiment model
22
  self.sentiment_model = pipeline(
23
  "sentiment-analysis",
24
- model="cardiffnlp/twitter-roberta-base-sentiment"
 
25
  )
26
- # Lightweight summarizer
27
  self.summarizer = pipeline(
28
  "summarization",
29
- model="t5-small"
 
30
  )
31
  self.stop_words = set(stopwords.words('english'))
32
 
33
- # -----------------------------
34
- # Translate to English if needed
35
- # -----------------------------
36
  def translate_to_english(self, text):
37
  try:
38
  lang = detect(text)
@@ -42,67 +35,55 @@ class MCACommentAnalyzerLight:
42
  except:
43
  return text
44
 
45
- # -----------------------------
46
- # Rule-based sentiment mapping
47
- # -----------------------------
48
  def map_sentiment(self, pred, text):
49
  text_lower = text.lower()
50
  violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
 
 
 
 
51
  if any(w in text_lower for w in violation_keywords):
52
  return "Violation"
53
-
54
- suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if", "could", "need to"]
55
  if any(w in text_lower for w in suggestion_keywords):
56
  return "Suggestion"
57
-
58
- positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]
59
  if any(w in text_lower for w in positive_keywords):
60
  return "Positive"
61
-
62
- negative_keywords = ["confusing", "unclear", "bad", "problem", "needs clarification"]
63
  if any(w in text_lower for w in negative_keywords):
64
  return "Negative"
65
 
66
  label = pred['label'].upper()
67
- if label == "POSITIVE":
68
  return "Positive"
69
- elif label == "NEGATIVE":
70
  return "Negative"
71
  else:
72
  return "Neutral"
73
 
74
- # -----------------------------
75
- # Process single comment
76
- # -----------------------------
77
  def process_comment(self, comment):
78
  translated_comment = self.translate_to_english(comment)
79
  pred = self.sentiment_model(translated_comment)[0]
80
  sentiment = self.map_sentiment(pred, translated_comment)
81
 
82
- # Summary: truncate short comments or use summarizer
83
  if len(translated_comment.split()) < 10:
84
  summary_text = " ".join(translated_comment.split()[:10])
85
  else:
86
  try:
87
  summary_text = self.summarizer(
88
  translated_comment,
89
- max_length=30,
90
  min_length=5,
91
  do_sample=False
92
  )[0]['summary_text']
93
  except:
94
  summary_text = translated_comment
95
 
96
- # Keywords
97
  words = [w for w in translated_comment.lower().split() if w.isalpha() and w not in self.stop_words]
98
  keywords = list(Counter(words).keys())
99
  top_keywords = ", ".join(keywords[:3])
100
 
101
  return sentiment, summary_text, keywords, top_keywords
102
 
103
- # -----------------------------
104
- # Process multiple comments
105
- # -----------------------------
106
  def process_comments(self, comments_list):
107
  sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
108
  start_date = datetime.now() - timedelta(days=30)
@@ -123,10 +104,8 @@ class MCACommentAnalyzerLight:
123
  "Top Keywords": top_keywords_list
124
  })
125
 
126
- # Sort by Timestamp
127
  df.sort_values(by='Timestamp', inplace=True, ascending=True)
128
 
129
- # Keyword frequency table
130
  keyword_freq = pd.DataFrame(
131
  Counter(all_keywords).items(),
132
  columns=['Keyword', 'Frequency']
@@ -134,34 +113,12 @@ class MCACommentAnalyzerLight:
134
 
135
  return df, keyword_freq
136
 
137
- # -----------------------------
138
- # Generate WordCloud
139
- # -----------------------------
140
  def generate_wordcloud(self, keyword_freq, filename=None):
141
  wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
142
- wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
143
- plt.figure(figsize=(10,5))
144
  plt.imshow(wc, interpolation="bilinear")
145
  plt.axis("off")
146
  if filename:
147
  plt.savefig(filename, bbox_inches='tight')
148
  return plt
149
-
150
- # -----------------------------
151
- # Quick Test (Optional)
152
- # -----------------------------
153
- if __name__ == "__main__":
154
- comments = [
155
- "The draft is very clear and helpful for companies.",
156
- "Section 5 is confusing and needs clarification.",
157
- "It would be better if SMEs get some relief.",
158
- "I recommend including more examples for clarity.",
159
- "Section 12 violates the Companies Act rules.",
160
- "यह टिप्पणी हिंदी में है।", # Hindi comment example
161
- "இந்த கருத்து தமிழில் உள்ளது." # Tamil comment example
162
- ]
163
-
164
- analyzer = MCACommentAnalyzerLight()
165
- df, keyword_freq = analyzer.process_comments(comments)
166
- print(df)
167
- analyzer.generate_wordcloud(keyword_freq)
 
 
 
 
1
  import pandas as pd
2
  from transformers import pipeline
3
  from wordcloud import WordCloud
 
10
  from langdetect import detect
11
  from deep_translator import GoogleTranslator
12
 
13
+ nltk.download('stopwords', quiet=True)
 
14
 
15
  class MCACommentAnalyzerLight:
16
  def __init__(self):
 
17
  self.sentiment_model = pipeline(
18
  "sentiment-analysis",
19
+ model="cardiffnlp/twitter-roberta-base-sentiment",
20
+ device=-1
21
  )
 
22
  self.summarizer = pipeline(
23
  "summarization",
24
+ model="sshleifer/distilbart-cnn-6-6",
25
+ device=-1
26
  )
27
  self.stop_words = set(stopwords.words('english'))
28
 
 
 
 
29
  def translate_to_english(self, text):
30
  try:
31
  lang = detect(text)
 
35
  except:
36
  return text
37
 
 
 
 
38
  def map_sentiment(self, pred, text):
39
  text_lower = text.lower()
40
  violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
41
+ suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if", "could", "need to"]
42
+ positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]
43
+ negative_keywords = ["confusing", "unclear", "bad", "problem", "needs clarification"]
44
+
45
  if any(w in text_lower for w in violation_keywords):
46
  return "Violation"
 
 
47
  if any(w in text_lower for w in suggestion_keywords):
48
  return "Suggestion"
 
 
49
  if any(w in text_lower for w in positive_keywords):
50
  return "Positive"
 
 
51
  if any(w in text_lower for w in negative_keywords):
52
  return "Negative"
53
 
54
  label = pred['label'].upper()
55
+ if label in ["POSITIVE", "LABEL_2"]:
56
  return "Positive"
57
+ elif label in ["NEGATIVE", "LABEL_0"]:
58
  return "Negative"
59
  else:
60
  return "Neutral"
61
 
 
 
 
62
  def process_comment(self, comment):
63
  translated_comment = self.translate_to_english(comment)
64
  pred = self.sentiment_model(translated_comment)[0]
65
  sentiment = self.map_sentiment(pred, translated_comment)
66
 
67
+ # Summary
68
  if len(translated_comment.split()) < 10:
69
  summary_text = " ".join(translated_comment.split()[:10])
70
  else:
71
  try:
72
  summary_text = self.summarizer(
73
  translated_comment,
74
+ max_length=20,
75
  min_length=5,
76
  do_sample=False
77
  )[0]['summary_text']
78
  except:
79
  summary_text = translated_comment
80
 
 
81
  words = [w for w in translated_comment.lower().split() if w.isalpha() and w not in self.stop_words]
82
  keywords = list(Counter(words).keys())
83
  top_keywords = ", ".join(keywords[:3])
84
 
85
  return sentiment, summary_text, keywords, top_keywords
86
 
 
 
 
87
  def process_comments(self, comments_list):
88
  sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
89
  start_date = datetime.now() - timedelta(days=30)
 
104
  "Top Keywords": top_keywords_list
105
  })
106
 
 
107
  df.sort_values(by='Timestamp', inplace=True, ascending=True)
108
 
 
109
  keyword_freq = pd.DataFrame(
110
  Counter(all_keywords).items(),
111
  columns=['Keyword', 'Frequency']
 
113
 
114
  return df, keyword_freq
115
 
 
 
 
116
  def generate_wordcloud(self, keyword_freq, filename=None):
117
  wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
118
+ wc = WordCloud(width=600, height=300, background_color="white").generate_from_frequencies(wc_dict)
119
+ plt.figure(figsize=(8,4))
120
  plt.imshow(wc, interpolation="bilinear")
121
  plt.axis("off")
122
  if filename:
123
  plt.savefig(filename, bbox_inches='tight')
124
  return plt