Harshb11 committed on
Commit
f686d1b
·
verified ·
1 Parent(s): 50c6bfc

Update mca_comment_analyzer.py

Browse files
Files changed (1) hide show
  1. mca_comment_analyzer.py +46 -4
mca_comment_analyzer.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import pandas as pd
2
  from transformers import pipeline
3
  from wordcloud import WordCloud
@@ -10,20 +13,26 @@ from datetime import datetime, timedelta
10
  from langdetect import detect
11
  from deep_translator import GoogleTranslator
12
 
 
13
  nltk.download('stopwords')
14
 
15
- class MCACommentAnalyzer:
16
  def __init__(self):
 
17
  self.sentiment_model = pipeline(
18
  "sentiment-analysis",
19
- model="distilbert-base-uncased-finetuned-sst-2-english"
20
  )
 
21
  self.summarizer = pipeline(
22
  "summarization",
23
- model="sshleifer/distilbart-cnn-12-6"
24
  )
25
  self.stop_words = set(stopwords.words('english'))
26
 
 
 
 
27
  def translate_to_english(self, text):
28
  try:
29
  lang = detect(text)
@@ -33,6 +42,9 @@ class MCACommentAnalyzer:
33
  except:
34
  return text
35
 
 
 
 
36
  def map_sentiment(self, pred, text):
37
  text_lower = text.lower()
38
  violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
@@ -59,12 +71,15 @@ class MCACommentAnalyzer:
59
  else:
60
  return "Neutral"
61
 
 
 
 
62
  def process_comment(self, comment):
63
  translated_comment = self.translate_to_english(comment)
64
  pred = self.sentiment_model(translated_comment)[0]
65
  sentiment = self.map_sentiment(pred, translated_comment)
66
 
67
- # Summary
68
  if len(translated_comment.split()) < 10:
69
  summary_text = " ".join(translated_comment.split()[:10])
70
  else:
@@ -85,6 +100,9 @@ class MCACommentAnalyzer:
85
 
86
  return sentiment, summary_text, keywords, top_keywords
87
 
 
 
 
88
  def process_comments(self, comments_list):
89
  sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
90
  start_date = datetime.now() - timedelta(days=30)
@@ -105,8 +123,10 @@ class MCACommentAnalyzer:
105
  "Top Keywords": top_keywords_list
106
  })
107
 
 
108
  df.sort_values(by='Timestamp', inplace=True, ascending=True)
109
 
 
110
  keyword_freq = pd.DataFrame(
111
  Counter(all_keywords).items(),
112
  columns=['Keyword', 'Frequency']
@@ -114,6 +134,9 @@ class MCACommentAnalyzer:
114
 
115
  return df, keyword_freq
116
 
 
 
 
117
  def generate_wordcloud(self, keyword_freq, filename=None):
118
  wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
119
  wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
@@ -123,3 +146,22 @@ class MCACommentAnalyzer:
123
  if filename:
124
  plt.savefig(filename, bbox_inches='tight')
125
  return plt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -----------------------------
2
+ # MCACommentAnalyzerLight.py
3
+ # -----------------------------
4
  import pandas as pd
5
  from transformers import pipeline
6
  from wordcloud import WordCloud
 
13
  from langdetect import detect
14
  from deep_translator import GoogleTranslator
15
 
16
+ # Download stopwords once
17
  nltk.download('stopwords')
18
 
19
+ class MCACommentAnalyzerLight:
20
  def __init__(self):
21
+ # Lightweight sentiment model
22
  self.sentiment_model = pipeline(
23
  "sentiment-analysis",
24
+ model="cardiffnlp/twitter-roberta-base-sentiment"
25
  )
26
+ # Lightweight summarizer
27
  self.summarizer = pipeline(
28
  "summarization",
29
+ model="t5-small"
30
  )
31
  self.stop_words = set(stopwords.words('english'))
32
 
33
+ # -----------------------------
34
+ # Translate to English if needed
35
+ # -----------------------------
36
  def translate_to_english(self, text):
37
  try:
38
  lang = detect(text)
 
42
  except:
43
  return text
44
 
45
+ # -----------------------------
46
+ # Rule-based sentiment mapping
47
+ # -----------------------------
48
  def map_sentiment(self, pred, text):
49
  text_lower = text.lower()
50
  violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
 
71
  else:
72
  return "Neutral"
73
 
74
+ # -----------------------------
75
+ # Process single comment
76
+ # -----------------------------
77
  def process_comment(self, comment):
78
  translated_comment = self.translate_to_english(comment)
79
  pred = self.sentiment_model(translated_comment)[0]
80
  sentiment = self.map_sentiment(pred, translated_comment)
81
 
82
+ # Summary: truncate short comments or use summarizer
83
  if len(translated_comment.split()) < 10:
84
  summary_text = " ".join(translated_comment.split()[:10])
85
  else:
 
100
 
101
  return sentiment, summary_text, keywords, top_keywords
102
 
103
+ # -----------------------------
104
+ # Process multiple comments
105
+ # -----------------------------
106
  def process_comments(self, comments_list):
107
  sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
108
  start_date = datetime.now() - timedelta(days=30)
 
123
  "Top Keywords": top_keywords_list
124
  })
125
 
126
+ # Sort by Timestamp
127
  df.sort_values(by='Timestamp', inplace=True, ascending=True)
128
 
129
+ # Keyword frequency table
130
  keyword_freq = pd.DataFrame(
131
  Counter(all_keywords).items(),
132
  columns=['Keyword', 'Frequency']
 
134
 
135
  return df, keyword_freq
136
 
137
+ # -----------------------------
138
+ # Generate WordCloud
139
+ # -----------------------------
140
  def generate_wordcloud(self, keyword_freq, filename=None):
141
  wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
142
  wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
 
146
  if filename:
147
  plt.savefig(filename, bbox_inches='tight')
148
  return plt
149
+
150
+ # -----------------------------
151
+ # Quick Test (Optional)
152
+ # -----------------------------
153
+ if __name__ == "__main__":
154
+ comments = [
155
+ "The draft is very clear and helpful for companies.",
156
+ "Section 5 is confusing and needs clarification.",
157
+ "It would be better if SMEs get some relief.",
158
+ "I recommend including more examples for clarity.",
159
+ "Section 12 violates the Companies Act rules.",
160
+ "यह टिप्पणी हिंदी में है।", # Hindi comment example
161
+ "இந்த கருத்து தமிழில் உள்ளது." # Tamil comment example
162
+ ]
163
+
164
+ analyzer = MCACommentAnalyzerLight()
165
+ df, keyword_freq = analyzer.process_comments(comments)
166
+ print(df)
167
+ analyzer.generate_wordcloud(keyword_freq)