Spaces:

Harshb11
/

mca_comment_analyzer

Sleeping

App Files Community

Harshb11 committed on Sep 14, 2025

Commit

f686d1b

verified ·

1 Parent(s): 50c6bfc

Update mca_comment_analyzer.py

Browse files

Files changed (1) hide show

mca_comment_analyzer.py +46 -4

mca_comment_analyzer.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import pandas as pd
 from transformers import pipeline
 from wordcloud import WordCloud
@@ -10,20 +13,26 @@ from datetime import datetime, timedelta
 from langdetect import detect
 from deep_translator import GoogleTranslator
 nltk.download('stopwords')
-class MCACommentAnalyzer:
     def __init__(self):
         self.sentiment_model = pipeline(
             "sentiment-analysis",
-            model="distilbert-base-uncased-finetuned-sst-2-english"
         )
         self.summarizer = pipeline(
             "summarization",
-            model="sshleifer/distilbart-cnn-12-6"
         )
         self.stop_words = set(stopwords.words('english'))
     def translate_to_english(self, text):
         try:
             lang = detect(text)
@@ -33,6 +42,9 @@ class MCACommentAnalyzer:
         except:
             return text
     def map_sentiment(self, pred, text):
         text_lower = text.lower()
         violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
@@ -59,12 +71,15 @@ class MCACommentAnalyzer:
         else:
             return "Neutral"
     def process_comment(self, comment):
         translated_comment = self.translate_to_english(comment)
         pred = self.sentiment_model(translated_comment)[0]
         sentiment = self.map_sentiment(pred, translated_comment)
-        # Summary
         if len(translated_comment.split()) < 10:
             summary_text = " ".join(translated_comment.split()[:10])
         else:
@@ -85,6 +100,9 @@ class MCACommentAnalyzer:
         return sentiment, summary_text, keywords, top_keywords
     def process_comments(self, comments_list):
         sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
         start_date = datetime.now() - timedelta(days=30)
@@ -105,8 +123,10 @@ class MCACommentAnalyzer:
             "Top Keywords": top_keywords_list
         })
         df.sort_values(by='Timestamp', inplace=True, ascending=True)
         keyword_freq = pd.DataFrame(
             Counter(all_keywords).items(),
             columns=['Keyword', 'Frequency']
@@ -114,6 +134,9 @@ class MCACommentAnalyzer:
         return df, keyword_freq
     def generate_wordcloud(self, keyword_freq, filename=None):
         wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
         wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
@@ -123,3 +146,22 @@ class MCACommentAnalyzer:
         if filename:
             plt.savefig(filename, bbox_inches='tight')
         return plt

+# -----------------------------
+# mca_comment_analyzer.py (defines MCACommentAnalyzerLight)
+# -----------------------------
 import pandas as pd
 from transformers import pipeline
 from wordcloud import WordCloud
 from langdetect import detect
 from deep_translator import GoogleTranslator
+# Download stopwords once
 nltk.download('stopwords')
+class MCACommentAnalyzerLight:
     def __init__(self):
+        # Lightweight sentiment model
         self.sentiment_model = pipeline(
             "sentiment-analysis",
+            model="cardiffnlp/twitter-roberta-base-sentiment"
         )
+        # Lightweight summarizer
         self.summarizer = pipeline(
             "summarization",
+            model="t5-small"
         )
         self.stop_words = set(stopwords.words('english'))
+    # -----------------------------
+    # Translate to English if needed
+    # -----------------------------
     def translate_to_english(self, text):
         try:
             lang = detect(text)
         except:
             return text
+    # -----------------------------
+    # Rule-based sentiment mapping
+    # -----------------------------
     def map_sentiment(self, pred, text):
         text_lower = text.lower()
         violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
         else:
             return "Neutral"
+    # -----------------------------
+    # Process single comment
+    # -----------------------------
     def process_comment(self, comment):
         translated_comment = self.translate_to_english(comment)
         pred = self.sentiment_model(translated_comment)[0]
         sentiment = self.map_sentiment(pred, translated_comment)
+        # Summary: truncate short comments or use summarizer
         if len(translated_comment.split()) < 10:
             summary_text = " ".join(translated_comment.split()[:10])
         else:
         return sentiment, summary_text, keywords, top_keywords
+    # -----------------------------
+    # Process multiple comments
+    # -----------------------------
     def process_comments(self, comments_list):
         sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
         start_date = datetime.now() - timedelta(days=30)
             "Top Keywords": top_keywords_list
         })
+        # Sort by Timestamp
         df.sort_values(by='Timestamp', inplace=True, ascending=True)
+        # Keyword frequency table
         keyword_freq = pd.DataFrame(
             Counter(all_keywords).items(),
             columns=['Keyword', 'Frequency']
         return df, keyword_freq
+    # -----------------------------
+    # Generate WordCloud
+    # -----------------------------
     def generate_wordcloud(self, keyword_freq, filename=None):
         wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
         wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
         if filename:
             plt.savefig(filename, bbox_inches='tight')
         return plt
+# -----------------------------
+# Quick Test (Optional)
+# -----------------------------
+if __name__ == "__main__":
+    comments = [
+        "The draft is very clear and helpful for companies.",
+        "Section 5 is confusing and needs clarification.",
+        "It would be better if SMEs get some relief.",
+        "I recommend including more examples for clarity.",
+        "Section 12 violates the Companies Act rules.",
+        "यह टिप्पणी हिंदी में है।",  # Hindi comment example
+        "இந்த கருத்து தமிழில் உள்ளது."   # Tamil comment example
+    ]
+    analyzer = MCACommentAnalyzerLight()
+    df, keyword_freq = analyzer.process_comments(comments)
+    print(df)
+    analyzer.generate_wordcloud(keyword_freq)