Spaces:

Harshb11
/

mca_comment_analyzer

Sleeping

App Files Files Community

Harshb11 committed on Sep 14, 2025

Commit

5937b4b

verified ·

1 Parent(s): f73b119

Update mca_comment_analyzer.py

Browse files

Files changed (1) hide show

mca_comment_analyzer.py +15 -58

mca_comment_analyzer.py CHANGED Viewed

@@ -1,6 +1,3 @@
-# -----------------------------
-# MCACommentAnalyzerLight.py
-# -----------------------------
 import pandas as pd
 from transformers import pipeline
 from wordcloud import WordCloud
@@ -13,26 +10,22 @@ from datetime import datetime, timedelta
 from langdetect import detect
 from deep_translator import GoogleTranslator
-# Download stopwords once
-nltk.download('stopwords')
 class MCACommentAnalyzerLight:
     def __init__(self):
-        # Lightweight sentiment model
         self.sentiment_model = pipeline(
             "sentiment-analysis",
-            model="cardiffnlp/twitter-roberta-base-sentiment"
         )
-        # Lightweight summarizer
         self.summarizer = pipeline(
             "summarization",
-            model="t5-small"
         )
         self.stop_words = set(stopwords.words('english'))
-    # -----------------------------
-    # Translate to English if needed
-    # -----------------------------
     def translate_to_english(self, text):
         try:
             lang = detect(text)
@@ -42,67 +35,55 @@ class MCACommentAnalyzerLight:
         except:
             return text
-    # -----------------------------
-    # Rule-based sentiment mapping
-    # -----------------------------
     def map_sentiment(self, pred, text):
         text_lower = text.lower()
         violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
         if any(w in text_lower for w in violation_keywords):
             return "Violation"
-        suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if", "could", "need to"]
         if any(w in text_lower for w in suggestion_keywords):
             return "Suggestion"
-        positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]
         if any(w in text_lower for w in positive_keywords):
             return "Positive"
-        negative_keywords = ["confusing", "unclear", "bad", "problem", "needs clarification"]
         if any(w in text_lower for w in negative_keywords):
             return "Negative"
         label = pred['label'].upper()
-        if label == "POSITIVE":
             return "Positive"
-        elif label == "NEGATIVE":
             return "Negative"
         else:
             return "Neutral"
-    # -----------------------------
-    # Process single comment
-    # -----------------------------
     def process_comment(self, comment):
         translated_comment = self.translate_to_english(comment)
         pred = self.sentiment_model(translated_comment)[0]
         sentiment = self.map_sentiment(pred, translated_comment)
-        # Summary: truncate short comments or use summarizer
         if len(translated_comment.split()) < 10:
             summary_text = " ".join(translated_comment.split()[:10])
         else:
             try:
                 summary_text = self.summarizer(
                     translated_comment,
-                    max_length=30,
                     min_length=5,
                     do_sample=False
                 )[0]['summary_text']
             except:
                 summary_text = translated_comment
-        # Keywords
         words = [w for w in translated_comment.lower().split() if w.isalpha() and w not in self.stop_words]
         keywords = list(Counter(words).keys())
         top_keywords = ", ".join(keywords[:3])
         return sentiment, summary_text, keywords, top_keywords
-    # -----------------------------
-    # Process multiple comments
-    # -----------------------------
     def process_comments(self, comments_list):
         sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
         start_date = datetime.now() - timedelta(days=30)
@@ -123,10 +104,8 @@ class MCACommentAnalyzerLight:
             "Top Keywords": top_keywords_list
         })
-        # Sort by Timestamp
         df.sort_values(by='Timestamp', inplace=True, ascending=True)
-        # Keyword frequency table
         keyword_freq = pd.DataFrame(
             Counter(all_keywords).items(),
             columns=['Keyword', 'Frequency']
@@ -134,34 +113,12 @@ class MCACommentAnalyzerLight:
         return df, keyword_freq
-    # -----------------------------
-    # Generate WordCloud
-    # -----------------------------
     def generate_wordcloud(self, keyword_freq, filename=None):
         wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
-        wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
-        plt.figure(figsize=(10,5))
         plt.imshow(wc, interpolation="bilinear")
         plt.axis("off")
         if filename:
             plt.savefig(filename, bbox_inches='tight')
         return plt
-# -----------------------------
-# Quick Test (Optional)
-# -----------------------------
-if __name__ == "__main__":
-    comments = [
-        "The draft is very clear and helpful for companies.",
-        "Section 5 is confusing and needs clarification.",
-        "It would be better if SMEs get some relief.",
-        "I recommend including more examples for clarity.",
-        "Section 12 violates the Companies Act rules.",
-        "यह टिप्पणी हिंदी में है।",  # Hindi comment example
-        "இந்த கருத்து தமிழில் உள்ளது."   # Tamil comment example
-    ]
-    analyzer = MCACommentAnalyzerLight()
-    df, keyword_freq = analyzer.process_comments(comments)
-    print(df)
-    analyzer.generate_wordcloud(keyword_freq)

 import pandas as pd
 from transformers import pipeline
 from wordcloud import WordCloud
 from langdetect import detect
 from deep_translator import GoogleTranslator
+nltk.download('stopwords', quiet=True)
 class MCACommentAnalyzerLight:
     def __init__(self):
         """Load the two Hugging Face pipelines and the English stop-word set.

         Sets three attributes:
           * ``sentiment_model`` - a "sentiment-analysis" pipeline,
           * ``summarizer``      - a "summarization" pipeline,
           * ``stop_words``      - set of NLTK English stop words.

         Both pipelines are created with ``device=-1``.
         """
         # NOTE(review): device=-1 is the transformers convention for CPU -
         # confirm against the pipeline documentation for the pinned version.
         device_index = -1
         sentiment_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
         summary_checkpoint = "sshleifer/distilbart-cnn-6-6"

         self.sentiment_model = pipeline(
             "sentiment-analysis",
             model=sentiment_checkpoint,
             device=device_index,
         )
         self.summarizer = pipeline(
             "summarization",
             model=summary_checkpoint,
             device=device_index,
         )

         english_stop_words = stopwords.words('english')
         self.stop_words = set(english_stop_words)
     def translate_to_english(self, text):
         try:
             lang = detect(text)
         except:
             return text
     def map_sentiment(self, pred, text):
         """Map a comment to one of the analyzer's sentiment categories.

         Keyword rules are tried first, in the priority order
         Violation > Suggestion > Positive > Negative. The model prediction
         is consulted only when no keyword rule fires.

         Keywords are matched at the start of a word instead of as raw
         substrings. With plain substring matching, "unclear" contained the
         positive keyword "clear" (checked before the negative list, so
         "unclear" could never be reported as Negative) and "asterisk"
         contained "risk". Matching only at a word start avoids both while
         still accepting inflected forms such as "risks" or "recommended".

         Args:
             pred: Prediction dict from the sentiment pipeline; only its
                 ``'label'`` entry is read (compared case-insensitively).
             text: The comment text to classify.

         Returns:
             One of "Violation", "Suggestion", "Positive", "Negative" or
             "Neutral".
         """
         # Local import: the file-level import block is not fully visible here.
         import re

         # Order matters: the first category whose keyword matches wins.
         keyword_rules = (
             ("Violation", ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]),
             ("Suggestion", ["should", "recommend", "suggest", "advise", "better if", "could", "need to"]),
             ("Positive", ["clear", "helpful", "good", "appreciate", "support"]),
             ("Negative", ["confusing", "unclear", "bad", "problem", "needs clarification"]),
         )

         text_lower = text.lower()
         for category, keywords in keyword_rules:
             # \b anchors each keyword to a word start; no trailing anchor so
             # suffixes ("risks", "suggested") keep matching.
             pattern = r"\b(?:" + "|".join(re.escape(k) for k in keywords) + ")"
             if re.search(pattern, text_lower):
                 return category

         # Fall back to the model label; both the POSITIVE/NEGATIVE and the
         # LABEL_2/LABEL_0 spellings are accepted, anything else is Neutral.
         label = pred['label'].upper()
         if label in ("POSITIVE", "LABEL_2"):
             return "Positive"
         if label in ("NEGATIVE", "LABEL_0"):
             return "Negative"
         return "Neutral"
     def process_comment(self, comment):
         """Analyze a single comment: sentiment, short summary and keywords.

         Steps:
           1. Pass the comment through ``self.translate_to_english``.
           2. Run ``self.sentiment_model`` on the result and map the first
              prediction to a category with ``self.map_sentiment``.
           3. Summarize: comments shorter than 10 whitespace-separated words
              are returned as-is (whitespace-normalized); longer ones go
              through ``self.summarizer``. If the summarizer raises, the
              full translated text is used instead (best effort).
           4. Extract keywords: lower-cased words, surrounding punctuation
              stripped, purely alphabetic, not in ``self.stop_words``.

         Args:
             comment: The raw comment text.

         Returns:
             A tuple ``(sentiment, summary_text, keywords, top_keywords)``
             where ``keywords`` is the list of unique keywords in first-seen
             order and ``top_keywords`` is a ", "-joined string of the (up
             to) three most frequent keywords, ties in first-seen order.
         """
         # Local import: the file-level import block is not fully visible here.
         import string

         translated_comment = self.translate_to_english(comment)
         pred = self.sentiment_model(translated_comment)[0]
         sentiment = self.map_sentiment(pred, translated_comment)

         # Summary
         tokens = translated_comment.split()
         if len(tokens) < 10:
             summary_text = " ".join(tokens)
         else:
             try:
                 summary_text = self.summarizer(
                     translated_comment,
                     max_length=20,
                     min_length=5,
                     do_sample=False
                 )[0]['summary_text']
             except Exception:
                 # Deliberate best effort: keep the whole comment rather than
                 # fail the analysis. A bare ``except`` would also swallow
                 # KeyboardInterrupt/SystemExit, so it is narrowed here.
                 summary_text = translated_comment

         # Strip surrounding punctuation first; otherwise tokens such as
         # "helpful." or "clear," fail isalpha() and are silently dropped.
         stripped = (w.strip(string.punctuation) for w in translated_comment.lower().split())
         words = [w for w in stripped if w.isalpha() and w not in self.stop_words]

         counts = Counter(words)
         keywords = list(counts)
         # Rank by frequency so "top" means most frequent, not merely first seen.
         top_keywords = ", ".join(word for word, _ in counts.most_common(3))
         return sentiment, summary_text, keywords, top_keywords
     def process_comments(self, comments_list):
         sentiments, summaries, all_keywords, top_keywords_list, timestamps = [], [], [], [], []
         start_date = datetime.now() - timedelta(days=30)
             "Top Keywords": top_keywords_list
         })
         df.sort_values(by='Timestamp', inplace=True, ascending=True)
         keyword_freq = pd.DataFrame(
             Counter(all_keywords).items(),
             columns=['Keyword', 'Frequency']
         return df, keyword_freq
     def generate_wordcloud(self, keyword_freq, filename=None):
         """Draw a word cloud from a keyword-frequency table.

         Args:
             keyword_freq: Table with 'Keyword' and 'Frequency' columns; each
                 keyword is sized according to its frequency.
             filename: Optional path; when given, the figure is also saved
                 there with a tight bounding box.

         Returns:
             The ``plt`` module, with the word-cloud figure as the current
             figure.
         """
         frequencies = {
             keyword: count
             for keyword, count in zip(keyword_freq['Keyword'], keyword_freq['Frequency'])
         }

         cloud = WordCloud(width=600, height=300, background_color="white")
         rendered = cloud.generate_from_frequencies(frequencies)

         plt.figure(figsize=(8, 4))
         plt.imshow(rendered, interpolation="bilinear")
         plt.axis("off")

         if not filename:
             return plt
         plt.savefig(filename, bbox_inches='tight')
         return plt