Harshb11 committed on
Commit
79a013c
·
verified ·
1 Parent(s): 07e0627

Update mca_comment_analyzer.py

Browse files
Files changed (1) hide show
  1. mca_comment_analyzer.py +55 -17
mca_comment_analyzer.py CHANGED
@@ -1,4 +1,7 @@
 
 
1
  import pandas as pd
 
2
  from transformers import pipeline
3
  from wordcloud import WordCloud
4
  import matplotlib.pyplot as plt
@@ -10,21 +13,32 @@ from datetime import datetime, timedelta
10
  from langdetect import detect
11
  from deep_translator import GoogleTranslator
12
 
 
 
 
 
 
 
 
13
  nltk.download('stopwords', quiet=True)
 
14
 
15
- class MCACommentAnalyzerLight:
 
16
  def __init__(self):
 
 
17
  self.sentiment_model = pipeline(
18
  "sentiment-analysis",
19
- model="cardiffnlp/twitter-roberta-base-sentiment",
20
- device=-1
21
  )
22
  self.summarizer = pipeline(
23
  "summarization",
24
- model="sshleifer/distilbart-cnn-6-6",
25
- device=-1
26
  )
27
- self.stop_words = set(stopwords.words('english'))
28
 
29
  def translate_to_english(self, text):
30
  try:
@@ -52,9 +66,9 @@ class MCACommentAnalyzerLight:
52
  return "Negative"
53
 
54
  label = pred['label'].upper()
55
- if label in ["POSITIVE", "LABEL_2"]:
56
  return "Positive"
57
- elif label in ["NEGATIVE", "LABEL_0"]:
58
  return "Negative"
59
  else:
60
  return "Neutral"
@@ -69,15 +83,11 @@ class MCACommentAnalyzerLight:
69
  summary_text = " ".join(translated_comment.split()[:10])
70
  else:
71
  try:
72
- summary_text = self.summarizer(
73
- translated_comment,
74
- max_length=20,
75
- min_length=5,
76
- do_sample=False
77
- )[0]['summary_text']
78
  except:
79
  summary_text = translated_comment
80
 
 
81
  words = [w for w in translated_comment.lower().split() if w.isalpha() and w not in self.stop_words]
82
  keywords = list(Counter(words).keys())
83
  top_keywords = ", ".join(keywords[:3])
@@ -103,7 +113,6 @@ class MCACommentAnalyzerLight:
103
  "Sentiment": sentiments,
104
  "Top Keywords": top_keywords_list
105
  })
106
-
107
  df.sort_values(by='Timestamp', inplace=True, ascending=True)
108
 
109
  keyword_freq = pd.DataFrame(
@@ -115,10 +124,39 @@ class MCACommentAnalyzerLight:
115
 
116
  def generate_wordcloud(self, keyword_freq, filename=None):
117
  wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
118
- wc = WordCloud(width=600, height=300, background_color="white").generate_from_frequencies(wc_dict)
119
- plt.figure(figsize=(8,4))
120
  plt.imshow(wc, interpolation="bilinear")
121
  plt.axis("off")
122
  if filename:
123
  plt.savefig(filename, bbox_inches='tight')
124
  return plt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
  import pandas as pd
4
+ import torch
5
  from transformers import pipeline
6
  from wordcloud import WordCloud
7
  import matplotlib.pyplot as plt
 
13
  from langdetect import detect
14
  from deep_translator import GoogleTranslator
15
 
16
+ # ---- Config
17
+ st.set_option('browser.gatherUsageStats', False) # Disable usage stats
18
+ os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib" # Fix matplotlib cache warning
19
+
20
+ st.set_page_config(page_title="MCA Comment Analyzer", layout="wide")
21
+
22
+ # ---- NLTK setup
23
  nltk.download('stopwords', quiet=True)
24
+ STOPWORDS = set(stopwords.words('english'))
25
 
26
+ # ---- MCA Analyzer Class
27
+ class MCACommentAnalyzer:
28
  def __init__(self):
29
+ device = 0 if torch.cuda.is_available() else -1
30
+ print("Using device:", "GPU" if device==0 else "CPU")
31
  self.sentiment_model = pipeline(
32
  "sentiment-analysis",
33
+ model="distilbert-base-uncased-finetuned-sst-2-english",
34
+ device=device
35
  )
36
  self.summarizer = pipeline(
37
  "summarization",
38
+ model="sshleifer/distilbart-cnn-12-6",
39
+ device=device
40
  )
41
+ self.stop_words = STOPWORDS
42
 
43
  def translate_to_english(self, text):
44
  try:
 
66
  return "Negative"
67
 
68
  label = pred['label'].upper()
69
+ if label == "POSITIVE":
70
  return "Positive"
71
+ elif label == "NEGATIVE":
72
  return "Negative"
73
  else:
74
  return "Neutral"
 
83
  summary_text = " ".join(translated_comment.split()[:10])
84
  else:
85
  try:
86
+ summary_text = self.summarizer(translated_comment, max_length=30, min_length=5, do_sample=False)[0]['summary_text']
 
 
 
 
 
87
  except:
88
  summary_text = translated_comment
89
 
90
+ # Keywords
91
  words = [w for w in translated_comment.lower().split() if w.isalpha() and w not in self.stop_words]
92
  keywords = list(Counter(words).keys())
93
  top_keywords = ", ".join(keywords[:3])
 
113
  "Sentiment": sentiments,
114
  "Top Keywords": top_keywords_list
115
  })
 
116
  df.sort_values(by='Timestamp', inplace=True, ascending=True)
117
 
118
  keyword_freq = pd.DataFrame(
 
124
 
125
  def generate_wordcloud(self, keyword_freq, filename=None):
126
  wc_dict = dict(zip(keyword_freq['Keyword'], keyword_freq['Frequency']))
127
+ wc = WordCloud(width=800, height=400, background_color="white").generate_from_frequencies(wc_dict)
128
+ plt.figure(figsize=(10,5))
129
  plt.imshow(wc, interpolation="bilinear")
130
  plt.axis("off")
131
  if filename:
132
  plt.savefig(filename, bbox_inches='tight')
133
  return plt
134
+
135
+ # ---- Streamlit UI
136
+ st.title("📊 MCA eConsultation Comment Analyzer")
137
+ st.sidebar.header("Upload or Enter Comments")
138
+ upload_file = st.sidebar.file_uploader("Upload a text file with comments", type=["txt"])
139
+ manual_input = st.sidebar.text_area("Or enter comments (one per line):")
140
+
141
+ comments = []
142
+ if upload_file:
143
+ comments = upload_file.read().decode("utf-8").splitlines()
144
+ elif manual_input.strip():
145
+ comments = manual_input.strip().split("\n")
146
+
147
+ if st.sidebar.button("Analyze"):
148
+ if comments:
149
+ analyzer = MCACommentAnalyzer()
150
+ df, keyword_freq = analyzer.process_comments(comments)
151
+
152
+ st.subheader("📌 Analysis Results")
153
+ st.dataframe(df, use_container_width=True)
154
+
155
+ st.subheader("📊 Sentiment Distribution")
156
+ st.bar_chart(df["Sentiment"].value_counts())
157
+
158
+ st.subheader("☁️ Word Cloud")
159
+ plt_obj = analyzer.generate_wordcloud(keyword_freq)
160
+ st.pyplot(plt_obj)
161
+ else:
162
+ st.warning("⚠️ Please provide comments to analyze.")