Harshb11 committed on
Commit
2d8c6ff
·
verified ·
1 Parent(s): d785ac6

Update mca_comment_analyzer.py

Browse files
Files changed (1) hide show
  1. mca_comment_analyzer.py +44 -37
mca_comment_analyzer.py CHANGED
@@ -10,51 +10,45 @@ import nltk
10
  from nltk.corpus import stopwords
11
  import random
12
  from datetime import datetime, timedelta
13
- from langdetect import detect
14
- from deep_translator import GoogleTranslator
15
 
16
  # ---- Config
17
- st.set_option('browser.gatherUsageStats', False) # Disable usage stats
18
- os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib" # Fix matplotlib cache warning
19
 
20
- st.set_page_config(page_title="MCA Comment Analyzer", layout="wide")
21
 
22
- # ---- NLTK setup
23
  nltk.download('stopwords', quiet=True)
24
  STOPWORDS = set(stopwords.words('english'))
25
 
26
- # ---- MCA Analyzer Class
27
  class MCACommentAnalyzer:
28
  def __init__(self):
29
  device = 0 if torch.cuda.is_available() else -1
30
  print("Using device:", "GPU" if device==0 else "CPU")
 
 
31
  self.sentiment_model = pipeline(
32
  "sentiment-analysis",
33
  model="distilbert-base-uncased-finetuned-sst-2-english",
34
  device=device
35
  )
 
 
36
  self.summarizer = pipeline(
37
  "summarization",
38
- model="sshleifer/distilbart-cnn-12-6",
39
  device=device
40
  )
 
41
  self.stop_words = STOPWORDS
42
 
43
- def translate_to_english(self, text):
44
- try:
45
- lang = detect(text)
46
- if lang != "en":
47
- return GoogleTranslator(source='auto', target='en').translate(text)
48
- return text
49
- except:
50
- return text
51
-
52
  def map_sentiment(self, pred, text):
53
  text_lower = text.lower()
54
- violation_keywords = ["violation", "violates", "illegal", "non-compliant", "breach", "unlawful", "risk", "penalty"]
55
- suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if", "could", "need to"]
56
  positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]
57
- negative_keywords = ["confusing", "unclear", "bad", "problem", "needs clarification"]
58
 
59
  if any(w in text_lower for w in violation_keywords):
60
  return "Violation"
@@ -74,24 +68,20 @@ class MCACommentAnalyzer:
74
  return "Neutral"
75
 
76
  def process_comment(self, comment):
77
- translated_comment = self.translate_to_english(comment)
78
- pred = self.sentiment_model(translated_comment)[0]
79
- sentiment = self.map_sentiment(pred, translated_comment)
80
 
81
- # Summary
82
- if len(translated_comment.split()) < 10:
83
- summary_text = " ".join(translated_comment.split()[:10])
84
  else:
85
  try:
86
- summary_text = self.summarizer(translated_comment, max_length=30, min_length=5, do_sample=False)[0]['summary_text']
87
  except:
88
- summary_text = translated_comment
89
 
90
- # Keywords
91
- words = [w for w in translated_comment.lower().split() if w.isalpha() and w not in self.stop_words]
92
  keywords = list(Counter(words).keys())
93
  top_keywords = ", ".join(keywords[:3])
94
-
95
  return sentiment, summary_text, keywords, top_keywords
96
 
97
  def process_comments(self, comments_list):
@@ -104,7 +94,7 @@ class MCACommentAnalyzer:
104
  summaries.append(summary)
105
  all_keywords.extend(keywords)
106
  top_keywords_list.append(top_kw)
107
- timestamps.append(start_date + timedelta(days=random.randint(0, 30)))
108
 
109
  df = pd.DataFrame({
110
  "Timestamp": timestamps,
@@ -133,14 +123,31 @@ class MCACommentAnalyzer:
133
  return plt
134
 
135
  # ---- Streamlit UI
136
- st.title("📊 MCA eConsultation Comment Analyzer")
137
  st.sidebar.header("Upload or Enter Comments")
138
- upload_file = st.sidebar.file_uploader("Upload a text file with comments", type=["txt"])
139
- manual_input = st.sidebar.text_area("Or enter comments (one per line):")
 
140
 
141
  comments = []
142
  if upload_file:
143
- comments = upload_file.read().decode("utf-8").splitlines()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  elif manual_input.strip():
145
  comments = manual_input.strip().split("\n")
146
 
@@ -159,4 +166,4 @@ if st.sidebar.button("Analyze"):
159
  plt_obj = analyzer.generate_wordcloud(keyword_freq)
160
  st.pyplot(plt_obj)
161
  else:
162
- st.warning("⚠️ Please provide comments to analyze.")
 
10
  from nltk.corpus import stopwords
11
  import random
12
  from datetime import datetime, timedelta
 
 
13
 
14
  # ---- Config
15
+ st.set_option('browser.gatherUsageStats', False)
16
+ os.environ["MPLCONFIGDIR"] = "/tmp/.matplotlib"
17
 
18
+ st.set_page_config(page_title="MCA Demo Comment Analyzer", layout="wide")
19
 
20
+ # ---- NLTK
21
  nltk.download('stopwords', quiet=True)
22
  STOPWORDS = set(stopwords.words('english'))
23
 
24
+ # ---- Lightweight MCA Analyzer
25
  class MCACommentAnalyzer:
26
  def __init__(self):
27
  device = 0 if torch.cuda.is_available() else -1
28
  print("Using device:", "GPU" if device==0 else "CPU")
29
+
30
+ # Lightweight sentiment model
31
  self.sentiment_model = pipeline(
32
  "sentiment-analysis",
33
  model="distilbert-base-uncased-finetuned-sst-2-english",
34
  device=device
35
  )
36
+
37
+ # Lightweight summarizer
38
  self.summarizer = pipeline(
39
  "summarization",
40
+ model="t5-small",
41
  device=device
42
  )
43
+
44
  self.stop_words = STOPWORDS
45
 
 
 
 
 
 
 
 
 
 
46
  def map_sentiment(self, pred, text):
47
  text_lower = text.lower()
48
+ violation_keywords = ["violation", "violates", "illegal", "non-compliant"]
49
+ suggestion_keywords = ["should", "recommend", "suggest", "advise", "better if"]
50
  positive_keywords = ["clear", "helpful", "good", "appreciate", "support"]
51
+ negative_keywords = ["confusing", "unclear", "bad", "problem"]
52
 
53
  if any(w in text_lower for w in violation_keywords):
54
  return "Violation"
 
68
  return "Neutral"
69
 
70
  def process_comment(self, comment):
71
+ pred = self.sentiment_model(comment)[0]
72
+ sentiment = self.map_sentiment(pred, comment)
 
73
 
74
+ if len(comment.split()) < 10:
75
+ summary_text = " ".join(comment.split()[:10])
 
76
  else:
77
  try:
78
+ summary_text = self.summarizer(comment, max_length=30, min_length=5, do_sample=False)[0]['summary_text']
79
  except:
80
+ summary_text = comment
81
 
82
+ words = [w for w in comment.lower().split() if w.isalpha() and w not in self.stop_words]
 
83
  keywords = list(Counter(words).keys())
84
  top_keywords = ", ".join(keywords[:3])
 
85
  return sentiment, summary_text, keywords, top_keywords
86
 
87
  def process_comments(self, comments_list):
 
94
  summaries.append(summary)
95
  all_keywords.extend(keywords)
96
  top_keywords_list.append(top_kw)
97
+ timestamps.append(start_date + timedelta(days=random.randint(0,30)))
98
 
99
  df = pd.DataFrame({
100
  "Timestamp": timestamps,
 
123
  return plt
124
 
125
  # ---- Streamlit UI
126
+ st.title("📊 MCA Demo Comment Analyzer")
127
  st.sidebar.header("Upload or Enter Comments")
128
+
129
+ upload_file = st.sidebar.file_uploader("Upload CSV/Excel/TXT", type=["csv","xlsx","txt"])
130
+ manual_input = st.sidebar.text_area("Or enter comments manually (one per line)")
131
 
132
  comments = []
133
  if upload_file:
134
+ try:
135
+ if upload_file.name.endswith(".csv"):
136
+ df_file = pd.read_csv(upload_file)
137
+ if 'comment' in df_file.columns:
138
+ comments = df_file['comment'].astype(str).tolist()
139
+ else:
140
+ comments = df_file.iloc[:,0].astype(str).tolist()
141
+ elif upload_file.name.endswith(".xlsx"):
142
+ df_file = pd.read_excel(upload_file)
143
+ if 'comment' in df_file.columns:
144
+ comments = df_file['comment'].astype(str).tolist()
145
+ else:
146
+ comments = df_file.iloc[:,0].astype(str).tolist()
147
+ else:
148
+ comments = upload_file.read().decode("utf-8").splitlines()
149
+ except Exception as e:
150
+ st.error(f"File format not supported or corrupted. {e}")
151
  elif manual_input.strip():
152
  comments = manual_input.strip().split("\n")
153
 
 
166
  plt_obj = analyzer.generate_wordcloud(keyword_freq)
167
  st.pyplot(plt_obj)
168
  else:
169
+ st.warning("⚠️ Provide comments manually or upload a supported file.")