gopalaKrishna1236 committed on
Commit
d5a1755
·
verified ·
1 Parent(s): 0300507

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -31
app.py CHANGED
@@ -4,24 +4,31 @@ import io
4
  import re
5
  import json
6
  import uuid
 
 
7
  import numpy as np
8
  import pandas as pd
 
 
 
 
9
  import matplotlib.pyplot as plt
10
 
11
  import gradio as gr
12
 
13
- # NLTK setup
 
 
14
  import nltk
15
  from nltk.corpus import stopwords
16
  from nltk.sentiment import SentimentIntensityAnalyzer
17
 
18
- # One-time downloads (safe to call repeatedly)
19
  def _ensure_nltk():
 
20
  try:
21
  nltk.data.find("tokenizers/punkt")
22
  except LookupError:
23
  nltk.download("punkt", quiet=True)
24
- # Newer NLTK sometimes references 'punkt_tab'; try best-effort
25
  try:
26
  nltk.data.find("tokenizers/punkt_tab")
27
  except LookupError:
@@ -42,13 +49,29 @@ _ensure_nltk()
42
 
43
  try:
44
  EN_STOPWORDS = set(stopwords.words("english"))
45
- except LookupError:
46
- # If stopwords still missing, fallback empty set
47
  EN_STOPWORDS = set()
48
 
49
- SIA = SentimentIntensityAnalyzer()
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- # Keyword category mapping (editable)
 
 
 
 
52
  CATEGORY_MAP = {
53
  "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
54
  "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
@@ -58,10 +81,15 @@ CATEGORY_MAP = {
58
  "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
59
  "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
60
  }
61
-
62
  DEFAULT_KEYWORDS = sorted(list({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident","theft","damage"}))
63
 
64
- TOKEN_PATTERN = re.compile(r"[A-Za-z']+") # capture words with letters and apostrophes
 
 
 
 
 
 
65
 
66
  def tokenize_text(text: str):
67
  if not isinstance(text, str):
@@ -84,19 +112,21 @@ def count_keywords(token_lists, top_n=10, custom_keywords=None):
84
  return counter.most_common(top_n)
85
 
86
  def sentiments_for_texts(texts):
87
- labels = []
88
- compound_scores = []
89
  for t in texts:
90
- vs = SIA.polarity_scores("" if pd.isna(t) else str(t))
91
- compound = vs["compound"]
92
- compound_scores.append(compound)
 
 
 
93
  if compound >= 0.05:
94
  labels.append("Positive")
95
  elif compound <= -0.05:
96
  labels.append("Negative")
97
  else:
98
  labels.append("Neutral")
99
- return labels, compound_scores
100
 
101
  def assign_categories(token_lists):
102
  assigned = []
@@ -174,24 +204,44 @@ def trend_chart_by_date(dates, compounds):
174
  return _save_fig_to_path(fig, "sentiment_trend")
175
 
176
  def read_csv_safe(path):
177
- # Try UTF-8 first, then fallback to latin-1 for messy exports
178
- try:
179
- return pd.read_csv(path)
180
- except UnicodeDecodeError:
181
  try:
182
- return pd.read_csv(path, encoding="latin-1")
 
 
183
  except Exception as e:
184
- raise e
 
185
 
186
- def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
 
187
  if text_col not in df.columns:
188
- raise gr.Error(f"Selected text column '{text_col}' not found in dataset.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  custom_keywords = None
190
  if custom_keywords_text:
191
  parts = re.split(r"[,\\n]+", custom_keywords_text)
192
  custom_keywords = [p.strip().lower() for p in parts if p.strip()]
 
193
  token_lists = df[text_col].apply(tokenize_text).tolist()
194
- freq_pairs = count_keywords(token_lists, top_n=top_n, custom_keywords=(custom_keywords if use_custom_only else None))
 
 
195
  sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
196
  categories = assign_categories(token_lists)
197
 
@@ -256,6 +306,7 @@ with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo
256
  top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
257
  use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
258
  custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
 
259
  run_btn = gr.Button("Run Analysis 🚀", variant="primary")
260
  with gr.Column():
261
  bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="filepath")
@@ -264,6 +315,7 @@ with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo
264
  trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="filepath")
265
  table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
266
  report = gr.Textbox(label="Auto-generated Report", lines=10)
 
267
  export = gr.File(label="Download Enriched CSV")
268
 
269
  def on_file_upload(fileobj):
@@ -282,27 +334,41 @@ with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo
282
 
283
  data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])
284
 
285
- def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text):
286
  if fileobj is None:
287
  raise gr.Error("Please upload a CSV file.")
288
  try:
289
  df = read_csv_safe(fileobj.name)
 
 
 
 
 
 
 
 
 
 
 
290
  bar_path, cat_path, pie_path, trend_path, summary_df, report_text, csv_bytes = analyze(
291
  df, text_column, date_column, int(topn), custom_only, custom_text
292
  )
293
  export_path = "enriched_claims.csv"
294
  with open(export_path, "wb") as f:
295
  f.write(csv_bytes)
296
- return bar_path, cat_path, pie_path, trend_path, summary_df, report_text, export_path
297
  except Exception as e:
298
- # Surface full message to the UI
299
- raise gr.Error(f\"Error during analysis: {type(e).__name__}: {e}\")
 
300
 
301
  run_btn.click(
302
  run_pipeline,
303
- inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text],
304
- outputs=[bar_img, cat_img, pie_img, trend_img, table, report, export],
305
  )
306
 
307
  if __name__ == "__main__":
308
- demo.launch()
 
 
 
4
  import re
5
  import json
6
  import uuid
7
+ import sys
8
+ import traceback
9
  import numpy as np
10
  import pandas as pd
11
+
12
+ # Force headless backend before importing pyplot
13
+ import matplotlib
14
+ matplotlib.use("Agg")
15
  import matplotlib.pyplot as plt
16
 
17
  import gradio as gr
18
 
19
+ # -------------------------
20
+ # NLTK + VADER
21
+ # -------------------------
22
  import nltk
23
  from nltk.corpus import stopwords
24
  from nltk.sentiment import SentimentIntensityAnalyzer
25
 
 
26
  def _ensure_nltk():
27
+ # Quiet downloads to avoid noisy logs
28
  try:
29
  nltk.data.find("tokenizers/punkt")
30
  except LookupError:
31
  nltk.download("punkt", quiet=True)
 
32
  try:
33
  nltk.data.find("tokenizers/punkt_tab")
34
  except LookupError:
 
49
 
50
  try:
51
  EN_STOPWORDS = set(stopwords.words("english"))
52
+ except Exception:
 
53
  EN_STOPWORDS = set()
54
 
55
def _init_sia():
    """Construct the VADER SentimentIntensityAnalyzer, degrading gracefully.

    Order of attempts:
      1. Build the analyzer directly (lexicon already present).
      2. Download ``vader_lexicon`` quietly and retry once.
      3. Fall back to a stub that scores everything as neutral, so the
         rest of the pipeline keeps working without sentiment data.
    """
    try:
        return SentimentIntensityAnalyzer()
    except Exception:
        pass
    # Lexicon is probably missing — fetch it and retry once.
    try:
        nltk.download("vader_lexicon", quiet=True)
        return SentimentIntensityAnalyzer()
    except Exception:
        class _NeutralAnalyzer:
            # Minimal stand-in exposing the one method the app calls.
            def polarity_scores(self, t):
                return {"compound": 0.0}
        return _NeutralAnalyzer()
69
 
70
+ SIA = _init_sia()
71
+
72
+ # -------------------------
73
+ # Config
74
+ # -------------------------
75
  CATEGORY_MAP = {
76
  "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
77
  "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
 
81
  "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
82
  "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
83
  }
 
84
  DEFAULT_KEYWORDS = sorted(list({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident","theft","damage"}))
85
 
86
+ TOKEN_PATTERN = re.compile(r"[A-Za-z']+")
87
+
88
+ # -------------------------
89
+ # Utils
90
+ # -------------------------
91
def debug(msg):
    """Write *msg* to stderr immediately (flushed) for Spaces log visibility."""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()
93
 
94
  def tokenize_text(text: str):
95
  if not isinstance(text, str):
 
112
  return counter.most_common(top_n)
113
 
114
def sentiments_for_texts(texts):
    """Score each text with VADER and bucket it by compound score.

    Returns a pair ``(labels, compounds)`` where labels use the standard
    VADER thresholds: >= 0.05 Positive, <= -0.05 Negative, else Neutral.
    Any per-item failure (odd input type, analyzer error) degrades to a
    neutral 0.0 score rather than aborting the whole batch.
    """
    labels = []
    compounds = []
    for raw in texts:
        try:
            # NaN-safe coercion kept inside the try: pd.isna on odd
            # inputs can itself raise, and we want neutral fallback then too.
            cleaned = "" if pd.isna(raw) else str(raw)
            score = float(SIA.polarity_scores(cleaned).get("compound", 0.0))
        except Exception:
            score = 0.0
        compounds.append(score)
        if score >= 0.05:
            label = "Positive"
        elif score <= -0.05:
            label = "Negative"
        else:
            label = "Neutral"
        labels.append(label)
    return labels, compounds
130
 
131
  def assign_categories(token_lists):
132
  assigned = []
 
204
  return _save_fig_to_path(fig, "sentiment_trend")
205
 
206
def read_csv_safe(path):
    """Read a CSV robustly across common export encodings.

    Tries pandas defaults first, then utf-8, utf-8-sig (Excel BOM), and
    finally latin-1. If every attempt fails, the last exception is re-raised
    so the caller sees a concrete error.
    """
    attempts = (None, "utf-8", "utf-8-sig", "latin-1")
    failure = None
    for encoding in attempts:
        try:
            if encoding is None:
                return pd.read_csv(path)
            return pd.read_csv(path, encoding=encoding)
        except Exception as exc:
            failure = exc
    raise failure
217
 
218
def validate_schema(df, text_col, date_col):
    """Validate that the selected columns exist and are usable.

    Checks:
      - ``text_col`` exists and contains at least one non-empty value;
      - ``date_col`` (if supplied) exists.

    Raises gr.Error listing every problem found; returns None when valid.
    """
    problems = []
    if text_col not in df.columns:
        problems.append(f"- Text column '{text_col}' not found.")
    else:
        # Drop real NaNs before stringifying. (Previously NaNs were cast to
        # the string "nan" and then string-replaced, which also silently
        # discarded genuine user text equal to "nan".)
        non_empty = df[text_col].dropna().astype(str).str.strip()
        if (non_empty == "").all():
            problems.append(f"- Text column '{text_col}' has no non-empty values.")
    # date_col is optional; only validate it when the user picked one.
    if date_col and date_col not in df.columns:
        problems.append(f"- Date column '{date_col}' not found.")
    if problems:
        raise gr.Error("Schema check failed:\n" + "\n".join(problems))
232
+
233
+ def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
234
+ validate_schema(df, text_col, date_col)
235
+
236
  custom_keywords = None
237
  if custom_keywords_text:
238
  parts = re.split(r"[,\\n]+", custom_keywords_text)
239
  custom_keywords = [p.strip().lower() for p in parts if p.strip()]
240
+
241
  token_lists = df[text_col].apply(tokenize_text).tolist()
242
+ freq_pairs = count_keywords(
243
+ token_lists, top_n=top_n, custom_keywords=(custom_keywords if use_custom_only else None)
244
+ )
245
  sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
246
  categories = assign_categories(token_lists)
247
 
 
306
  top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
307
  use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
308
  custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
309
+ debug_mode = gr.Checkbox(label="Debug mode (show schema & sample rows)", value=False)
310
  run_btn = gr.Button("Run Analysis 🚀", variant="primary")
311
  with gr.Column():
312
  bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="filepath")
 
315
  trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="filepath")
316
  table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
317
  report = gr.Textbox(label="Auto-generated Report", lines=10)
318
+ debug_out = gr.Textbox(label="Debug info", lines=8, interactive=False)
319
  export = gr.File(label="Download Enriched CSV")
320
 
321
  def on_file_upload(fileobj):
 
334
 
335
  data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])
336
 
337
def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text, dbg):
    """Gradio click handler: load the uploaded CSV, analyze it, return outputs.

    Returns the four chart paths, summary dataframe, report text, debug text,
    and the path of the enriched CSV export. Raises gr.Error with a concise
    message on any failure, logging the full traceback to stderr.
    """
    if fileobj is None:
        raise gr.Error("Please upload a CSV file.")
    try:
        df = read_csv_safe(fileobj.name)
        debug_text = ""
        if dbg:
            # Show schema and a small preview so users can spot bad columns.
            debug_text = "\n".join(
                [
                    "Columns & dtypes:",
                    str(df.dtypes),
                    "",
                    "Sample rows:",
                    str(df.head(5)),
                ]
            )
        results = analyze(df, text_column, date_column, int(topn), custom_only, custom_text)
        bar_path, cat_path, pie_path, trend_path, summary_df, report_text, csv_bytes = results
        export_path = "enriched_claims.csv"
        with open(export_path, "wb") as out:
            out.write(csv_bytes)
        return (
            bar_path,
            cat_path,
            pie_path,
            trend_path,
            summary_df,
            report_text,
            debug_text,
            export_path,
        )
    except Exception as e:
        # Full traceback to stderr for server logs; concise message to the UI.
        tb = traceback.format_exc()
        debug(f"[ERROR] {type(e).__name__}: {e}\n{tb}")
        raise gr.Error(f"RuntimeError: {type(e).__name__}: {e}")
364
 
365
  run_btn.click(
366
  run_pipeline,
367
+ inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text, debug_mode],
368
+ outputs=[bar_img, cat_img, pie_img, trend_img, table, report, debug_out, export],
369
  )
370
 
371
  if __name__ == "__main__":
372
+ # Spaces-friendly launch
373
+ port = int(os.environ.get("PORT", "7860"))
374
+ demo.launch(server_name="0.0.0.0", server_port=port)