Upload app.py
app.py CHANGED
@@ -4,24 +4,31 @@ import io
 import re
 import json
 import uuid
+import sys
+import traceback
 import numpy as np
 import pandas as pd
+
+# Force headless backend before importing pyplot
+import matplotlib
+matplotlib.use("Agg")
 import matplotlib.pyplot as plt

 import gradio as gr

-#
+# -------------------------
+# NLTK + VADER
+# -------------------------
 import nltk
 from nltk.corpus import stopwords
 from nltk.sentiment import SentimentIntensityAnalyzer

-# One-time downloads (safe to call repeatedly)
 def _ensure_nltk():
+    # Quiet downloads to avoid noisy logs
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt", quiet=True)
-    # Newer NLTK sometimes references 'punkt_tab'; try best-effort
     try:
         nltk.data.find("tokenizers/punkt_tab")
     except LookupError:
@@ -42,13 +49,29 @@ _ensure_nltk()

 try:
     EN_STOPWORDS = set(stopwords.words("english"))
-except
-    # If stopwords still missing, fallback empty set
+except Exception:
     EN_STOPWORDS = set()

-
+def _init_sia():
+    try:
+        return SentimentIntensityAnalyzer()
+    except Exception:
+        # Try re-downloading lexicon then retry
+        try:
+            nltk.download("vader_lexicon", quiet=True)
+            return SentimentIntensityAnalyzer()
+        except Exception:
+            # Fallback dummy analyzer
+            class _Dummy:
+                def polarity_scores(self, t):
+                    return {"compound": 0.0}
+            return _Dummy()

-
+SIA = _init_sia()
+
+# -------------------------
+# Config
+# -------------------------
 CATEGORY_MAP = {
     "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
     "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
@@ -58,10 +81,15 @@ CATEGORY_MAP = {
     "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
     "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
 }
-
 DEFAULT_KEYWORDS = sorted(list({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident","theft","damage"}))

-TOKEN_PATTERN = re.compile(r"[A-Za-z']+")
+TOKEN_PATTERN = re.compile(r"[A-Za-z']+")
+
+# -------------------------
+# Utils
+# -------------------------
+def debug(msg):
+    print(msg, file=sys.stderr, flush=True)

 def tokenize_text(text: str):
     if not isinstance(text, str):
@@ -84,19 +112,21 @@ def count_keywords(token_lists, top_n=10, custom_keywords=None):
     return counter.most_common(top_n)

 def sentiments_for_texts(texts):
-    labels = []
-    compound_scores = []
+    labels, compounds = [], []
     for t in texts:
-
-
-
+        try:
+            vs = SIA.polarity_scores("" if pd.isna(t) else str(t))
+            compound = float(vs.get("compound", 0.0))
+        except Exception:
+            compound = 0.0
+        compounds.append(compound)
         if compound >= 0.05:
             labels.append("Positive")
         elif compound <= -0.05:
             labels.append("Negative")
         else:
             labels.append("Neutral")
-    return labels, compound_scores
+    return labels, compounds

 def assign_categories(token_lists):
     assigned = []
@@ -174,24 +204,44 @@ def trend_chart_by_date(dates, compounds):
     return _save_fig_to_path(fig, "sentiment_trend")

 def read_csv_safe(path):
-    # Try UTF-8 first, then
-
-
-    except UnicodeDecodeError:
+    # Try UTF-8 first, then fallbacks
+    last_err = None
+    for enc in [None, "utf-8", "utf-8-sig", "latin-1"]:
         try:
-
+            if enc is None:
+                return pd.read_csv(path)
+            return pd.read_csv(path, encoding=enc)
         except Exception as e:
-
+            last_err = e
+    raise last_err

-def
+def validate_schema(df, text_col, date_col):
+    problems = []
     if text_col not in df.columns:
-
+        problems.append(f"- Text column '{text_col}' not found.")
+    else:
+        # Ensure there is at least one non-empty string
+        non_empty = df[text_col].astype(str).str.strip().replace({"nan": ""}).astype(str)
+        if (non_empty == "").all():
+            problems.append(f"- Text column '{text_col}' has no non-empty values.")
+    if date_col:
+        if date_col not in df.columns:
+            problems.append(f"- Date column '{date_col}' not found.")
+    if problems:
+        raise gr.Error("Schema check failed:\n" + "\n".join(problems))
+
+def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
+    validate_schema(df, text_col, date_col)
+
     custom_keywords = None
     if custom_keywords_text:
         parts = re.split(r"[,\n]+", custom_keywords_text)
         custom_keywords = [p.strip().lower() for p in parts if p.strip()]
+
     token_lists = df[text_col].apply(tokenize_text).tolist()
-    freq_pairs = count_keywords(
+    freq_pairs = count_keywords(
+        token_lists, top_n=top_n, custom_keywords=(custom_keywords if use_custom_only else None)
+    )
     sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
     categories = assign_categories(token_lists)

@@ -256,6 +306,7 @@ with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo
             top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
             use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
             custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
+            debug_mode = gr.Checkbox(label="Debug mode (show schema & sample rows)", value=False)
             run_btn = gr.Button("Run Analysis 🚀", variant="primary")
         with gr.Column():
             bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="filepath")
@@ -264,6 +315,7 @@ with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo
             trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="filepath")
             table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
             report = gr.Textbox(label="Auto-generated Report", lines=10)
+            debug_out = gr.Textbox(label="Debug info", lines=8, interactive=False)
             export = gr.File(label="Download Enriched CSV")

     def on_file_upload(fileobj):
@@ -282,27 +334,41 @@ with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo

     data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])

-    def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text):
+    def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text, dbg):
         if fileobj is None:
             raise gr.Error("Please upload a CSV file.")
         try:
             df = read_csv_safe(fileobj.name)
+            if dbg:
+                info = [
+                    "Columns & dtypes:",
+                    str(df.dtypes),
+                    "",
+                    "Sample rows:",
+                    str(df.head(5)),
+                ]
+                debug_text = "\n".join(info)
+            else:
+                debug_text = ""
             bar_path, cat_path, pie_path, trend_path, summary_df, report_text, csv_bytes = analyze(
                 df, text_column, date_column, int(topn), custom_only, custom_text
             )
             export_path = "enriched_claims.csv"
             with open(export_path, "wb") as f:
                 f.write(csv_bytes)
-            return bar_path, cat_path, pie_path, trend_path, summary_df, report_text, export_path
+            return bar_path, cat_path, pie_path, trend_path, summary_df, report_text, debug_text, export_path
         except Exception as e:
-
-
+            tb = traceback.format_exc()
+            debug(f"[ERROR] {type(e).__name__}: {e}\n{tb}")
+            raise gr.Error(f"RuntimeError: {type(e).__name__}: {e}")

     run_btn.click(
         run_pipeline,
-        inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text],
-        outputs=[bar_img, cat_img, pie_img, trend_img, table, report, export],
+        inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text, debug_mode],
+        outputs=[bar_img, cat_img, pie_img, trend_img, table, report, debug_out, export],
     )

 if __name__ == "__main__":
-
+    # Spaces-friendly launch
+    port = int(os.environ.get("PORT", "7860"))
+    demo.launch(server_name="0.0.0.0", server_port=port)