"""Insurance Claim Text Analytics — Gradio app.

Analyzes a CSV of claim descriptions: keyword frequencies, VADER sentiment,
rule-based claim categorization, and optional sentiment trend over time.
Charts are rendered headlessly with matplotlib and served as image files.
"""

import os
import io
import re
import json
import uuid
import sys
import traceback
from collections import Counter

import numpy as np
import pandas as pd

# Force headless backend before importing pyplot
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import gradio as gr

# -------------------------
# NLTK + VADER
# -------------------------
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer


def _ensure_nltk():
    """Download required NLTK resources if missing.

    Quiet downloads to avoid noisy logs. punkt_tab may not exist on older
    NLTK versions, so its download failure is tolerated.
    """
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)
    try:
        nltk.data.find("sentiment/vader_lexicon.zip")
    except LookupError:
        nltk.download("vader_lexicon", quiet=True)


_ensure_nltk()

# Stopword list may be unavailable if the download failed; degrade to empty.
try:
    EN_STOPWORDS = set(stopwords.words("english"))
except Exception:
    EN_STOPWORDS = set()


def _init_sia():
    """Build a SentimentIntensityAnalyzer, retrying the lexicon download once.

    Falls back to a dummy analyzer (always neutral) so the app stays usable
    even when the VADER lexicon cannot be obtained.
    """
    try:
        return SentimentIntensityAnalyzer()
    except Exception:
        # Try re-downloading lexicon then retry
        try:
            nltk.download("vader_lexicon", quiet=True)
            return SentimentIntensityAnalyzer()
        except Exception:
            # Fallback dummy analyzer
            class _Dummy:
                def polarity_scores(self, t):
                    return {"compound": 0.0}

            return _Dummy()


SIA = _init_sia()

# -------------------------
# Config
# -------------------------
# Keyword -> category rules; a row is assigned the category with the most
# token hits (see assign_categories).
CATEGORY_MAP = {
    "Accident": ["accident", "collision", "crash", "rear-end", "bump", "skid",
                 "impact", "hit", "fender"],
    "Theft": ["theft", "stolen", "robbery", "burglary", "break-in", "snatched",
              "pickpocket", "hijack"],
    "Fire/Water/Storm Damage": ["fire", "smoke", "flames", "water", "flood",
                                "leak", "storm", "hail", "wind", "cyclone",
                                "lightning"],
    "Property Damage": ["damage", "dent", "scratch", "broken", "shattered",
                        "glass", "windshield", "bumper", "paint", "roof",
                        "door", "window"],
    "Injury/Medical": ["injury", "hurt", "hospital", "treatment", "fracture",
                       "bleeding", "ambulance", "doctor", "clinic"],
    "Liability": ["liability", "lawsuit", "negligence", "fault", "third-party",
                  "claimant"],
    "Total Loss/Write-off": ["totalled", "totaled", "write-off", "beyond",
                             "salvage"],
}
DEFAULT_KEYWORDS = sorted(
    list({w for ws in CATEGORY_MAP.values() for w in ws}
         | {"accident", "theft", "damage"})
)
TOKEN_PATTERN = re.compile(r"[A-Za-z']+")


# -------------------------
# Utils
# -------------------------
def debug(msg):
    """Print a message to stderr immediately (shows up in Spaces logs)."""
    print(msg, file=sys.stderr, flush=True)


def tokenize_text(text: str):
    """Lowercase word tokens from *text*, minus stopwords and 1-char tokens."""
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    tokens = [t.lower() for t in TOKEN_PATTERN.findall(text)]
    return [t for t in tokens if t not in EN_STOPWORDS and len(t) > 1]


def count_keywords(token_lists, top_n=10, custom_keywords=None):
    """Return the top_n (token, count) pairs across all token lists.

    When custom_keywords is given, only those keywords are counted;
    otherwise every token contributes.
    """
    counter = Counter()
    custom_set = None
    if custom_keywords:
        custom_set = {k.strip().lower() for k in custom_keywords if k and k.strip()}
    for toks in token_lists:
        if custom_set is None:
            counter.update(toks)
        else:
            counter.update([t for t in toks if t in custom_set])
    return counter.most_common(top_n)


def sentiments_for_texts(texts):
    """VADER-score each text.

    Returns (labels, compounds) where labels use the conventional VADER
    thresholds: >= 0.05 Positive, <= -0.05 Negative, else Neutral. Scoring
    errors degrade to a neutral 0.0 compound.
    """
    labels, compounds = [], []
    for t in texts:
        try:
            vs = SIA.polarity_scores("" if pd.isna(t) else str(t))
            compound = float(vs.get("compound", 0.0))
        except Exception:
            compound = 0.0
        compounds.append(compound)
        if compound >= 0.05:
            labels.append("Positive")
        elif compound <= -0.05:
            labels.append("Negative")
        else:
            labels.append("Neutral")
    return labels, compounds


def assign_categories(token_lists):
    """Assign each row the CATEGORY_MAP category with the most token hits.

    Rows with no keyword hits become "Other/Unclear".
    """
    assigned = []
    for toks in token_lists:
        tokset = set(toks)
        best_cat, best_hits = None, 0
        for cat, words in CATEGORY_MAP.items():
            hits = len(tokset.intersection(words))
            if hits > best_hits:
                best_cat, best_hits = cat, hits
        assigned.append(best_cat if best_hits > 0 else "Other/Unclear")
    return assigned


def _save_fig_to_path(fig, name_prefix):
    """Save *fig* under charts/ with a unique name; close it; return the path."""
    os.makedirs("charts", exist_ok=True)
    fname = os.path.join("charts", f"{name_prefix}_{uuid.uuid4().hex}.png")
    fig.savefig(fname, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    return fname


def bar_chart_top_keywords(freq_pairs):
    """Bar chart of (keyword, count) pairs; returns a PNG path or None."""
    if len(freq_pairs) == 0:
        return None
    labels = [k for k, _ in freq_pairs]
    values = [v for _, v in freq_pairs]
    fig = plt.figure()
    plt.bar(range(len(labels)), values)
    plt.xticks(range(len(labels)), labels, rotation=45, ha='right')
    plt.title("Top Keywords")
    plt.xlabel("Keyword")
    plt.ylabel("Frequency")
    plt.tight_layout()
    return _save_fig_to_path(fig, "top_keywords")


def bar_chart_categories(cats):
    """Bar chart of category counts; returns a PNG path or None."""
    if len(cats) == 0:
        return None
    s = pd.Series(cats).value_counts()
    fig = plt.figure()
    plt.bar(range(len(s.index)), s.values)
    plt.xticks(range(len(s.index)), s.index, rotation=45, ha='right')
    plt.title("Claim Categories")
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.tight_layout()
    return _save_fig_to_path(fig, "categories")


def pie_chart_sentiment(sent_labels):
    """Pie chart of sentiment label shares; returns a PNG path or None."""
    if len(sent_labels) == 0:
        return None
    vals = pd.Series(sent_labels).value_counts()
    fig = plt.figure()
    plt.pie(vals.values, labels=vals.index, autopct="%1.1f%%", startangle=90)
    plt.title("Sentiment Distribution")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_pie")


def trend_chart_by_date(dates, compounds):
    """Line chart of compound sentiment over parsed dates; PNG path or None."""
    s = pd.DataFrame({"date": dates, "compound": compounds}).dropna()
    if s.empty:
        return None
    try:
        # Unparseable dates become NaT and are dropped rather than raising.
        s["date"] = pd.to_datetime(s["date"], errors="coerce")
        s = s.dropna(subset=["date"]).sort_values("date")
    except Exception:
        return None
    if s.empty:
        return None
    fig = plt.figure()
    plt.plot(s["date"], s["compound"])
    plt.title("Sentiment Trend Over Time (compound)")
    plt.xlabel("Date")
    plt.ylabel("VADER Compound")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_trend")


def read_csv_safe(path):
    """Read a CSV trying default, UTF-8, UTF-8-BOM, then Latin-1 encodings.

    Re-raises the last error if every attempt fails.
    """
    last_err = None
    for enc in [None, "utf-8", "utf-8-sig", "latin-1"]:
        try:
            if enc is None:
                return pd.read_csv(path)
            return pd.read_csv(path, encoding=enc)
        except Exception as e:
            last_err = e
    raise last_err


def validate_schema(df, text_col, date_col):
    """Raise gr.Error listing problems if the chosen columns are unusable."""
    problems = []
    if text_col not in df.columns:
        problems.append(f"- Text column '{text_col}' not found.")
    else:
        # Ensure there is at least one non-empty string
        non_empty = (df[text_col].astype(str).str.strip()
                     .replace({"nan": ""}).astype(str))
        if (non_empty == "").all():
            problems.append(f"- Text column '{text_col}' has no non-empty values.")
    if date_col:
        if date_col not in df.columns:
            problems.append(f"- Date column '{date_col}' not found.")
    if problems:
        raise gr.Error("Schema check failed:\n" + "\n".join(problems))


def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
    """Run the full pipeline and return chart paths, summary, report, CSV bytes.

    Returns a 7-tuple:
        (keyword_bar_path, category_bar_path, sentiment_pie_path,
         trend_path_or_None, summary_df, report_text, csv_bytes)

    Raises gr.Error (via validate_schema) on schema problems.
    """
    validate_schema(df, text_col, date_col)

    custom_keywords = None
    if custom_keywords_text:
        # BUGFIX: was r"[,\\n]+" — a char class matching ',' , '\' and the
        # literal letter 'n', which split keywords like "accident" apart.
        # Split on commas and newlines only.
        parts = re.split(r"[,\n]+", custom_keywords_text)
        custom_keywords = [p.strip().lower() for p in parts if p.strip()]

    token_lists = df[text_col].apply(tokenize_text).tolist()
    freq_pairs = count_keywords(
        token_lists,
        top_n=top_n,
        custom_keywords=(custom_keywords if use_custom_only else None),
    )
    sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
    categories = assign_categories(token_lists)

    out_df = df.copy()
    out_df["tokens"] = token_lists
    out_df["sentiment"] = sent_labels
    out_df["compound"] = compounds
    out_df["category"] = categories

    bar_path = bar_chart_top_keywords(freq_pairs)
    cat_path = bar_chart_categories(categories)
    pie_path = pie_chart_sentiment(sent_labels)
    trend_path = None
    if date_col and date_col in df.columns:
        trend_path = trend_chart_by_date(df[date_col], compounds)

    cat_counts = out_df["category"].value_counts().head(5)
    cat_lines = [f"- {idx}: {val}" for idx, val in cat_counts.items()]
    pos_rate = (out_df["sentiment"] == "Positive").mean()
    neg_rate = (out_df["sentiment"] == "Negative").mean()
    neu_rate = (out_df["sentiment"] == "Neutral").mean()
    report = [
        "Common Claim Categories (Top 5):",
        *cat_lines,
        "",
        f"Sentiment: {pos_rate:.1%} Positive | {neu_rate:.1%} Neutral | {neg_rate:.1%} Negative",
    ]
    if len(freq_pairs) > 0:
        top_kw = ", ".join([f"{k}({v})" for k, v in freq_pairs[:10]])
        report += ["", f"Top Keywords: {top_kw}"]
    report_text = "\n".join(report)

    csv_bytes = out_df.to_csv(index=False).encode("utf-8")
    return (
        bar_path,
        cat_path,
        pie_path,
        trend_path,
        out_df[["sentiment", "compound", "category"]].value_counts().reset_index(name="count"),
        report_text,
        csv_bytes,
    )


def infer_text_columns(df: pd.DataFrame):
    """Object-dtype columns ordered by average string length (longest first)."""
    candidates = []
    for c in df.columns:
        if df[c].dtype == "object":
            sample = df[c].astype(str).head(50).tolist()
            avg_len = np.mean([len(s) for s in sample]) if sample else 0
            candidates.append((c, avg_len))
    candidates.sort(key=lambda x: x[1], reverse=True)
    return [c for c, _ in candidates]


def _file_path(fileobj):
    """Return the filesystem path for a Gradio file input.

    Gradio 4 passes a filepath str by default; older versions pass a
    tempfile-like object with a .name attribute. Accept both.
    """
    return fileobj if isinstance(fileobj, str) else fileobj.name


with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo:
    gr.Markdown(
        "# 🧠 Insurance Claim Text Analytics\n"
        "Analyze claim descriptions for keywords, sentiment, and categories."
    )
    with gr.Row():
        with gr.Column():
            data = gr.File(label="Upload CSV (UTF-8)", file_count="single",
                           file_types=[".csv"])
            text_col = gr.Dropdown(label="Text column (claim description)",
                                   choices=[], value=None)
            date_col = gr.Dropdown(label="Optional date column (for trend)",
                                   choices=[], value=None,
                                   allow_custom_value=True)
            top_n = gr.Slider(5, 30, value=10, step=1,
                              label="Top N keywords for bar chart")
            use_custom_only = gr.Checkbox(label="Only count custom keywords",
                                          value=False)
            custom_keywords_text = gr.Textbox(
                label="Custom keywords (comma or new line separated). Leave empty to count all tokens.",
                value=", ".join(DEFAULT_KEYWORDS),
                lines=3,
            )
            debug_mode = gr.Checkbox(label="Debug mode (show schema & sample rows)",
                                     value=False)
            run_btn = gr.Button("Run Analysis 🚀", variant="primary")
        with gr.Column():
            bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="filepath")
            cat_img = gr.Image(label="Claim Categories (Bar Chart)", type="filepath")
            pie_img = gr.Image(label="Sentiment Distribution (Pie Chart)", type="filepath")
            trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="filepath")
            table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
            report = gr.Textbox(label="Auto-generated Report", lines=10)
            debug_out = gr.Textbox(label="Debug info", lines=8, interactive=False)
            export = gr.File(label="Download Enriched CSV")

    def on_file_upload(fileobj):
        """Populate the column dropdowns from the uploaded CSV's header."""
        if fileobj is None:
            return gr.update(choices=[], value=None), gr.update(choices=[], value=None)
        df = read_csv_safe(_file_path(fileobj))
        cols = df.columns.tolist()
        text_candidates = infer_text_columns(df)
        if not text_candidates:
            text_candidates = [c for c in cols if df[c].dtype == "object"]
        text_value = text_candidates[0] if text_candidates else (cols[0] if cols else None)
        return (
            gr.update(choices=text_candidates or cols, value=text_value),
            gr.update(choices=cols, value=None),
        )

    data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])

    def run_pipeline(fileobj, text_column, date_column, topn, custom_only,
                     custom_text, dbg):
        """Top-level click handler: read, analyze, export, and report errors."""
        if fileobj is None:
            raise gr.Error("Please upload a CSV file.")
        try:
            df = read_csv_safe(_file_path(fileobj))
            if dbg:
                info = [
                    "Columns & dtypes:",
                    str(df.dtypes),
                    "",
                    "Sample rows:",
                    str(df.head(5)),
                ]
                debug_text = "\n".join(info)
            else:
                debug_text = ""
            bar_path, cat_path, pie_path, trend_path, summary_df, report_text, csv_bytes = analyze(
                df, text_column, date_column, int(topn), custom_only, custom_text
            )
            export_path = "enriched_claims.csv"
            with open(export_path, "wb") as f:
                f.write(csv_bytes)
            return (bar_path, cat_path, pie_path, trend_path, summary_df,
                    report_text, debug_text, export_path)
        except gr.Error:
            # Schema-check messages are already user-facing; don't re-wrap them.
            raise
        except Exception as e:
            tb = traceback.format_exc()
            debug(f"[ERROR] {type(e).__name__}: {e}\n{tb}")
            raise gr.Error(f"RuntimeError: {type(e).__name__}: {e}")

    run_btn.click(
        run_pipeline,
        inputs=[data, text_col, date_col, top_n, use_custom_only,
                custom_keywords_text, debug_mode],
        outputs=[bar_img, cat_img, pie_img, trend_img, table, report,
                 debug_out, export],
    )

if __name__ == "__main__":
    # Spaces-friendly launch
    port = int(os.environ.get("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=port)