| |
|
| | import os |
| | import io |
| | import re |
| | import json |
| | import uuid |
| | import sys |
| | import traceback |
| | import numpy as np |
| | import pandas as pd |
| |
|
| | |
| | import matplotlib |
| | matplotlib.use("Agg") |
| | import matplotlib.pyplot as plt |
| |
|
| | import gradio as gr |
| |
|
| | |
| | |
| | |
| | import nltk |
| | from nltk.corpus import stopwords |
| | from nltk.sentiment import SentimentIntensityAnalyzer |
| |
|
def _ensure_nltk():
    """Download the NLTK data packages this app needs, skipping any already installed."""

    def fetch(resource_path, package):
        # Probe the local data index first so we only download what is missing.
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package, quiet=True)

    fetch("tokenizers/punkt", "punkt")
    # punkt_tab only exists on newer NLTK releases; treat it as best-effort and
    # swallow any download failure.
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass
    fetch("corpora/stopwords", "stopwords")
    fetch("sentiment/vader_lexicon.zip", "vader_lexicon")
| |
|
# Fetch required corpora up-front, before anything below touches NLTK.
_ensure_nltk()

# Stopword set used by tokenize_text(); fall back to an empty set so the app
# still runs (with noisier keyword counts) if the corpus download failed.
try:
    EN_STOPWORDS = set(stopwords.words("english"))
except Exception:
    EN_STOPWORDS = set()
| |
|
def _init_sia():
    """Build a VADER SentimentIntensityAnalyzer.

    On the first failure, retry once after downloading the lexicon; if that
    also fails, return a stub that scores every text as neutral so the rest
    of the pipeline keeps working.
    """
    try:
        return SentimentIntensityAnalyzer()
    except Exception:
        pass
    try:
        nltk.download("vader_lexicon", quiet=True)
        return SentimentIntensityAnalyzer()
    except Exception:
        class _NeutralStub:
            # Same interface as the real analyzer, but always neutral.
            def polarity_scores(self, t):
                return {"compound": 0.0}

        return _NeutralStub()
| |
|
# Shared analyzer instance (or a neutral stub if initialization failed).
SIA = _init_sia()

# Keyword lexicon per claim category; assign_categories() labels each claim
# with the category whose word list overlaps its tokens the most.
CATEGORY_MAP = {
    "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
    "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
    "Fire/Water/Storm Damage": ["fire","smoke","flames","water","flood","leak","storm","hail","wind","cyclone","lightning"],
    "Property Damage": ["damage","dent","scratch","broken","shattered","glass","windshield","bumper","paint","roof","door","window"],
    "Injury/Medical": ["injury","hurt","hospital","treatment","fracture","bleeding","ambulance","doctor","clinic"],
    "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
    "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
}
# Flattened, de-duplicated keyword list pre-filled into the UI textbox.
DEFAULT_KEYWORDS = sorted(list({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident","theft","damage"}))

# Word tokens: runs of letters/apostrophes (keeps contractions like "don't").
TOKEN_PATTERN = re.compile(r"[A-Za-z']+")
| |
|
| | |
| | |
| | |
def debug(msg):
    """Emit *msg* to stderr immediately (unbuffered diagnostic output)."""
    print(msg, flush=True, file=sys.stderr)
| |
|
def tokenize_text(text: str):
    """Return lowercase word tokens from *text*, dropping English stopwords
    and single-character tokens. Non-string input (including NaN) is coerced
    to a string, with NaN treated as empty."""
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    return [
        tok
        for tok in (match.lower() for match in TOKEN_PATTERN.findall(text))
        if tok not in EN_STOPWORDS and len(tok) > 1
    ]
| |
|
| | def count_keywords(token_lists, top_n=10, custom_keywords=None): |
| | from collections import Counter |
| | counter = Counter() |
| | custom_set = None |
| | if custom_keywords: |
| | custom_set = set([k.strip().lower() for k in custom_keywords if k and k.strip()]) |
| | for toks in token_lists: |
| | if custom_set is None: |
| | counter.update(toks) |
| | else: |
| | counter.update([t for t in toks if t in custom_set]) |
| | return counter.most_common(top_n) |
| |
|
def sentiments_for_texts(texts):
    """Score each text with VADER and return (labels, compounds).

    Labels use the standard VADER thresholds: >= 0.05 Positive,
    <= -0.05 Negative, otherwise Neutral. Scoring failures and missing
    values fall back to a neutral 0.0 compound.
    """
    labels, compounds = [], []
    for text in texts:
        try:
            scores = SIA.polarity_scores("" if pd.isna(text) else str(text))
            compound = float(scores.get("compound", 0.0))
        except Exception:
            compound = 0.0
        compounds.append(compound)
        if compound >= 0.05:
            label = "Positive"
        elif compound <= -0.05:
            label = "Negative"
        else:
            label = "Neutral"
        labels.append(label)
    return labels, compounds
| |
|
def assign_categories(token_lists):
    """Label each token list with the CATEGORY_MAP entry sharing the most
    tokens; ties keep the first category in insertion order, and zero
    overlap yields "Other/Unclear"."""
    labels = []
    for tokens in token_lists:
        present = set(tokens)
        # max() over the dict keys returns the first maximum in insertion
        # order, matching the original first-strictly-greater scan.
        best = max(CATEGORY_MAP, key=lambda cat: len(present.intersection(CATEGORY_MAP[cat])))
        if len(present.intersection(CATEGORY_MAP[best])) > 0:
            labels.append(best)
        else:
            labels.append("Other/Unclear")
    return labels
| |
|
def _save_fig_to_path(fig, name_prefix):
    """Save *fig* as a uniquely named PNG under ./charts, close it, and
    return the file path."""
    out_dir = "charts"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{name_prefix}_{uuid.uuid4().hex}.png")
    fig.savefig(out_path, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    return out_path
| |
|
def bar_chart_top_keywords(freq_pairs):
    """Render a bar chart of (keyword, count) pairs; returns the saved PNG
    path, or None when there is nothing to plot."""
    if len(freq_pairs) == 0:
        return None
    keywords = [kw for kw, _ in freq_pairs]
    counts = [cnt for _, cnt in freq_pairs]
    positions = range(len(keywords))
    fig = plt.figure()
    plt.bar(positions, counts)
    plt.xticks(positions, keywords, rotation=45, ha='right')
    plt.title("Top Keywords")
    plt.xlabel("Keyword")
    plt.ylabel("Frequency")
    plt.tight_layout()
    return _save_fig_to_path(fig, "top_keywords")
| |
|
def bar_chart_categories(cats):
    """Render a bar chart of claim-category counts; returns the saved PNG
    path, or None when *cats* is empty."""
    if len(cats) == 0:
        return None
    counts = pd.Series(cats).value_counts()
    positions = range(len(counts.index))
    fig = plt.figure()
    plt.bar(positions, counts.values)
    plt.xticks(positions, counts.index, rotation=45, ha='right')
    plt.title("Claim Categories")
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.tight_layout()
    return _save_fig_to_path(fig, "categories")
| |
|
def pie_chart_sentiment(sent_labels):
    """Render a pie chart of sentiment label shares; returns the saved PNG
    path, or None when there are no labels."""
    if len(sent_labels) == 0:
        return None
    counts = pd.Series(sent_labels).value_counts()
    fig = plt.figure()
    plt.pie(counts.values, labels=counts.index, autopct="%1.1f%%", startangle=90)
    plt.title("Sentiment Distribution")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_pie")
| |
|
def trend_chart_by_date(dates, compounds):
    """Plot compound sentiment against parsed dates; returns the saved PNG
    path, or None when no rows have a parseable date."""
    frame = pd.DataFrame({"date": dates, "compound": compounds}).dropna()
    if frame.empty:
        return None
    # Coerce unparseable dates to NaT and drop them; bail out on any
    # unexpected parsing failure rather than crash the whole analysis.
    try:
        frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
        frame = frame.dropna(subset=["date"]).sort_values("date")
    except Exception:
        return None
    if frame.empty:
        return None
    fig = plt.figure()
    plt.plot(frame["date"], frame["compound"])
    plt.title("Sentiment Trend Over Time (compound)")
    plt.xlabel("Date")
    plt.ylabel("VADER Compound")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_trend")
| |
|
def read_csv_safe(path):
    """Read a CSV, falling back through encodings until one succeeds.

    Order: pandas default (utf-8), then BOM-aware utf-8-sig, then latin-1
    (which accepts any byte sequence). The explicit "utf-8" retry in the
    original list duplicated the default first attempt and is removed.

    Raises the last caught exception if every attempt fails.
    """
    last_err = None
    for enc in (None, "utf-8-sig", "latin-1"):
        try:
            if enc is None:
                return pd.read_csv(path)
            return pd.read_csv(path, encoding=enc)
        except Exception as e:
            last_err = e
    raise last_err
| |
|
def validate_schema(df, text_col, date_col):
    """Validate that the selected columns exist and that the text column has
    at least one non-empty value.

    Collects every problem found and raises a single gr.Error listing them;
    returns None when the schema is acceptable.

    Fixes vs. the original: dropped a redundant trailing .astype(str) (the
    Series is already string-typed after .str.strip()), and stringified None
    values ("None") are now treated as empty alongside NaN ("nan") so rows of
    missing values no longer count as real text.
    """
    problems = []
    if text_col not in df.columns:
        problems.append(f"- Text column '{text_col}' not found.")
    else:
        # astype(str) turns NaN into "nan" and None into "None"; map both to
        # "" so missing values do not pass the emptiness check.
        cleaned = df[text_col].astype(str).str.strip().replace({"nan": "", "None": ""})
        if (cleaned == "").all():
            problems.append(f"- Text column '{text_col}' has no non-empty values.")
    if date_col:
        if date_col not in df.columns:
            problems.append(f"- Date column '{date_col}' not found.")
    if problems:
        raise gr.Error("Schema check failed:\n" + "\n".join(problems))
| |
|
def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
    """Run the full analytics pipeline over *df*.

    Parameters
    ----------
    df : pd.DataFrame           claims table
    text_col : str              column holding claim descriptions
    date_col : str or None      optional date column for the trend chart
    top_n : int                 number of keywords shown in the bar chart
    use_custom_only : bool      restrict keyword counts to the custom list
    custom_keywords_text : str  comma/newline separated custom keywords

    Returns
    -------
    (bar_path, cat_path, pie_path, trend_path, summary_df, report_text,
    csv_bytes) — chart entries are PNG file paths or None; csv_bytes is the
    enriched table encoded as UTF-8 CSV.

    Raises
    ------
    gr.Error when the schema check fails.
    """
    validate_schema(df, text_col, date_col)

    # Split the custom-keyword textbox on commas or newlines.
    # BUGFIX: the previous pattern r"[,\\n]+" is the regex class [,\\n],
    # which splits on literal backslashes and the LETTER "n" instead of
    # newlines — mangling every keyword containing an "n".
    custom_keywords = None
    if custom_keywords_text:
        parts = re.split(r"[,\n]+", custom_keywords_text)
        custom_keywords = [p.strip().lower() for p in parts if p.strip()]

    # Per-row token lists drive both keyword counting and categorization.
    token_lists = df[text_col].apply(tokenize_text).tolist()
    freq_pairs = count_keywords(
        token_lists, top_n=top_n, custom_keywords=(custom_keywords if use_custom_only else None)
    )
    sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
    categories = assign_categories(token_lists)

    # Enriched copy offered to the user as a downloadable CSV.
    out_df = df.copy()
    out_df["tokens"] = token_lists
    out_df["sentiment"] = sent_labels
    out_df["compound"] = compounds
    out_df["category"] = categories

    # Charts: each helper returns a PNG path or None when it has no data.
    bar_path = bar_chart_top_keywords(freq_pairs)
    cat_path = bar_chart_categories(categories)
    pie_path = pie_chart_sentiment(sent_labels)
    trend_path = None
    if date_col and date_col in df.columns:
        trend_path = trend_chart_by_date(df[date_col], compounds)

    # Plain-text summary report.
    cat_counts = out_df["category"].value_counts().head(5)
    cat_lines = [f"- {idx}: {val}" for idx, val in cat_counts.items()]
    pos_rate = (out_df["sentiment"] == "Positive").mean()
    neg_rate = (out_df["sentiment"] == "Negative").mean()
    neu_rate = (out_df["sentiment"] == "Neutral").mean()
    report = [
        "Common Claim Categories (Top 5):",
        *cat_lines,
        "",
        f"Sentiment: {pos_rate:.1%} Positive | {neu_rate:.1%} Neutral | {neg_rate:.1%} Negative",
    ]
    if len(freq_pairs) > 0:
        top_kw = ", ".join([f"{k}({v})" for k, v in freq_pairs[:10]])
        report += ["", f"Top Keywords: {top_kw}"]
    report_text = "\n".join(report)

    csv_bytes = out_df.to_csv(index=False).encode("utf-8")
    return (
        bar_path,
        cat_path,
        pie_path,
        trend_path,
        out_df[["sentiment","compound","category"]].value_counts().reset_index(name="count"),
        report_text,
        csv_bytes
    )
| |
|
def infer_text_columns(df: pd.DataFrame):
    """Return object-dtype column names ranked by average string length of
    their first 50 values (longest first) — a heuristic for finding the
    free-text claim-description column."""
    scored = []
    for col in df.columns:
        if df[col].dtype != "object":
            continue
        sample_values = df[col].astype(str).head(50).tolist()
        mean_len = np.mean([len(v) for v in sample_values]) if sample_values else 0
        scored.append((col, mean_len))
    # sorted() is stable, so equal-length columns keep their original order.
    return [col for col, _ in sorted(scored, key=lambda item: item[1], reverse=True)]
| |
|
# ---------------------------------------------------------------------------
# Gradio UI: left column = inputs, right column = charts / report / export.
# Component creation order matters to Gradio's layout, so the structure below
# is left exactly as-is.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo:
    gr.Markdown("# 🧠 Insurance Claim Text Analytics\nAnalyze claim descriptions for keywords, sentiment, and categories.")

    with gr.Row():
        with gr.Column():
            # Input controls.
            data = gr.File(label="Upload CSV (UTF-8)", file_count="single", file_types=[".csv"])
            text_col = gr.Dropdown(label="Text column (claim description)", choices=[], value=None)
            date_col = gr.Dropdown(label="Optional date column (for trend)", choices=[], value=None, allow_custom_value=True)
            top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
            use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
            custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
            debug_mode = gr.Checkbox(label="Debug mode (show schema & sample rows)", value=False)
            run_btn = gr.Button("Run Analysis 🚀", variant="primary")
        with gr.Column():
            # Output widgets populated by run_pipeline().
            bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="filepath")
            cat_img = gr.Image(label="Claim Categories (Bar Chart)", type="filepath")
            pie_img = gr.Image(label="Sentiment Distribution (Pie Chart)", type="filepath")
            trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="filepath")
            table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
            report = gr.Textbox(label="Auto-generated Report", lines=10)
            debug_out = gr.Textbox(label="Debug info", lines=8, interactive=False)
            export = gr.File(label="Download Enriched CSV")

    def on_file_upload(fileobj):
        """Populate both column dropdowns from the uploaded CSV's header.

        Text candidates are ranked by infer_text_columns(); the top candidate
        is preselected. Returns gr.update objects for (text_col, date_col).
        """
        if fileobj is None:
            return gr.update(choices=[], value=None), gr.update(choices=[], value=None)
        # NOTE(review): assumes the uploaded object exposes .name as a local
        # path (gradio tempfile wrapper); newer gradio versions may pass a
        # plain path string instead — TODO confirm against the pinned version.
        df = read_csv_safe(fileobj.name)
        cols = df.columns.tolist()
        text_candidates = infer_text_columns(df)
        if not text_candidates:
            text_candidates = [c for c in cols if df[c].dtype == "object"]
        text_value = text_candidates[0] if text_candidates else (cols[0] if cols else None)
        return (
            gr.update(choices=text_candidates or cols, value=text_value),
            gr.update(choices=cols, value=None),
        )

    # Refresh the dropdowns whenever the file input changes (upload or clear).
    data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])

    def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text, dbg):
        """Click handler: load the CSV, run analyze(), and write the export.

        Returns the eight values wired to the output components. Any failure
        is logged with a traceback to stderr and re-raised as gr.Error so the
        UI shows a readable message.
        """
        if fileobj is None:
            raise gr.Error("Please upload a CSV file.")
        try:
            df = read_csv_safe(fileobj.name)
            # Optional schema/sample dump for the Debug info textbox.
            if dbg:
                info = [
                    "Columns & dtypes:",
                    str(df.dtypes),
                    "",
                    "Sample rows:",
                    str(df.head(5)),
                ]
                debug_text = "\n".join(info)
            else:
                debug_text = ""
            bar_path, cat_path, pie_path, trend_path, summary_df, report_text, csv_bytes = analyze(
                df, text_column, date_column, int(topn), custom_only, custom_text
            )
            # Persist the enriched CSV so gr.File can offer it for download.
            export_path = "enriched_claims.csv"
            with open(export_path, "wb") as f:
                f.write(csv_bytes)
            return bar_path, cat_path, pie_path, trend_path, summary_df, report_text, debug_text, export_path
        except Exception as e:
            tb = traceback.format_exc()
            debug(f"[ERROR] {type(e).__name__}: {e}\n{tb}")
            raise gr.Error(f"RuntimeError: {type(e).__name__}: {e}")

    run_btn.click(
        run_pipeline,
        inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text, debug_mode],
        outputs=[bar_img, cat_img, pie_img, trend_img, table, report, debug_out, export],
    )


if __name__ == "__main__":
    # Bind to all interfaces; PORT env var overrides Gradio's default 7860
    # (common on hosted platforms).
    port = int(os.environ.get("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=port)
| |
|