
import os
import re
import uuid
import sys
import traceback
import numpy as np
import pandas as pd

# Force headless backend before importing pyplot
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

import gradio as gr

# -------------------------
# NLTK + VADER
# -------------------------
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

def _ensure_nltk():
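    """Fetch required NLTK data quietly on first run if missing.

    punkt/punkt_tab are downloaded defensively for environments that expect
    them; tokenization below uses a plain regex and does not depend on them.
    """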
    # Quiet downloads to avoid noisy logs
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)
    try:
        nltk.data.find("sentiment/vader_lexicon.zip")
    except LookupError:
        nltk.download("vader_lexicon", quiet=True)

_ensure_nltk()

try:
    EN_STOPWORDS = set(stopwords.words("english"))
except Exception:
    EN_STOPWORDS = set()

def _init_sia():
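    """Build the VADER analyzer, retrying the lexicon download once and
    falling back to a neutral-only stub so the app can still start."""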
    try:
        return SentimentIntensityAnalyzer()
    except Exception:
        # Try re-downloading lexicon then retry
        try:
            nltk.download("vader_lexicon", quiet=True)
            return SentimentIntensityAnalyzer()
        except Exception:
            # Fall back to a dummy analyzer that scores everything as neutral
            class _Dummy:
                def polarity_scores(self, t):
                    return {"compound": 0.0}
            return _Dummy()

SIA = _init_sia()

# -------------------------
# Config
# -------------------------
CATEGORY_MAP = {
    "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
    "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
    "Fire/Water/Storm Damage": ["fire","smoke","flames","water","flood","leak","storm","hail","wind","cyclone","lightning"],
    "Property Damage": ["damage","dent","scratch","broken","shattered","glass","windshield","bumper","paint","roof","door","window"],
    "Injury/Medical": ["injury","hurt","hospital","treatment","fracture","bleeding","ambulance","doctor","clinic"],
    "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
    "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
}
DEFAULT_KEYWORDS = sorted({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident", "theft", "damage"})

TOKEN_PATTERN = re.compile(r"[A-Za-z']+")
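# The pattern keeps runs of letters and apostrophes: contractions like "wasn't"
# survive as one token, while hyphenated words split into their parts.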

# -------------------------
# Utils
# -------------------------
def debug(msg):
    print(msg, file=sys.stderr, flush=True)

def tokenize_text(text: str):
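    """Lowercase the text, extract word tokens, and drop English stopwords
    and single-character tokens."""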
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    tokens = [t.lower() for t in TOKEN_PATTERN.findall(text)]
    tokens = [t for t in tokens if t not in EN_STOPWORDS and len(t) > 1]
    return tokens

def count_keywords(token_lists, top_n=10, custom_keywords=None):
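    """Return the top_n most frequent tokens across all token lists; if
    custom_keywords is given, only those words are counted.

    Illustrative: count_keywords([["crash", "theft"], ["crash"]], top_n=1)
    returns [("crash", 2)].
    """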
    from collections import Counter
    counter = Counter()
    custom_set = None
    if custom_keywords:
        custom_set = {k.strip().lower() for k in custom_keywords if k and k.strip()}
    for toks in token_lists:
        if custom_set is None:
            counter.update(toks)
        else:
            counter.update([t for t in toks if t in custom_set])
    return counter.most_common(top_n)

def sentiments_for_texts(texts):
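    """Score each text with VADER and label it using the conventional cutoffs:
    compound >= 0.05 is Positive, <= -0.05 is Negative, otherwise Neutral."""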
    labels, compounds = [], []
    for t in texts:
        try:
            vs = SIA.polarity_scores("" if pd.isna(t) else str(t))
            compound = float(vs.get("compound", 0.0))
        except Exception:
            compound = 0.0
        compounds.append(compound)
        if compound >= 0.05:
            labels.append("Positive")
        elif compound <= -0.05:
            labels.append("Negative")
        else:
            labels.append("Neutral")
    return labels, compounds

def assign_categories(token_lists):
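    """Assign each claim to the category whose keyword list it overlaps most;
    ties go to the category defined first in CATEGORY_MAP."""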
    assigned = []
    for toks in token_lists:
        tokset = set(toks)
        best_cat, best_hits = None, 0
        for cat, words in CATEGORY_MAP.items():
            hits = len(tokset.intersection(words))
            if hits > best_hits:
                best_cat, best_hits = cat, hits
        assigned.append(best_cat if best_hits > 0 else "Other/Unclear")
    return assigned

def _save_fig_to_path(fig, name_prefix):
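    """Save the figure under charts/ with a unique filename, then close it
    so long-running sessions do not leak matplotlib figures."""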
    os.makedirs("charts", exist_ok=True)
    fname = os.path.join("charts", f"{name_prefix}_{uuid.uuid4().hex}.png")
    fig.savefig(fname, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)
    return fname

def bar_chart_top_keywords(freq_pairs):
    if len(freq_pairs) == 0:
        return None
    labels = [k for k,_ in freq_pairs]
    values = [v for _,v in freq_pairs]
    fig = plt.figure()
    plt.bar(range(len(labels)), values)
    plt.xticks(range(len(labels)), labels, rotation=45, ha='right')
    plt.title("Top Keywords")
    plt.xlabel("Keyword")
    plt.ylabel("Frequency")
    plt.tight_layout()
    return _save_fig_to_path(fig, "top_keywords")

def bar_chart_categories(cats):
    if len(cats) == 0:
        return None
    s = pd.Series(cats).value_counts()
    fig = plt.figure()
    plt.bar(range(len(s.index)), s.values)
    plt.xticks(range(len(s.index)), s.index, rotation=45, ha='right')
    plt.title("Claim Categories")
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.tight_layout()
    return _save_fig_to_path(fig, "categories")

def pie_chart_sentiment(sent_labels):
    if len(sent_labels) == 0:
        return None
    vals = pd.Series(sent_labels).value_counts()
    fig = plt.figure()
    plt.pie(vals.values, labels=vals.index, autopct="%1.1f%%", startangle=90)
    plt.title("Sentiment Distribution")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_pie")

def trend_chart_by_date(dates, compounds):
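    """Plot the compound score over time; returns None if no dates parse."""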
    s = pd.DataFrame({"date": dates, "compound": compounds}).dropna()
    if s.empty:
        return None
    try:
        s["date"] = pd.to_datetime(s["date"], errors="coerce")
        s = s.dropna(subset=["date"]).sort_values("date")
    except Exception:
        return None
    if s.empty:
        return None
    fig = plt.figure()
    plt.plot(s["date"], s["compound"])
    plt.title("Sentiment Trend Over Time (compound)")
    plt.xlabel("Date")
    plt.ylabel("VADER Compound")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_trend")

def read_csv_safe(path):
    # Try pandas' default encoding first, then common fallbacks
    last_err = None
    for enc in [None, "utf-8", "utf-8-sig", "latin-1"]:
        try:
            if enc is None:
                return pd.read_csv(path)
            return pd.read_csv(path, encoding=enc)
        except Exception as e:
            last_err = e
    raise last_err

def validate_schema(df, text_col, date_col):
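    """Raise gr.Error with a readable message when the chosen columns are
    missing or the text column is entirely empty."""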
    problems = []
    if text_col not in df.columns:
        problems.append(f"- Text column '{text_col}' not found.")
    else:
        # Ensure at least one non-empty value ("nan" arises from str-casting NaN)
        non_empty = df[text_col].astype(str).str.strip().replace({"nan": ""})
        if (non_empty == "").all():
            problems.append(f"- Text column '{text_col}' has no non-empty values.")
    if date_col:
        if date_col not in df.columns:
            problems.append(f"- Date column '{date_col}' not found.")
    if problems:
        raise gr.Error("Schema check failed:\n" + "\n".join(problems))

def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
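    """Run the full pipeline: tokenize, count keywords, score sentiment,
    assign categories, render charts, and build the report plus export CSV."""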
    validate_schema(df, text_col, date_col)

    custom_keywords = None
    if custom_keywords_text:
        parts = re.split(r"[,\n]+", custom_keywords_text)
        custom_keywords = [p.strip().lower() for p in parts if p.strip()]

    token_lists = df[text_col].apply(tokenize_text).tolist()
    freq_pairs = count_keywords(
        token_lists, top_n=top_n, custom_keywords=(custom_keywords if use_custom_only else None)
    )
    sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
    categories = assign_categories(token_lists)

    out_df = df.copy()
    out_df["tokens"] = token_lists
    out_df["sentiment"] = sent_labels
    out_df["compound"] = compounds
    out_df["category"] = categories

    bar_path = bar_chart_top_keywords(freq_pairs)
    cat_path = bar_chart_categories(categories)
    pie_path = pie_chart_sentiment(sent_labels)
    trend_path = None
    if date_col and date_col in df.columns:
        trend_path = trend_chart_by_date(df[date_col], compounds)

    cat_counts = out_df["category"].value_counts().head(5)
    cat_lines = [f"- {idx}: {val}" for idx, val in cat_counts.items()]
    pos_rate = (out_df["sentiment"] == "Positive").mean()
    neg_rate = (out_df["sentiment"] == "Negative").mean()
    neu_rate = (out_df["sentiment"] == "Neutral").mean()
    report = [
        "Common Claim Categories (Top 5):",
        *cat_lines,
        "",
        f"Sentiment: {pos_rate:.1%} Positive | {neu_rate:.1%} Neutral | {neg_rate:.1%} Negative",
    ]
    if len(freq_pairs) > 0:
        top_kw = ", ".join([f"{k}({v})" for k,v in freq_pairs[:10]])
        report += ["", f"Top Keywords: {top_kw}"]
    report_text = "\n".join(report)

    csv_bytes = out_df.to_csv(index=False).encode("utf-8")
    return (
        bar_path,
        cat_path,
        pie_path,
        trend_path,
        out_df[["sentiment","compound","category"]].value_counts().reset_index(name="count"),
        report_text,
        csv_bytes
    )

def infer_text_columns(df: pd.DataFrame):
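    """Heuristic: rank object-dtype columns by the average string length of a
    50-row sample, assuming the longest text column is the claim description."""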
    candidates = []
    for c in df.columns:
        if df[c].dtype == "object":
            sample = df[c].astype(str).head(50).tolist()
            avg_len = np.mean([len(s) for s in sample]) if sample else 0
            candidates.append((c, avg_len))
    candidates.sort(key=lambda x: x[1], reverse=True)
    return [c for c,_ in candidates]

with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo:
    gr.Markdown("# 🧠 Insurance Claim Text Analytics\nAnalyze claim descriptions for keywords, sentiment, and categories.")

    with gr.Row():
        with gr.Column():
            data = gr.File(label="Upload CSV (UTF-8)", file_count="single", file_types=[".csv"])
            text_col = gr.Dropdown(label="Text column (claim description)", choices=[], value=None)
            date_col = gr.Dropdown(label="Optional date column (for trend)", choices=[], value=None, allow_custom_value=True)
            top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
            use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
            custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
            debug_mode = gr.Checkbox(label="Debug mode (show schema & sample rows)", value=False)
            run_btn = gr.Button("Run Analysis 🚀", variant="primary")
        with gr.Column():
            bar_img = gr.Image(label="Top Keywords (Bar Chart)", type="filepath")
            cat_img = gr.Image(label="Claim Categories (Bar Chart)", type="filepath")
            pie_img = gr.Image(label="Sentiment Distribution (Pie Chart)", type="filepath")
            trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="filepath")
            table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
            report = gr.Textbox(label="Auto-generated Report", lines=10)
            debug_out = gr.Textbox(label="Debug info", lines=8, interactive=False)
            export = gr.File(label="Download Enriched CSV")

    def on_file_upload(fileobj):
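        """Populate the column dropdowns from the uploaded CSV's header."""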
        if fileobj is None:
            return gr.update(choices=[], value=None), gr.update(choices=[], value=None)
        # Gradio may pass a plain path string or a tempfile-like object depending on version
        path = fileobj if isinstance(fileobj, str) else fileobj.name
        df = read_csv_safe(path)
        cols = df.columns.tolist()
        text_candidates = infer_text_columns(df)
        if not text_candidates:
            text_candidates = [c for c in cols if df[c].dtype == "object"]
        text_value = text_candidates[0] if text_candidates else (cols[0] if cols else None)
        return (
            gr.update(choices=text_candidates or cols, value=text_value),
            gr.update(choices=cols, value=None),
        )

    data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])

    def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text, dbg):
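        """Handle the Run button: load the CSV, run analyze(), write the
        enriched export, and surface failures as gr.Error."""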
        if fileobj is None:
            raise gr.Error("Please upload a CSV file.")
        try:
            path = fileobj if isinstance(fileobj, str) else fileobj.name
            df = read_csv_safe(path)
            if dbg:
                info = [
                    "Columns & dtypes:",
                    str(df.dtypes),
                    "",
                    "Sample rows:",
                    str(df.head(5)),
                ]
                debug_text = "\n".join(info)
            else:
                debug_text = ""
            bar_path, cat_path, pie_path, trend_path, summary_df, report_text, csv_bytes = analyze(
                df, text_column, date_column, int(topn), custom_only, custom_text
            )
            export_path = "enriched_claims.csv"
            with open(export_path, "wb") as f:
                f.write(csv_bytes)
            return bar_path, cat_path, pie_path, trend_path, summary_df, report_text, debug_text, export_path
        except gr.Error:
            # Let schema errors from validate_schema surface unchanged
            raise
        except Exception as e:
            tb = traceback.format_exc()
            debug(f"[ERROR] {type(e).__name__}: {e}\n{tb}")
            raise gr.Error(f"{type(e).__name__}: {e}")

    run_btn.click(
        run_pipeline,
        inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text, debug_mode],
        outputs=[bar_img, cat_img, pie_img, trend_img, table, report, debug_out, export],
    )

if __name__ == "__main__":
    # Spaces-friendly launch
    port = int(os.environ.get("PORT", "7860"))
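    # Binding to 0.0.0.0 makes the app reachable inside containers/Spaces;
    # locally, open http://localhost:7860 (or whatever PORT is set to).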
    demo.launch(server_name="0.0.0.0", server_port=port)