gopalaKrishna1236 committed on
Commit
fe4c97b
·
verified ·
1 Parent(s): fd438c4

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +289 -83
  2. requirements.txt +6 -5
app.py CHANGED
@@ -1,92 +1,298 @@
# --- Legacy FastAPI service (removed by this commit in favor of the Gradio app) ---
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Any
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Ensure needed NLTK data is present in your runtime/environment
# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet'); nltk.download('vader_lexicon')

app = FastAPI(title="Insurance Claim Text Analytics API")

# Shared NLP resources, initialized once at import time.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
sia = SentimentIntensityAnalyzer()

# Maps a single trigger token (post-lemmatization) to a claim category label.
category_map = {
    'accident': 'Accident',
    'collision': 'Accident',
    'crash': 'Accident',
    'damage': 'Damage',
    'fire': 'Damage',
    'theft': 'Theft',
    'stolen': 'Theft',
    'vandal': 'Vandalism',
    'flood': 'Natural Disaster',
    'storm': 'Natural Disaster',
    'injury': 'Injury',
    'breakdown': 'Mechanical',
    'engine': 'Mechanical',
    'water': 'Damage',
    'laptop': 'Theft',
    'bike': 'Theft',
    'car': 'Accident'
}

class PredictRequest(BaseModel):
    """Request body for /predict: claim text plus how many keywords to return."""
    text: str
    top_k: int = 10

class PredictResponse(BaseModel):
    """Response body for /predict: echoed text plus keyword, category and sentiment results."""
    text: str
    keywords: List[Dict[str, Any]]
    categories: List[str]
    sentiment: Dict[str, Any]

def clean_text(text: str) -> str:
    """Lower-case *text* and strip URLs, emails, digits and non-letter characters."""
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # URLs
    text = re.sub(r'\S+@\S+', '', text)  # email-like tokens
    text = re.sub(r'\d+', ' ', text)  # digits
    text = re.sub(r'[^a-z\s]', ' ', text)  # anything but lower-case letters / whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace
    return text

def tokenize(text: str):
    """Clean, word-tokenize, stopword-filter (and drop tokens of length <= 2), then lemmatize."""
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

@app.post("/predict", response_model=PredictResponse)
def predict(req: PredictRequest):
    """Analyze one claim text: top keywords, rule-based categories, VADER sentiment.

    Raises HTTP 400 for empty or whitespace-only input.
    """
    if not req.text or not req.text.strip():
        raise HTTPException(status_code=400, detail="Empty text")
    tokens = tokenize(req.text)
    freq = Counter(tokens)
    topk = freq.most_common(req.top_k)
    # Rule-based categorization: every distinct token that is a key in
    # category_map contributes its category to the result set.
    cats = set()
    for t in set(tokens):
        if t in category_map:
            cats.add(category_map[t])
    # Sentiment runs on the raw (uncleaned) text; standard VADER +/-0.05 thresholds.
    scores = sia.polarity_scores(req.text)
    comp = scores['compound']
    if comp >= 0.05:
        label = 'positive'
    elif comp <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'
    return {
        "text": req.text,
        "keywords": [{"keyword": k, "count": c} for k, c in topk],
        "categories": list(cats),
        "sentiment": {"neg": scores['neg'], "neu": scores['neu'], "pos": scores['pos'], "compound": comp, "label": label}
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ import os
3
+ import io
 
4
  import re
5
+ import json
6
+ import numpy as np
7
+ import pandas as pd
8
+ import matplotlib.pyplot as plt
9
+
10
+ import gradio as gr
11
+
12
+ # NLTK setup
13
  import nltk
14
  from nltk.corpus import stopwords
15
  from nltk.tokenize import word_tokenize
16
+ from nltk.sentiment import SentimentIntensityAnalyzer
17
+
18
# One-time downloads (safe to call repeatedly)
def _ensure_nltk():
    """Download the NLTK resources this app needs, if they are missing.

    Safe to call repeatedly: each resource is looked up first and only
    downloaded on LookupError. ``punkt_tab`` is best-effort because only
    newer NLTK releases ship/require it for their tokenizers.
    """
    # (lookup path, download package name, required?)
    resources = [
        ("tokenizers/punkt", "punkt", True),
        ("tokenizers/punkt_tab", "punkt_tab", False),  # newer NLTK tokenizers only
        ("corpora/stopwords", "stopwords", True),
        ("sentiment/vader_lexicon.zip", "vader_lexicon", True),
    ]
    for path, package, required in resources:
        try:
            nltk.data.find(path)
        except LookupError:
            if required:
                nltk.download(package)
            else:
                # Optional resource: swallow download failures so environments
                # without punkt_tab can still start up.
                try:
                    nltk.download(package)
                except Exception:
                    pass

_ensure_nltk()
42
+
43
# Shared NLP resources, built once at import time.
EN_STOPWORDS = set(stopwords.words("english"))
SIA = SentimentIntensityAnalyzer()

# Keyword category mapping (editable)
# Each category lists the lower-case trigger words that vote for it; the
# category with the most trigger-word hits in a claim's tokens wins.
CATEGORY_MAP = {
    "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
    "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
    "Fire/Water/Storm Damage": ["fire","smoke","flames","water","flood","leak","storm","hail","wind","cyclone","lightning"],
    "Property Damage": ["damage","dent","scratch","broken","shattered","glass","windshield","bumper","paint","roof","door","window"],
    "Injury/Medical": ["injury","hurt","hospital","treatment","fracture","bleeding","ambulance","doctor","clinic"],
    "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
    "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
}

# Default contents of the custom-keywords textbox: de-duplicated, sorted union
# of every trigger word plus a few core terms.
# NOTE(review): hyphenated triggers like "rear-end"/"break-in" can never match
# tokens produced by TOKEN_PATTERN below (letters/apostrophes only) — confirm.
DEFAULT_KEYWORDS = sorted(list({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident","theft","damage"}))

TOKEN_PATTERN = re.compile(r"[A-Za-z']+")  # words made of letters and apostrophes only
60
+
61
def tokenize_text(text: str):
    """Lower-cased word tokens of *text*, minus English stopwords and 1-char tokens.

    Non-string input (e.g. NaN from a pandas column) is coerced first:
    missing values become the empty string, everything else goes through str().
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    return [
        word
        for word in (match.lower() for match in TOKEN_PATTERN.findall(text))
        if len(word) > 1 and word not in EN_STOPWORDS
    ]
67
 
68
def count_keywords(token_lists, top_n=10, custom_keywords=None):
    """Return the ``top_n`` most frequent (token, count) pairs across all lists.

    When *custom_keywords* is a non-empty sequence, only those keywords
    (whitespace-trimmed, lower-cased) are counted; otherwise every token
    contributes to the tally.
    """
    from collections import Counter

    allowed = None
    if custom_keywords:
        allowed = {k.strip().lower() for k in custom_keywords if k and k.strip()}

    freq = Counter()
    for tokens in token_lists:
        if allowed is None:
            freq.update(tokens)
        else:
            freq.update(t for t in tokens if t in allowed)
    return freq.most_common(top_n)
80
+
81
def sentiments_for_texts(texts):
    """Score each text with VADER; returns (labels, compound_scores) in input order.

    Labels follow the standard VADER cutoffs: compound >= 0.05 is "Positive",
    <= -0.05 is "Negative", anything in between is "Neutral". Missing values
    (NaN/None) are scored as the empty string.
    """
    labels, compound_scores = [], []
    for text in texts:
        score = SIA.polarity_scores("" if pd.isna(text) else str(text))["compound"]
        compound_scores.append(score)
        if score >= 0.05:
            label = "Positive"
        elif score <= -0.05:
            label = "Negative"
        else:
            label = "Neutral"
        labels.append(label)
    return labels, compound_scores
95
+
96
def assign_categories(token_lists):
    """Assign one category per token list.

    The winner is the CATEGORY_MAP entry whose trigger words overlap the
    token set the most; ties keep the earlier map entry, and zero overlap
    yields "Other/Unclear".
    """
    results = []
    for tokens in token_lists:
        token_set = set(tokens)
        # max() returns the first maximal pair, matching the original
        # "strictly greater wins" tie-breaking on CATEGORY_MAP order.
        best_cat, best_hits = max(
            ((cat, len(token_set.intersection(words))) for cat, words in CATEGORY_MAP.items()),
            key=lambda pair: pair[1],
        )
        results.append(best_cat if best_hits > 0 else "Other/Unclear")
    return results
107
+
108
def bar_chart_top_keywords(freq_pairs):
    """Render (keyword, count) pairs as a bar chart.

    Returns an io.BytesIO positioned at 0 containing a PNG, or None when
    there is nothing to plot.
    """
    if len(freq_pairs) == 0:
        return None
    names, counts = zip(*freq_pairs)
    fig, ax = plt.subplots()
    positions = range(len(names))
    ax.bar(positions, counts)
    ax.set_xticks(list(positions))
    ax.set_xticklabels(names, rotation=45, ha='right')
    ax.set_title("Top Keywords")
    ax.set_xlabel("Keyword")
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
125
+
126
def bar_chart_categories(cats):
    """Bar chart of how often each category occurs in *cats*.

    Returns an io.BytesIO positioned at 0 containing a PNG, or None when
    the input is empty.
    """
    if len(cats) == 0:
        return None
    counts = pd.Series(cats).value_counts()
    fig, ax = plt.subplots()
    positions = range(len(counts.index))
    ax.bar(positions, counts.values)
    ax.set_xticks(list(positions))
    ax.set_xticklabels(counts.index, rotation=45, ha='right')
    ax.set_title("Claim Categories")
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
142
+
143
def pie_chart_sentiment(sent_labels):
    """Pie chart of the share of each sentiment label.

    Returns an io.BytesIO positioned at 0 containing a PNG, or None when
    the input is empty.
    """
    if len(sent_labels) == 0:
        return None
    shares = pd.Series(sent_labels).value_counts()
    fig, ax = plt.subplots()
    ax.pie(shares.values, labels=shares.index, autopct="%1.1f%%", startangle=90)
    ax.set_title("Sentiment Distribution")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
156
+
157
def trend_chart_by_date(dates, compounds):
    """Line chart of VADER compound scores over parsed dates.

    Rows with missing values or unparseable dates are dropped. Returns an
    io.BytesIO positioned at 0 containing a PNG, or None when nothing
    plottable remains (or date parsing fails outright).
    """
    frame = pd.DataFrame({"date": dates, "compound": compounds}).dropna()
    if frame.empty:
        return None
    try:
        frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
        frame = frame.dropna(subset=["date"]).sort_values("date")
    except Exception:
        # Defensive: exotic inputs may still raise despite errors="coerce".
        return None
    if frame.empty:
        return None
    fig, ax = plt.subplots()
    ax.plot(frame["date"], frame["compound"])
    ax.set_title("Sentiment Trend Over Time (compound)")
    ax.set_xlabel("Date")
    ax.set_ylabel("VADER Compound")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
179
+
180
def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
    """Run the full text-analytics pipeline over ``df[text_col]``.

    Returns a 7-tuple: (top-keywords PNG bytes, category PNG bytes,
    sentiment-pie PNG bytes, trend PNG bytes or None, summary DataFrame,
    report text, enriched CSV bytes).

    Raises gr.Error when *text_col* is not a column of *df*.
    """
    if text_col not in df.columns:
        raise gr.Error(f"Selected text column '{text_col}' not found in dataset.")

    # Parse the custom keyword list: comma- or newline-separated.
    # BUG FIX: the pattern was r"[,\\n]+", which in a raw string is the regex
    # class [comma, backslash, letter 'n'] -- it split keywords on every 'n'
    # (e.g. "engine" -> "engi", "e"). It must be r"[,\n]+".
    custom_keywords = None
    if custom_keywords_text:
        parts = re.split(r"[,\n]+", custom_keywords_text)
        custom_keywords = [p.strip().lower() for p in parts if p.strip()]

    token_lists = df[text_col].apply(tokenize_text).tolist()
    # The custom list only restricts counting when the checkbox is set.
    freq_pairs = count_keywords(token_lists, top_n=top_n, custom_keywords=(custom_keywords if use_custom_only else None))
    sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
    categories = assign_categories(token_lists)

    # Enrich a copy of the input with the per-row analytics.
    out_df = df.copy()
    out_df["tokens"] = token_lists
    out_df["sentiment"] = sent_labels
    out_df["compound"] = compounds
    out_df["category"] = categories

    bar_buf = bar_chart_top_keywords(freq_pairs)
    cat_buf = bar_chart_categories(categories)
    pie_buf = pie_chart_sentiment(sent_labels)
    trend_buf = None
    if date_col and date_col in df.columns:
        trend_buf = trend_chart_by_date(df[date_col], compounds)

    # Plain-text summary report.
    cat_counts = out_df["category"].value_counts().head(5)
    cat_lines = [f"- {idx}: {val}" for idx, val in cat_counts.items()]
    pos_rate = (out_df["sentiment"] == "Positive").mean()
    neg_rate = (out_df["sentiment"] == "Negative").mean()
    neu_rate = (out_df["sentiment"] == "Neutral").mean()
    report = [
        "Common Claim Categories (Top 5):",
        *cat_lines,
        "",
        f"Sentiment: {pos_rate:.1%} Positive | {neu_rate:.1%} Neutral | {neg_rate:.1%} Negative",
    ]
    if len(freq_pairs) > 0:
        top_kw = ", ".join([f"{k}({v})" for k, v in freq_pairs[:10]])
        report += ["", f"Top Keywords: {top_kw}"]
    report_text = "\n".join(report)

    csv_bytes = out_df.to_csv(index=False).encode("utf-8")
    return (
        (None if bar_buf is None else bar_buf.getvalue()),
        (None if cat_buf is None else cat_buf.getvalue()),
        (None if pie_buf is None else pie_buf.getvalue()),
        (None if trend_buf is None else trend_buf.getvalue()),
        # NOTE(review): value_counts over the float "compound" column yields one
        # row per distinct score -- confirm this is the intended summary shape.
        out_df[["sentiment", "compound", "category"]].value_counts().reset_index(name="count"),
        report_text,
        csv_bytes,
    )
231
+
232
def infer_text_columns(df: pd.DataFrame):
    """Object-dtype column names, ordered by longest average string length first.

    The average is computed over (up to) the first 50 rows after stringifying;
    an empty column scores 0. Ties keep the original column order.
    """
    scored = []
    for column in df.columns:
        if df[column].dtype != "object":
            continue
        sample = df[column].astype(str).head(50).tolist()
        mean_len = np.mean([len(value) for value in sample]) if sample else 0
        scored.append((column, mean_len))
    # Stable sort, so equal-length columns stay in dataframe order.
    scored.sort(key=lambda item: item[1], reverse=True)
    return [column for column, _ in scored]
241
+
242
# ---- Gradio UI: upload a CSV, pick columns, run the analytics pipeline. ----
with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo:
    gr.Markdown("# 🧠 Insurance Claim Text Analytics\nAnalyze claim descriptions for keywords, sentiment, and categories.")

    with gr.Row():
        with gr.Column():
            # Input controls.
            data = gr.File(label="Upload CSV (UTF-8)", file_count="single", file_types=[".csv"])
            text_col = gr.Dropdown(label="Text column (claim description)", choices=[], value=None)
            date_col = gr.Dropdown(label="Optional date column (for trend)", choices=[], value=None, allow_custom_value=True)
            top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
            use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
            custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
            run_btn = gr.Button("Run Analysis 🚀", variant="primary")
        with gr.Column():
            # Output panels.
            # NOTE(review): analyze() returns raw PNG bytes, but gr.Image(type="numpy")
            # expects numpy arrays (or PIL/filepath inputs) -- confirm these render in
            # the pinned Gradio version; otherwise switch to type="filepath" or gr.Plot.
            bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="numpy")
            cat_img = gr.Image(label="Claim Categories (Bar Chart)", type="numpy")
            pie_img = gr.Image(label="Sentiment Distribution (Pie Chart)", type="numpy")
            trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="numpy")
            table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
            report = gr.Textbox(label="Auto-generated Report", lines=10)
            export = gr.File(label="Download Enriched CSV")

    def on_file_upload(fileobj):
        # Populate both dropdowns from the uploaded CSV's columns; clear them
        # when the file is removed. The longest text column is pre-selected
        # as the likely claim-description column.
        if fileobj is None:
            return gr.update(choices=[], value=None), gr.update(choices=[], value=None)
        df = pd.read_csv(fileobj.name)
        cols = df.columns.tolist()
        text_candidates = infer_text_columns(df)
        if not text_candidates:
            text_candidates = [c for c in cols if df[c].dtype == "object"]
        text_value = text_candidates[0] if text_candidates else (cols[0] if cols else None)
        return (
            gr.update(choices=text_candidates or cols, value=text_value),
            gr.update(choices=cols, value=None),
        )

    data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])

    def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text):
        # Re-read the CSV and run the full pipeline; the enriched CSV is
        # written to the working directory so gr.File can serve it.
        if fileobj is None:
            raise gr.Error("Please upload a CSV file.")
        df = pd.read_csv(fileobj.name)
        bar_png, cat_png, pie_png, trend_png, summary_df, report_text, csv_bytes = analyze(
            df, text_column, date_column, int(topn), custom_only, custom_text
        )
        export_path = "enriched_claims.csv"
        with open(export_path, "wb") as f:
            f.write(csv_bytes)
        return bar_png, cat_png, pie_png, trend_png, summary_df, report_text, export_path

    run_btn.click(
        run_pipeline,
        inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text],
        outputs=[bar_img, cat_img, pie_img, trend_img, table, report, export],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- fastapi
2
- uvicorn[standard]
3
- nltk
4
- pandas
5
- python-docx
 
 
1
+ gradio==4.44.1
2
+ pandas==2.2.2
3
+ numpy==1.26.4
4
+ matplotlib==3.8.4
5
+ nltk==3.8.1
6
+ scikit-learn==1.4.2