# app.py — Insurance Claim Text Analytics (Hugging Face Space upload, commit d5a1755)
import os
import io
import re
import json
import uuid
import sys
import traceback
import numpy as np
import pandas as pd
# Force headless backend before importing pyplot
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import gradio as gr
# -------------------------
# NLTK + VADER
# -------------------------
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
def _ensure_nltk():
    """Ensure the NLTK data packages this app needs are installed.

    Downloads are quiet to keep the server logs clean. punkt_tab only
    exists on newer NLTK releases, so its download failure is tolerated.
    """
    # (lookup path for nltk.data.find, download package, tolerate download failure?)
    required = (
        ("tokenizers/punkt", "punkt", False),
        ("tokenizers/punkt_tab", "punkt_tab", True),
        ("corpora/stopwords", "stopwords", False),
        ("sentiment/vader_lexicon.zip", "vader_lexicon", False),
    )
    for lookup, package, best_effort in required:
        try:
            nltk.data.find(lookup)
        except LookupError:
            if best_effort:
                try:
                    nltk.download(package, quiet=True)
                except Exception:
                    pass  # optional resource; keep going without it
            else:
                nltk.download(package, quiet=True)


_ensure_nltk()
# English stopword list used by tokenize_text; degrade to an empty set when
# the corpus is unavailable (e.g. the download failed in a restricted env).
try:
    EN_STOPWORDS = set(stopwords.words("english"))
except Exception:
    EN_STOPWORDS = set()
def _init_sia():
    """Build a VADER analyzer, retrying once after re-downloading the lexicon.

    Falls back to a stub whose polarity_scores always reports a neutral
    compound of 0.0, so the app keeps working without the lexicon.
    """
    try:
        return SentimentIntensityAnalyzer()
    except Exception:
        pass
    try:
        nltk.download("vader_lexicon", quiet=True)
        return SentimentIntensityAnalyzer()
    except Exception:
        class _NeutralAnalyzer:
            # Minimal stand-in matching the VADER interface we use.
            def polarity_scores(self, text):
                return {"compound": 0.0}
        return _NeutralAnalyzer()


SIA = _init_sia()
# -------------------------
# Config
# -------------------------
# Keyword lexicon for rule-based claim categorization: a claim is assigned to
# the category whose word list overlaps its token set the most.
CATEGORY_MAP = {
    "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
    "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
    "Fire/Water/Storm Damage": ["fire","smoke","flames","water","flood","leak","storm","hail","wind","cyclone","lightning"],
    "Property Damage": ["damage","dent","scratch","broken","shattered","glass","windshield","bumper","paint","roof","door","window"],
    "Injury/Medical": ["injury","hurt","hospital","treatment","fracture","bleeding","ambulance","doctor","clinic"],
    "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
    "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
}
# Default suggestions pre-filled in the custom-keywords textbox: the union of
# all category words plus a few common claim terms, sorted alphabetically.
DEFAULT_KEYWORDS = sorted(list({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident","theft","damage"}))
# Word tokenizer: runs of ASCII letters/apostrophes (keeps contractions like "don't").
TOKEN_PATTERN = re.compile(r"[A-Za-z']+")
# -------------------------
# Utils
# -------------------------
def debug(msg):
    """Write *msg* to stderr and flush immediately (stdout may be buffered by the host)."""
    sys.stderr.write(f"{msg}\n")
    sys.stderr.flush()
def tokenize_text(text: str):
    """Lowercase and tokenize *text*, dropping stopwords and 1-char tokens.

    Non-string input (e.g. NaN coming out of pandas) is coerced to a string,
    with missing values treated as empty text. Returns a list of tokens.
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    return [
        tok
        for tok in (m.lower() for m in TOKEN_PATTERN.findall(text))
        if len(tok) > 1 and tok not in EN_STOPWORDS
    ]
def count_keywords(token_lists, top_n=10, custom_keywords=None):
    """Return the top_n most common (token, count) pairs across all token lists.

    When custom_keywords is given and non-empty, only those keywords
    (trimmed, lowercased) are tallied; otherwise every token counts.
    """
    from collections import Counter

    allowed = None
    if custom_keywords:
        allowed = {kw.strip().lower() for kw in custom_keywords if kw and kw.strip()}
    tally = Counter()
    for tokens in token_lists:
        if allowed is None:
            tally.update(tokens)
        else:
            tally.update(t for t in tokens if t in allowed)
    return tally.most_common(top_n)
def sentiments_for_texts(texts):
    """Score each text with VADER; return parallel lists (labels, compounds).

    Labels use the conventional VADER cutoffs: compound >= 0.05 is
    "Positive", <= -0.05 is "Negative", otherwise "Neutral". Any scoring
    failure is treated as neutral (compound 0.0).
    """
    labels = []
    compounds = []
    for text in texts:
        try:
            scores = SIA.polarity_scores("" if pd.isna(text) else str(text))
            score = float(scores.get("compound", 0.0))
        except Exception:
            score = 0.0
        compounds.append(score)
        if score >= 0.05:
            labels.append("Positive")
        elif score <= -0.05:
            labels.append("Negative")
        else:
            labels.append("Neutral")
    return labels, compounds
def assign_categories(token_lists):
    """Assign each token list to the CATEGORY_MAP entry with the most hits.

    Ties keep the earlier category in CATEGORY_MAP's insertion order
    (strict '>' comparison); zero hits yields "Other/Unclear".
    """
    results = []
    for tokens in token_lists:
        token_set = set(tokens)
        best_category = None
        best_score = 0
        for category, keywords in CATEGORY_MAP.items():
            score = len(token_set.intersection(keywords))
            if score > best_score:
                best_category, best_score = category, score
        results.append(best_category if best_score else "Other/Unclear")
    return results
def _save_fig_to_path(fig, name_prefix):
    """Persist *fig* as a PNG under charts/ with a unique filename; return the path."""
    os.makedirs("charts", exist_ok=True)
    out_path = os.path.join("charts", f"{name_prefix}_{uuid.uuid4().hex}.png")
    fig.savefig(out_path, format="png", dpi=150, bbox_inches="tight")
    # Close the figure so long-running headless servers don't leak figures.
    plt.close(fig)
    return out_path
def bar_chart_top_keywords(freq_pairs):
    """Render a bar chart of (keyword, count) pairs; return PNG path or None if empty."""
    if not freq_pairs:
        return None
    keywords, counts = zip(*freq_pairs)
    positions = range(len(keywords))
    fig = plt.figure()
    plt.bar(positions, counts)
    plt.xticks(positions, keywords, rotation=45, ha='right')
    plt.title("Top Keywords")
    plt.xlabel("Keyword")
    plt.ylabel("Frequency")
    plt.tight_layout()
    return _save_fig_to_path(fig, "top_keywords")
def bar_chart_categories(cats):
    """Render a bar chart of claim-category counts; return PNG path or None if empty."""
    if not cats:
        return None
    counts = pd.Series(cats).value_counts()
    positions = range(len(counts))
    fig = plt.figure()
    plt.bar(positions, counts.values)
    plt.xticks(positions, counts.index, rotation=45, ha='right')
    plt.title("Claim Categories")
    plt.xlabel("Category")
    plt.ylabel("Count")
    plt.tight_layout()
    return _save_fig_to_path(fig, "categories")
def pie_chart_sentiment(sent_labels):
    """Render a pie chart of sentiment label shares; return PNG path or None if empty."""
    if not sent_labels:
        return None
    counts = pd.Series(sent_labels).value_counts()
    fig = plt.figure()
    plt.pie(counts.values, labels=counts.index, autopct="%1.1f%%", startangle=90)
    plt.title("Sentiment Distribution")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_pie")
def trend_chart_by_date(dates, compounds):
    """Plot VADER compound scores over time; return PNG path or None.

    Rows with missing values or unparseable dates are dropped; when no
    valid (date, score) pairs remain, no chart is produced.
    """
    frame = pd.DataFrame({"date": dates, "compound": compounds}).dropna()
    if frame.empty:
        return None
    try:
        frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
        frame = frame.dropna(subset=["date"]).sort_values("date")
    except Exception:
        return None
    if frame.empty:
        return None
    fig = plt.figure()
    plt.plot(frame["date"], frame["compound"])
    plt.title("Sentiment Trend Over Time (compound)")
    plt.xlabel("Date")
    plt.ylabel("VADER Compound")
    plt.tight_layout()
    return _save_fig_to_path(fig, "sentiment_trend")
def read_csv_safe(path):
    """Read a CSV file, trying several encodings before giving up.

    Attempts pandas' default decoding first, then utf-8, utf-8-sig
    (strips a BOM), and finally latin-1. Re-raises the last failure when
    every attempt fails.
    """
    last_err = None
    for enc in (None, "utf-8", "utf-8-sig", "latin-1"):
        try:
            return pd.read_csv(path) if enc is None else pd.read_csv(path, encoding=enc)
        except Exception as exc:
            last_err = exc
    raise last_err
def validate_schema(df, text_col, date_col):
    """Raise gr.Error describing schema problems; no-op when *df* is usable.

    Checks that text_col exists and holds at least one non-empty value,
    and that date_col (when provided) exists in the frame.
    """
    issues = []
    if text_col not in df.columns:
        issues.append(f"- Text column '{text_col}' not found.")
    else:
        # Stringified NaNs read back as "nan"; blank them before checking.
        cleaned = df[text_col].astype(str).str.strip().replace({"nan": ""}).astype(str)
        if (cleaned == "").all():
            issues.append(f"- Text column '{text_col}' has no non-empty values.")
    if date_col and date_col not in df.columns:
        issues.append(f"- Date column '{date_col}' not found.")
    if issues:
        raise gr.Error("Schema check failed:\n" + "\n".join(issues))
def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
    """Run the full claim-text analytics pipeline over *df*.

    Parameters:
        df: DataFrame containing at least the text column.
        text_col: name of the column holding claim descriptions.
        date_col: optional date column name for the trend chart (may be None/"").
        top_n: number of keywords shown in the keyword bar chart.
        use_custom_only: when True, only keywords from custom_keywords_text
            are counted; otherwise all tokens are tallied.
        custom_keywords_text: comma- or newline-separated keyword list.

    Returns:
        (bar_path, cat_path, pie_path, trend_path, summary_df, report_text,
        csv_bytes); chart paths may be None when there is nothing to plot.

    Raises:
        gr.Error (via validate_schema) when the frame fails the schema check.
    """
    validate_schema(df, text_col, date_col)

    custom_keywords = None
    if custom_keywords_text:
        # BUGFIX: was r"[,\\n]+" — in a raw string that character class
        # matches comma, backslash, and the literal letter 'n', so keywords
        # like "accident" were split apart. Split on commas/newlines only.
        parts = re.split(r"[,\n]+", custom_keywords_text)
        custom_keywords = [p.strip().lower() for p in parts if p.strip()]

    token_lists = df[text_col].apply(tokenize_text).tolist()
    freq_pairs = count_keywords(
        token_lists, top_n=top_n,
        custom_keywords=(custom_keywords if use_custom_only else None),
    )
    sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
    categories = assign_categories(token_lists)

    # Enriched copy of the input, offered back as a downloadable CSV.
    out_df = df.copy()
    out_df["tokens"] = token_lists
    out_df["sentiment"] = sent_labels
    out_df["compound"] = compounds
    out_df["category"] = categories

    bar_path = bar_chart_top_keywords(freq_pairs)
    cat_path = bar_chart_categories(categories)
    pie_path = pie_chart_sentiment(sent_labels)
    trend_path = None
    if date_col and date_col in df.columns:
        trend_path = trend_chart_by_date(df[date_col], compounds)

    # Plain-text report: top categories, sentiment mix, then top keywords.
    cat_counts = out_df["category"].value_counts().head(5)
    cat_lines = [f"- {idx}: {val}" for idx, val in cat_counts.items()]
    pos_rate = (out_df["sentiment"] == "Positive").mean()
    neg_rate = (out_df["sentiment"] == "Negative").mean()
    neu_rate = (out_df["sentiment"] == "Neutral").mean()
    report = [
        "Common Claim Categories (Top 5):",
        *cat_lines,
        "",
        f"Sentiment: {pos_rate:.1%} Positive | {neu_rate:.1%} Neutral | {neg_rate:.1%} Negative",
    ]
    if freq_pairs:
        top_kw = ", ".join([f"{k}({v})" for k, v in freq_pairs[:10]])
        report += ["", f"Top Keywords: {top_kw}"]
    report_text = "\n".join(report)

    csv_bytes = out_df.to_csv(index=False).encode("utf-8")
    return (
        bar_path,
        cat_path,
        pie_path,
        trend_path,
        out_df[["sentiment", "compound", "category"]].value_counts().reset_index(name="count"),
        report_text,
        csv_bytes,
    )
def infer_text_columns(df: pd.DataFrame):
    """Return object-dtype column names ordered by decreasing mean string length.

    The column with the longest text (averaged over the first 50 rows) is
    the best guess for the claim-description field.
    """
    scored = []
    for col in df.columns:
        if df[col].dtype == "object":
            sample = df[col].astype(str).head(50).tolist()
            mean_len = np.mean([len(s) for s in sample]) if sample else 0
            scored.append((col, mean_len))
    ordered = sorted(scored, key=lambda item: item[1], reverse=True)
    return [col for col, _ in ordered]
# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo:
    gr.Markdown("# 🧠 Insurance Claim Text Analytics\nAnalyze claim descriptions for keywords, sentiment, and categories.")
    with gr.Row():
        with gr.Column():
            # Input controls.
            data = gr.File(label="Upload CSV (UTF-8)", file_count="single", file_types=[".csv"])
            text_col = gr.Dropdown(label="Text column (claim description)", choices=[], value=None)
            date_col = gr.Dropdown(label="Optional date column (for trend)", choices=[], value=None, allow_custom_value=True)
            top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
            use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
            custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
            debug_mode = gr.Checkbox(label="Debug mode (show schema & sample rows)", value=False)
            run_btn = gr.Button("Run Analysis 🚀", variant="primary")
        with gr.Column():
            # Output widgets.
            bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="filepath")
            cat_img = gr.Image(label="Claim Categories (Bar Chart)", type="filepath")
            pie_img = gr.Image(label="Sentiment Distribution (Pie Chart)", type="filepath")
            trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="filepath")
            table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
            report = gr.Textbox(label="Auto-generated Report", lines=10)
            debug_out = gr.Textbox(label="Debug info", lines=8, interactive=False)
            export = gr.File(label="Download Enriched CSV")

    def on_file_upload(fileobj):
        """Populate both column dropdowns from a freshly uploaded CSV."""
        if fileobj is None:
            return gr.update(choices=[], value=None), gr.update(choices=[], value=None)
        df = read_csv_safe(fileobj.name)
        cols = df.columns.tolist()
        text_candidates = infer_text_columns(df)
        if not text_candidates:
            text_candidates = [c for c in cols if df[c].dtype == "object"]
        text_value = text_candidates[0] if text_candidates else (cols[0] if cols else None)
        return (
            gr.update(choices=text_candidates or cols, value=text_value),
            gr.update(choices=cols, value=None),
        )

    data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])

    def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text, dbg):
        """Run analyze() on the uploaded CSV and fan results out to the UI widgets."""
        if fileobj is None:
            raise gr.Error("Please upload a CSV file.")
        try:
            df = read_csv_safe(fileobj.name)
            if dbg:
                debug_text = "\n".join([
                    "Columns & dtypes:",
                    str(df.dtypes),
                    "",
                    "Sample rows:",
                    str(df.head(5)),
                ])
            else:
                debug_text = ""
            bar_path, cat_path, pie_path, trend_path, summary_df, report_text, csv_bytes = analyze(
                df, text_column, date_column, int(topn), custom_only, custom_text
            )
            export_path = "enriched_claims.csv"
            with open(export_path, "wb") as f:
                f.write(csv_bytes)
            return bar_path, cat_path, pie_path, trend_path, summary_df, report_text, debug_text, export_path
        except Exception as e:
            # Log the full traceback server-side; surface a short message in the UI.
            tb = traceback.format_exc()
            debug(f"[ERROR] {type(e).__name__}: {e}\n{tb}")
            raise gr.Error(f"RuntimeError: {type(e).__name__}: {e}")

    run_btn.click(
        run_pipeline,
        inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text, debug_mode],
        outputs=[bar_img, cat_img, pie_img, trend_img, table, report, debug_out, export],
    )
if __name__ == "__main__":
    # Spaces-friendly launch: bind all interfaces on the platform-provided port.
    listen_port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)