gopalaKrishna1236 committed on
Commit
fe4c97b
·
verified ·
1 Parent(s): fd438c4

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +289 -83
  2. requirements.txt +6 -5
app.py CHANGED
@@ -1,92 +1,298 @@
# --- Legacy FastAPI service (removed by this commit in favor of the Gradio app) ---
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List, Dict, Any
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Ensure needed NLTK data is present in your runtime/environment
# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet'); nltk.download('vader_lexicon')

app = FastAPI(title="Insurance Claim Text Analytics API")

# Shared NLP resources, initialized once at import time.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
sia = SentimentIntensityAnalyzer()

# Maps a single trigger token (post-lemmatization) to a claim category label.
category_map = {
    'accident': 'Accident',
    'collision': 'Accident',
    'crash': 'Accident',
    'damage': 'Damage',
    'fire': 'Damage',
    'theft': 'Theft',
    'stolen': 'Theft',
    'vandal': 'Vandalism',
    'flood': 'Natural Disaster',
    'storm': 'Natural Disaster',
    'injury': 'Injury',
    'breakdown': 'Mechanical',
    'engine': 'Mechanical',
    'water': 'Damage',
    'laptop': 'Theft',
    'bike': 'Theft',
    'car': 'Accident'
}

class PredictRequest(BaseModel):
    """Request body for /predict: claim text plus how many keywords to return."""
    text: str
    top_k: int = 10

class PredictResponse(BaseModel):
    """Response body for /predict: echoed text plus keyword, category and sentiment results."""
    text: str
    keywords: List[Dict[str, Any]]
    categories: List[str]
    sentiment: Dict[str, Any]

def clean_text(text: str) -> str:
    """Lower-case *text* and strip URLs, emails, digits and non-letter characters."""
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # URLs
    text = re.sub(r'\S+@\S+', '', text)  # email-like tokens
    text = re.sub(r'\d+', ' ', text)  # digits
    text = re.sub(r'[^a-z\s]', ' ', text)  # anything but lower-case letters / whitespace
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace
    return text

def tokenize(text: str):
    """Clean, word-tokenize, stopword-filter (and drop tokens of length <= 2), then lemmatize."""
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return tokens

@app.post("/predict", response_model=PredictResponse)
def predict(req: PredictRequest):
    """Analyze one claim text: top keywords, rule-based categories, VADER sentiment.

    Raises HTTP 400 for empty or whitespace-only input.
    """
    if not req.text or not req.text.strip():
        raise HTTPException(status_code=400, detail="Empty text")
    tokens = tokenize(req.text)
    freq = Counter(tokens)
    topk = freq.most_common(req.top_k)
    # Rule-based categorization: every distinct token that is a key in
    # category_map contributes its category to the result set.
    cats = set()
    for t in set(tokens):
        if t in category_map:
            cats.add(category_map[t])
    # Sentiment runs on the raw (uncleaned) text; standard VADER +/-0.05 thresholds.
    scores = sia.polarity_scores(req.text)
    comp = scores['compound']
    if comp >= 0.05:
        label = 'positive'
    elif comp <= -0.05:
        label = 'negative'
    else:
        label = 'neutral'
    return {
        "text": req.text,
        "keywords": [{"keyword": k, "count": c} for k, c in topk],
        "categories": list(cats),
        "sentiment": {"neg": scores['neg'], "neu": scores['neu'], "pos": scores['pos'], "compound": comp, "label": label}
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
 
2
+ import os
3
+ import io
 
4
  import re
5
+ import json
6
+ import numpy as np
7
+ import pandas as pd
8
+ import matplotlib.pyplot as plt
9
+
10
+ import gradio as gr
11
+
12
+ # NLTK setup
13
  import nltk
14
  from nltk.corpus import stopwords
15
  from nltk.tokenize import word_tokenize
16
+ from nltk.sentiment import SentimentIntensityAnalyzer
17
+
18
# One-time downloads (safe to call repeatedly)
def _ensure_nltk():
    """Download the NLTK resources this app needs, if they are missing.

    Safe to call repeatedly: each resource is looked up first and only
    downloaded on LookupError. ``punkt_tab`` is best-effort because only
    newer NLTK releases ship/require it for their tokenizers.
    """
    # (lookup path, download package name, required?)
    resources = [
        ("tokenizers/punkt", "punkt", True),
        ("tokenizers/punkt_tab", "punkt_tab", False),  # newer NLTK tokenizers only
        ("corpora/stopwords", "stopwords", True),
        ("sentiment/vader_lexicon.zip", "vader_lexicon", True),
    ]
    for path, package, required in resources:
        try:
            nltk.data.find(path)
        except LookupError:
            if required:
                nltk.download(package)
            else:
                # Optional resource: swallow download failures so environments
                # without punkt_tab can still start up.
                try:
                    nltk.download(package)
                except Exception:
                    pass

_ensure_nltk()
42
+
43
# Shared NLP resources, built once at import time.
EN_STOPWORDS = set(stopwords.words("english"))
SIA = SentimentIntensityAnalyzer()

# Keyword category mapping (editable)
# Each category lists the lower-case trigger words that vote for it; the
# category with the most trigger-word hits in a claim's tokens wins.
CATEGORY_MAP = {
    "Accident": ["accident","collision","crash","rear-end","bump","skid","impact","hit","fender"],
    "Theft": ["theft","stolen","robbery","burglary","break-in","snatched","pickpocket","hijack"],
    "Fire/Water/Storm Damage": ["fire","smoke","flames","water","flood","leak","storm","hail","wind","cyclone","lightning"],
    "Property Damage": ["damage","dent","scratch","broken","shattered","glass","windshield","bumper","paint","roof","door","window"],
    "Injury/Medical": ["injury","hurt","hospital","treatment","fracture","bleeding","ambulance","doctor","clinic"],
    "Liability": ["liability","lawsuit","negligence","fault","third-party","claimant"],
    "Total Loss/Write-off": ["totalled","totaled","write-off","beyond","salvage"],
}

# Default contents of the custom-keywords textbox: de-duplicated, sorted union
# of every trigger word plus a few core terms.
# NOTE(review): hyphenated triggers like "rear-end"/"break-in" can never match
# tokens produced by TOKEN_PATTERN below (letters/apostrophes only) — confirm.
DEFAULT_KEYWORDS = sorted(list({w for ws in CATEGORY_MAP.values() for w in ws} | {"accident","theft","damage"}))

TOKEN_PATTERN = re.compile(r"[A-Za-z']+")  # words made of letters and apostrophes only
60
+
61
def tokenize_text(text: str):
    """Lower-cased word tokens of *text*, minus English stopwords and 1-char tokens.

    Non-string input (e.g. NaN from a pandas column) is coerced first:
    missing values become the empty string, everything else goes through str().
    """
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    return [
        word
        for word in (match.lower() for match in TOKEN_PATTERN.findall(text))
        if len(word) > 1 and word not in EN_STOPWORDS
    ]
67
 
68
def count_keywords(token_lists, top_n=10, custom_keywords=None):
    """Return the ``top_n`` most frequent (token, count) pairs across all lists.

    When *custom_keywords* is a non-empty sequence, only those keywords
    (whitespace-trimmed, lower-cased) are counted; otherwise every token
    contributes to the tally.
    """
    from collections import Counter

    allowed = None
    if custom_keywords:
        allowed = {k.strip().lower() for k in custom_keywords if k and k.strip()}

    freq = Counter()
    for tokens in token_lists:
        if allowed is None:
            freq.update(tokens)
        else:
            freq.update(t for t in tokens if t in allowed)
    return freq.most_common(top_n)
80
+
81
def sentiments_for_texts(texts):
    """Score each text with VADER; returns (labels, compound_scores) in input order.

    Labels follow the standard VADER cutoffs: compound >= 0.05 is "Positive",
    <= -0.05 is "Negative", anything in between is "Neutral". Missing values
    (NaN/None) are scored as the empty string.
    """
    labels, compound_scores = [], []
    for text in texts:
        score = SIA.polarity_scores("" if pd.isna(text) else str(text))["compound"]
        compound_scores.append(score)
        if score >= 0.05:
            label = "Positive"
        elif score <= -0.05:
            label = "Negative"
        else:
            label = "Neutral"
        labels.append(label)
    return labels, compound_scores
95
+
96
def assign_categories(token_lists):
    """Assign one category per token list.

    The winner is the CATEGORY_MAP entry whose trigger words overlap the
    token set the most; ties keep the earlier map entry, and zero overlap
    yields "Other/Unclear".
    """
    results = []
    for tokens in token_lists:
        token_set = set(tokens)
        # max() returns the first maximal pair, matching the original
        # "strictly greater wins" tie-breaking on CATEGORY_MAP order.
        best_cat, best_hits = max(
            ((cat, len(token_set.intersection(words))) for cat, words in CATEGORY_MAP.items()),
            key=lambda pair: pair[1],
        )
        results.append(best_cat if best_hits > 0 else "Other/Unclear")
    return results
107
+
108
def bar_chart_top_keywords(freq_pairs):
    """Render (keyword, count) pairs as a bar chart.

    Returns an io.BytesIO positioned at 0 containing a PNG, or None when
    there is nothing to plot.
    """
    if len(freq_pairs) == 0:
        return None
    names, counts = zip(*freq_pairs)
    fig, ax = plt.subplots()
    positions = range(len(names))
    ax.bar(positions, counts)
    ax.set_xticks(list(positions))
    ax.set_xticklabels(names, rotation=45, ha='right')
    ax.set_title("Top Keywords")
    ax.set_xlabel("Keyword")
    ax.set_ylabel("Frequency")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
125
+
126
def bar_chart_categories(cats):
    """Bar chart of how often each category occurs in *cats*.

    Returns an io.BytesIO positioned at 0 containing a PNG, or None when
    the input is empty.
    """
    if len(cats) == 0:
        return None
    counts = pd.Series(cats).value_counts()
    fig, ax = plt.subplots()
    positions = range(len(counts.index))
    ax.bar(positions, counts.values)
    ax.set_xticks(list(positions))
    ax.set_xticklabels(counts.index, rotation=45, ha='right')
    ax.set_title("Claim Categories")
    ax.set_xlabel("Category")
    ax.set_ylabel("Count")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
142
+
143
def pie_chart_sentiment(sent_labels):
    """Pie chart of the share of each sentiment label.

    Returns an io.BytesIO positioned at 0 containing a PNG, or None when
    the input is empty.
    """
    if len(sent_labels) == 0:
        return None
    shares = pd.Series(sent_labels).value_counts()
    fig, ax = plt.subplots()
    ax.pie(shares.values, labels=shares.index, autopct="%1.1f%%", startangle=90)
    ax.set_title("Sentiment Distribution")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
156
+
157
def trend_chart_by_date(dates, compounds):
    """Line chart of VADER compound scores over parsed dates.

    Rows with missing values or unparseable dates are dropped. Returns an
    io.BytesIO positioned at 0 containing a PNG, or None when nothing
    plottable remains (or date parsing fails outright).
    """
    frame = pd.DataFrame({"date": dates, "compound": compounds}).dropna()
    if frame.empty:
        return None
    try:
        frame["date"] = pd.to_datetime(frame["date"], errors="coerce")
        frame = frame.dropna(subset=["date"]).sort_values("date")
    except Exception:
        # Defensive: exotic inputs may still raise despite errors="coerce".
        return None
    if frame.empty:
        return None
    fig, ax = plt.subplots()
    ax.plot(frame["date"], frame["compound"])
    ax.set_title("Sentiment Trend Over Time (compound)")
    ax.set_xlabel("Date")
    ax.set_ylabel("VADER Compound")
    fig.tight_layout()
    png = io.BytesIO()
    fig.savefig(png, format="png", dpi=150, bbox_inches="tight")
    plt.close(fig)  # free the figure; pyplot keeps figures alive otherwise
    png.seek(0)
    return png
179
+
180
def analyze(df, text_col, date_col, top_n, use_custom_only, custom_keywords_text):
    """Run the full text-analytics pipeline over ``df[text_col]``.

    Returns a 7-tuple: (top-keywords PNG bytes, category PNG bytes,
    sentiment-pie PNG bytes, trend PNG bytes or None, summary DataFrame,
    report text, enriched CSV bytes).

    Raises gr.Error when *text_col* is not a column of *df*.
    """
    if text_col not in df.columns:
        raise gr.Error(f"Selected text column '{text_col}' not found in dataset.")

    # Parse the custom keyword list: comma- or newline-separated.
    # BUG FIX: the pattern was r"[,\\n]+", which in a raw string is the regex
    # class [comma, backslash, letter 'n'] -- it split keywords on every 'n'
    # (e.g. "engine" -> "engi", "e"). It must be r"[,\n]+".
    custom_keywords = None
    if custom_keywords_text:
        parts = re.split(r"[,\n]+", custom_keywords_text)
        custom_keywords = [p.strip().lower() for p in parts if p.strip()]

    token_lists = df[text_col].apply(tokenize_text).tolist()
    # The custom list only restricts counting when the checkbox is set.
    freq_pairs = count_keywords(token_lists, top_n=top_n, custom_keywords=(custom_keywords if use_custom_only else None))
    sent_labels, compounds = sentiments_for_texts(df[text_col].tolist())
    categories = assign_categories(token_lists)

    # Enrich a copy of the input with the per-row analytics.
    out_df = df.copy()
    out_df["tokens"] = token_lists
    out_df["sentiment"] = sent_labels
    out_df["compound"] = compounds
    out_df["category"] = categories

    bar_buf = bar_chart_top_keywords(freq_pairs)
    cat_buf = bar_chart_categories(categories)
    pie_buf = pie_chart_sentiment(sent_labels)
    trend_buf = None
    if date_col and date_col in df.columns:
        trend_buf = trend_chart_by_date(df[date_col], compounds)

    # Plain-text summary report.
    cat_counts = out_df["category"].value_counts().head(5)
    cat_lines = [f"- {idx}: {val}" for idx, val in cat_counts.items()]
    pos_rate = (out_df["sentiment"] == "Positive").mean()
    neg_rate = (out_df["sentiment"] == "Negative").mean()
    neu_rate = (out_df["sentiment"] == "Neutral").mean()
    report = [
        "Common Claim Categories (Top 5):",
        *cat_lines,
        "",
        f"Sentiment: {pos_rate:.1%} Positive | {neu_rate:.1%} Neutral | {neg_rate:.1%} Negative",
    ]
    if len(freq_pairs) > 0:
        top_kw = ", ".join([f"{k}({v})" for k, v in freq_pairs[:10]])
        report += ["", f"Top Keywords: {top_kw}"]
    report_text = "\n".join(report)

    csv_bytes = out_df.to_csv(index=False).encode("utf-8")
    return (
        (None if bar_buf is None else bar_buf.getvalue()),
        (None if cat_buf is None else cat_buf.getvalue()),
        (None if pie_buf is None else pie_buf.getvalue()),
        (None if trend_buf is None else trend_buf.getvalue()),
        # NOTE(review): value_counts over the float "compound" column yields one
        # row per distinct score -- confirm this is the intended summary shape.
        out_df[["sentiment", "compound", "category"]].value_counts().reset_index(name="count"),
        report_text,
        csv_bytes,
    )
231
+
232
def infer_text_columns(df: pd.DataFrame):
    """Object-dtype column names, ordered by longest average string length first.

    The average is computed over (up to) the first 50 rows after stringifying;
    an empty column scores 0. Ties keep the original column order.
    """
    scored = []
    for column in df.columns:
        if df[column].dtype != "object":
            continue
        sample = df[column].astype(str).head(50).tolist()
        mean_len = np.mean([len(value) for value in sample]) if sample else 0
        scored.append((column, mean_len))
    # Stable sort, so equal-length columns stay in dataframe order.
    scored.sort(key=lambda item: item[1], reverse=True)
    return [column for column, _ in scored]
241
+
242
# ---- Gradio UI: upload a CSV, pick columns, run the analytics pipeline. ----
with gr.Blocks(title="Insurance Claim Text Analytics", fill_height=True) as demo:
    gr.Markdown("# 🧠 Insurance Claim Text Analytics\nAnalyze claim descriptions for keywords, sentiment, and categories.")

    with gr.Row():
        with gr.Column():
            # Input controls.
            data = gr.File(label="Upload CSV (UTF-8)", file_count="single", file_types=[".csv"])
            text_col = gr.Dropdown(label="Text column (claim description)", choices=[], value=None)
            date_col = gr.Dropdown(label="Optional date column (for trend)", choices=[], value=None, allow_custom_value=True)
            top_n = gr.Slider(5, 30, value=10, step=1, label="Top N keywords for bar chart")
            use_custom_only = gr.Checkbox(label="Only count custom keywords", value=False)
            custom_keywords_text = gr.Textbox(label="Custom keywords (comma or new line separated). Leave empty to count all tokens.", value=", ".join(DEFAULT_KEYWORDS), lines=3)
            run_btn = gr.Button("Run Analysis 🚀", variant="primary")
        with gr.Column():
            # Output panels.
            # NOTE(review): analyze() returns raw PNG bytes, but gr.Image(type="numpy")
            # expects numpy arrays (or PIL/filepath inputs) -- confirm these render in
            # the pinned Gradio version; otherwise switch to type="filepath" or gr.Plot.
            bar_img = gr.Image(label="Top 10 Keywords (Bar Chart)", type="numpy")
            cat_img = gr.Image(label="Claim Categories (Bar Chart)", type="numpy")
            pie_img = gr.Image(label="Sentiment Distribution (Pie Chart)", type="numpy")
            trend_img = gr.Image(label="Sentiment Trend Over Time (Optional)", type="numpy")
            table = gr.Dataframe(label="Sentiment & Category Summary", wrap=True)
            report = gr.Textbox(label="Auto-generated Report", lines=10)
            export = gr.File(label="Download Enriched CSV")

    def on_file_upload(fileobj):
        # Populate both dropdowns from the uploaded CSV's columns; clear them
        # when the file is removed. The longest text column is pre-selected
        # as the likely claim-description column.
        if fileobj is None:
            return gr.update(choices=[], value=None), gr.update(choices=[], value=None)
        df = pd.read_csv(fileobj.name)
        cols = df.columns.tolist()
        text_candidates = infer_text_columns(df)
        if not text_candidates:
            text_candidates = [c for c in cols if df[c].dtype == "object"]
        text_value = text_candidates[0] if text_candidates else (cols[0] if cols else None)
        return (
            gr.update(choices=text_candidates or cols, value=text_value),
            gr.update(choices=cols, value=None),
        )

    data.change(on_file_upload, inputs=[data], outputs=[text_col, date_col])

    def run_pipeline(fileobj, text_column, date_column, topn, custom_only, custom_text):
        # Re-read the CSV and run the full pipeline; the enriched CSV is
        # written to the working directory so gr.File can serve it.
        if fileobj is None:
            raise gr.Error("Please upload a CSV file.")
        df = pd.read_csv(fileobj.name)
        bar_png, cat_png, pie_png, trend_png, summary_df, report_text, csv_bytes = analyze(
            df, text_column, date_column, int(topn), custom_only, custom_text
        )
        export_path = "enriched_claims.csv"
        with open(export_path, "wb") as f:
            f.write(csv_bytes)
        return bar_png, cat_png, pie_png, trend_png, summary_df, report_text, export_path

    run_btn.click(
        run_pipeline,
        inputs=[data, text_col, date_col, top_n, use_custom_only, custom_keywords_text],
        outputs=[bar_img, cat_img, pie_img, trend_img, table, report, export],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- fastapi
2
- uvicorn[standard]
3
- nltk
4
- pandas
5
- python-docx
 
 
1
+ gradio==4.44.1
2
+ pandas==2.2.2
3
+ numpy==1.26.4
4
+ matplotlib==3.8.4
5
+ nltk==3.8.1
6
+ scikit-learn==1.4.2