import dash
from dash import html, dcc
import dash_bootstrap_components as dbc
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import io
import base64
from datasets import load_dataset
from wordcloud import WordCloud
# Register this module as a Dash page served at "/performance" and
# labelled "Forensic Performance".
dash.register_page(__name__, path='/performance', name="Forensic Performance")
# --- DATA ENGINE ---
def load_performance_data():
    """Load the fraud-filings dataset and attach vocabulary metrics.

    Fetches ``amitkedia/Financial-Fraud-Dataset`` via ``load_dataset``,
    turns the first split it exposes into a DataFrame, and appends four
    columns derived from the ``Fillings`` text column:

    * ``clean_text`` -- the filing text cast to ``str`` and lower-cased.
    * ``word_count`` -- number of whitespace-separated tokens in the
      original (non-lower-cased) text.
    * ``unique_words`` -- number of distinct tokens in ``clean_text``.
    * ``lexical_diversity`` -- ``unique_words / (word_count + 1)``; the
      ``+ 1`` keeps the denominator non-zero for empty filings.

    Returns:
        pandas.DataFrame: the dataset rows plus the derived columns.
    """

    def count_tokens(text):
        return len(text.split())

    def count_distinct_tokens(text):
        return len(set(text.split()))

    dataset = load_dataset("amitkedia/Financial-Fraud-Dataset")
    first_split = next(iter(dataset.keys()))
    frame = pd.DataFrame(dataset[first_split])

    # Cast once and reuse for both the lower-cased copy and the token count.
    filings = frame["Fillings"].astype(str)
    frame["clean_text"] = filings.str.lower()
    frame["word_count"] = filings.apply(count_tokens)

    # Vocabulary richness: distinct tokens relative to total tokens.
    frame["unique_words"] = frame["clean_text"].apply(count_distinct_tokens)
    frame["lexical_diversity"] = frame["unique_words"].div(frame["word_count"].add(1))
    return frame
# Load the dataset once, at import time; the module-level word clouds and
# the page layout further down are all computed from this frame.
df = load_performance_data()
# Shared colours: in the charts below NEON_FUCHSIA marks the "yes" (fraud)
# class and NEON_GREEN the "no" class; DARK_BG is the card background.
NEON_GREEN = "#55EFC4"
NEON_FUCHSIA = "#FF00FF"
DARK_BG = "#0c0c0c"
# --- WORD CLOUD ENGINE ---
def get_wc_base64(text, colormap):
    """Render *text* as a word cloud and return it as a PNG data URI.

    The cloud is 1000x1000 pixels, limited to 250 words, and drawn in RGBA
    mode with an ``rgba(0,0,0,0)`` background using the given colormap.

    Args:
        text: The full text to build the cloud from.
        colormap: Colormap name forwarded to ``WordCloud``.

    Returns:
        str: ``"data:image/png;base64,..."`` string usable as an image
        ``src``.
    """
    cloud = WordCloud(
        width=1000,
        height=1000,
        background_color="rgba(0,0,0,0)",
        mode="RGBA",
        colormap=colormap,
        max_words=250,
    )
    cloud.generate(text)

    # Encode the rendered image in memory; nothing is written to disk.
    with io.BytesIO() as buffer:
        cloud.to_image().save(buffer, format='PNG')
        encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"
# Pre-render one word cloud per class at import time; each is built from all
# of that class's lower-cased filings joined into a single string.
src_fraud = get_wc_base64(
    " ".join(df.loc[df["Fraud"] == "yes", "clean_text"]),
    "spring",
)
src_clean = get_wc_base64(
    " ".join(df.loc[df["Fraud"] == "no", "clean_text"]),
    "summer",
)
# --- FORENSIC ENGINES (NLP Logic) ---
def get_top_ngrams(corpus, n=2, top_n=20):
    """Count the most frequent n-grams in *corpus*.

    Args:
        corpus: Iterable of documents (strings).
        n: N-gram size; only n-grams of exactly this length are counted.
        top_n: Vocabulary cap passed to the vectorizer as ``max_features``.

    Returns:
        pandas.DataFrame: columns ``ngram`` and ``count``, sorted by
        ``count`` in ascending order.
    """
    vectorizer = CountVectorizer(
        stop_words="english",
        ngram_range=(n, n),
        max_features=top_n,
    )
    doc_term = vectorizer.fit_transform(corpus)
    # Collapse the document axis to get one total per n-gram.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()

    table = pd.DataFrame(
        {"ngram": vectorizer.get_feature_names_out(), "count": totals}
    )
    # Ascending order so that a horizontal Plotly bar chart shows the largest
    # bar at the top.
    return table.sort_values("count", ascending=True)
def top_tfidf_terms(corpus, n=20):
    """Return the *n* terms with the highest mean TF-IDF score in *corpus*.

    Args:
        corpus: Iterable of documents (strings).
        n: Number of terms to keep.

    Returns:
        pandas.DataFrame: columns ``term`` and ``avg_tfidf``, sorted by
        ``avg_tfidf`` in ascending order (highest-scoring term last).
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    weights = vectorizer.fit_transform(corpus)
    # Average each term's weight over all documents.
    mean_scores = np.asarray(weights.mean(axis=0)).ravel()

    ranking = pd.DataFrame(
        {"term": vectorizer.get_feature_names_out(), "avg_tfidf": mean_scores}
    )
    ranking = ranking.sort_values("avg_tfidf", ascending=True)
    # After an ascending sort the best-scoring terms are at the end.
    return ranking.tail(n)
# --- UI HELPERS ---
def create_kpi_card(title, value, color):
    """Build a fixed-height KPI tile with a coloured accent column.

    Args:
        title: Caption rendered in small upper-case text under the value.
        value: Headline content shown in the tile.
        color: CSS colour used as the background of the accent column.

    Returns:
        html.Div: the assembled card component.
    """
    headline = html.H2(
        value,
        className="fw-bold mb-0",
        style={"color": "white", "fontSize": "22px"},
    )
    caption = html.Small(
        title,
        className="text-muted text-uppercase fw-bold",
        style={"fontSize": "9px"},
    )

    # Nine-twelfths of the width for the text, the rest for the colour bar.
    text_column = dbc.Col(
        [headline, caption],
        width=9,
        className="ps-4 d-flex flex-column justify-content-center",
    )
    accent_column = dbc.Col(
        style={"backgroundColor": color, "height": "100%"},
        width=3,
    )

    card_row = dbc.Row(
        [text_column, accent_column],
        className="g-0 align-items-stretch",
        style={"height": "100%"},
    )
    card_style = {
        "backgroundColor": "#111",
        "border": "1px solid #222",
        "borderRadius": "10px",
        "height": "80px",
        "overflow": "hidden",
    }
    return html.Div([card_row], style=card_style)
def create_forensic_subplots(df_left, df_right, col_y, col_x, palette_left, palette_right, title_left, title_right):
    """Draw two side-by-side horizontal bar charts on one figure.

    Args:
        df_left: Data for the left panel.
        df_right: Data for the right panel.
        col_y: Column holding the bar labels (y axis).
        col_x: Column holding the bar lengths (x axis); also drives the
            bar colour through the panel's colorscale.
        palette_left: Colorscale for the left panel.
        palette_right: Colorscale for the right panel.
        title_left: Subplot title for the left panel.
        title_right: Subplot title for the right panel.

    Returns:
        The figure produced by ``make_subplots`` with both bar traces
        added and the transparent, white-text layout applied.
    """
    fig = make_subplots(
        rows=1,
        cols=2,
        horizontal_spacing=0.22,
        subplot_titles=(f"{title_left}", f"{title_right}"),
    )

    # One (data, colorscale) pair per subplot column, left to right.
    panels = ((df_left, palette_left), (df_right, palette_right))
    for column, (frame, palette) in enumerate(panels, start=1):
        bars = go.Bar(
            x=frame[col_x],
            y=frame[col_y],
            orientation='h',
            marker={"color": frame[col_x], "colorscale": palette},
        )
        fig.add_trace(bars, 1, column)

    theme = {
        "paper_bgcolor": 'rgba(0,0,0,0)',
        "plot_bgcolor": 'rgba(0,0,0,0)',
        "font_color": "white",
        "height": 600,
        "showlegend": False,
        "margin": {"t": 100, "b": 50, "l": 150, "r": 20},
    }
    fig.update_layout(**theme)
    # Let Plotly widen the y-axis margin to fit the labels.
    fig.update_yaxes(automargin=True)
    return fig
# --- LAYOUT ---
# Page layout: a fluid, black, full-height container holding a heading and
# six stacked sections. Every figure and KPI value is computed inline from
# the module-level `df`, i.e. once when this module is imported.
layout = dbc.Container([
html.H1("Forensic Performance Engine", className="text-white fw-bold mt-4 mb-0"),
html.P("Auditing target distribution and linguistic fingerprints.", className="text-muted mb-5"),
# 1. KPI strip: four cards (mean unique words, mean lexical diversity for
#    the "yes" and "no" classes, and the total row count).
dbc.Row([
dbc.Col(create_kpi_card("Mean Unique Words", f"{df['unique_words'].mean():.1f}", NEON_GREEN), lg=3),
dbc.Col(create_kpi_card("Fraud Lexical Diversity", f"{df[df['Fraud']=='yes']['lexical_diversity'].mean():.3f}", NEON_FUCHSIA), lg=3),
dbc.Col(create_kpi_card("Standard Lexical Diversity", f"{df[df['Fraud']=='no']['lexical_diversity'].mean():.3f}", "#74B9FF"), lg=3),
dbc.Col(create_kpi_card("Total Reports", f"{len(df):,}", "#A29BFE"), lg=3),
], className="g-4 mb-5"),
# 2. Target distribution: class counts (histogram) beside class
#    proportions (donut chart), both coloured per class.
dbc.Row([
dbc.Col(dbc.Card([
dbc.CardBody([
html.H4("Target Distribution & Class Balance", className="text-white fw-bold mb-4"),
dbc.Row([
dbc.Col(dcc.Graph(figure=px.histogram(df, x="Fraud", color="Fraud", text_auto=True,
color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN},
title="Class Distribution: Count")
.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=350, showlegend=False)), lg=7),
dbc.Col(dcc.Graph(figure=px.pie(df, names="Fraud", hole=0.4,
color="Fraud", color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN},
title="Class Distribution: Proportion")
.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=350)), lg=5),
])
])
], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
], className="mb-5"),
# 3. Vocabulary richness: per-class box plots of the unique-word count
#    and of the lexical-diversity ratio.
dbc.Row([
dbc.Col(dbc.Card([
dbc.CardBody([
html.H4("Vocabulary Richness Analysis", className="text-white fw-bold mb-4"),
dbc.Row([
dbc.Col(dcc.Graph(figure=px.box(df, x="Fraud", y="unique_words", color="Fraud", title="Unique Word Variance",
color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN})
.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=380)), lg=6),
dbc.Col(dcc.Graph(figure=px.box(df, x="Fraud", y="lexical_diversity", color="Fraud", title="Lexical Diversity Ratio",
color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN})
.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=380)), lg=6),
])
])
], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
], className="mb-5"),
# 4. TF-IDF terms: top terms by mean TF-IDF, computed separately for the
#    "yes" rows (left panel) and the "no" rows (right panel).
dbc.Row([
dbc.Col(dbc.Card([
dbc.CardBody([
html.H4("Top 20 TF-IDF Terms by Class", className="text-white fw-bold mb-4"),
dcc.Graph(figure=create_forensic_subplots(
top_tfidf_terms(df.loc[df["Fraud"] == "yes", "clean_text"]),
top_tfidf_terms(df.loc[df["Fraud"] == "no", "clean_text"]),
"term", "avg_tfidf", "Reds", "Greens", "Fraudulent Significance", "Non-Fraudulent Significance"
))
])
], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
], className="mb-5"),
# 5. N-gram analysis: bigram counts per class (get_top_ngrams defaults to
#    n=2, top_n=20), laid out the same way as the TF-IDF section.
dbc.Row([
dbc.Col(dbc.Card([
dbc.CardBody([
html.H4("Top 20 Bigrams by Class", className="text-white fw-bold mb-4"),
dcc.Graph(figure=create_forensic_subplots(
get_top_ngrams(df.loc[df["Fraud"] == "yes", "clean_text"]),
get_top_ngrams(df.loc[df["Fraud"] == "no", "clean_text"]),
"ngram", "count", "Reds", "Greens", "Top Bigrams — Fraudulent", "Top Bigrams — Non-Fraudulent"
))
])
], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
], className="mb-5"),
# 6. Word clouds: the two pre-rendered data-URI images, each clipped to a
#    420px circle (borderRadius 50% + overflow hidden) with a class-coloured
#    border.
dbc.Card([
dbc.CardBody([
html.H2("Semantic Fingerprints (Word Clouds)", className="text-white fw-bold text-center mb-5"),
dbc.Row([
dbc.Col([
html.H4("Anomalous Lens", className="text-center", style={"color": NEON_FUCHSIA}),
html.Div(html.Img(src=src_fraud, style={"width": "100%"}),
style={"width": "420px", "height": "420px", "borderRadius": "50%", "border": f"5px solid {NEON_FUCHSIA}", "margin": "0 auto", "overflow": "hidden"})
], lg=6),
dbc.Col([
html.H4("Standard Lens", className="text-center", style={"color": NEON_GREEN}),
html.Div(html.Img(src=src_clean, style={"width": "100%"}),
style={"width": "420px", "height": "420px", "borderRadius": "50%", "border": f"5px solid {NEON_GREEN}", "margin": "0 auto", "overflow": "hidden"})
], lg=6),
])
], className="py-5")
], style={"backgroundColor": "#050505", "border": "1px solid #222", "borderRadius": "25px"}, className="mb-5")
], fluid=True, style={"backgroundColor": "#000", "minHeight": "100vh", "padding": "0 5%"})