"""Dash page "Forensic Performance", registered at ``/performance``.

At import time this module downloads the fraud dataset, derives simple
vocabulary statistics from the ``Fillings`` text column, renders two word
clouds, and assembles the static page ``layout`` that Dash serves.
"""

import base64
import io

import dash
import dash_bootstrap_components as dbc
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dash import html, dcc
from datasets import load_dataset
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud

dash.register_page(__name__, path='/performance', name="Forensic Performance")


# --- DATA ENGINE ---
def load_performance_data() -> pd.DataFrame:
    """Load the dataset and attach the derived text-statistics columns.

    The first split returned by ``load_dataset`` is converted to a DataFrame
    and four columns are added:

    * ``clean_text``        - ``Fillings`` cast to ``str`` and lower-cased.
    * ``word_count``        - whitespace-separated token count of ``Fillings``.
    * ``unique_words``      - number of distinct tokens in ``clean_text``.
    * ``lexical_diversity`` - ``unique_words / (word_count + 1)``.

    Returns:
        The DataFrame with the added columns.
    """
    splits = load_dataset("amitkedia/Financial-Fraud-Dataset")
    first_split = list(splits.keys())[0]
    frame = pd.DataFrame(splits[first_split])

    filings = frame["Fillings"].astype(str)
    frame["clean_text"] = filings.str.lower()
    frame["word_count"] = filings.apply(lambda text: len(text.split()))

    # Vocabulary richness: distinct tokens relative to total tokens.
    # The +1 keeps the denominator non-zero when a filing has no tokens.
    frame["unique_words"] = frame["clean_text"].apply(
        lambda text: len(set(text.split()))
    )
    frame["lexical_diversity"] = frame["unique_words"] / (frame["word_count"] + 1)
    return frame


df = load_performance_data()

NEON_GREEN = "#55EFC4"
NEON_FUCHSIA = "#FF00FF"
DARK_BG = "#0c0c0c"

# Row selectors for the two values of the ``Fraud`` column used on this page.
_is_fraud = df["Fraud"] == "yes"
_is_clean = df["Fraud"] == "no"


# --- WORD CLOUD ENGINE ---
def get_wc_base64(text, colormap):
    """Render ``text`` as a word cloud and return it as a PNG data URI.

    Args:
        text: The text passed to ``WordCloud.generate``.
        colormap: Colormap name forwarded to ``WordCloud``.

    Returns:
        A ``data:image/png;base64,...`` string usable as an ``<img>`` source.
    """
    cloud = WordCloud(
        width=1000,
        height=1000,
        background_color="rgba(0,0,0,0)",
        mode="RGBA",
        colormap=colormap,
        max_words=250,
    )
    cloud.generate(text)

    png_buffer = io.BytesIO()
    cloud.to_image().save(png_buffer, format='PNG')
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
    return "data:image/png;base64," + encoded_png


src_fraud = get_wc_base64(" ".join(df.loc[_is_fraud, "clean_text"]), "spring")
src_clean = get_wc_base64(" ".join(df.loc[_is_clean, "clean_text"]), "summer")


# --- FORENSIC ENGINES (NLP Logic) ---
def get_top_ngrams(corpus, n=2, top_n=20):
    """Count the most frequent n-grams of ``corpus``.

    Args:
        corpus: Iterable of documents handed to ``CountVectorizer``.
        n: N-gram size; used as both bounds of ``ngram_range``.
        top_n: Passed as ``max_features`` to cap the vocabulary size.

    Returns:
        DataFrame with columns ``ngram`` and ``count``, sorted by ``count``
        ascending.
    """
    vectorizer = CountVectorizer(
        stop_words="english", ngram_range=(n, n), max_features=top_n
    )
    doc_term = vectorizer.fit_transform(corpus)
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    table = pd.DataFrame(
        {"ngram": vectorizer.get_feature_names_out(), "count": totals}
    )
    # Ascending order so that, in a horizontal Plotly bar chart, the largest
    # bar ends up at the top of the axis.
    return table.sort_values("count", ascending=True)


def top_tfidf_terms(corpus, n=20):
    """Return the ``n`` terms with the highest mean TF-IDF score in ``corpus``.

    Args:
        corpus: Iterable of documents handed to ``TfidfVectorizer``.
        n: Number of rows to keep from the top of the ranking.

    Returns:
        DataFrame with columns ``term`` and ``avg_tfidf``, sorted by
        ``avg_tfidf`` ascending (the last row is the highest-scoring term).
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    weights = vectorizer.fit_transform(corpus)
    mean_scores = np.asarray(weights.mean(axis=0)).ravel()
    table = pd.DataFrame(
        {"term": vectorizer.get_feature_names_out(), "avg_tfidf": mean_scores}
    )
    return table.sort_values("avg_tfidf", ascending=True).tail(n)


# --- UI HELPERS ---
def create_kpi_card(title, value, color):
    """Build a KPI tile: value and caption on the left, a colour bar on the right.

    Args:
        title: Caption rendered under the value.
        value: Headline text of the tile.
        color: CSS colour used as background of the accent column.

    Returns:
        An ``html.Div`` wrapping the tile.
    """
    headline = html.H2(
        value,
        className="fw-bold mb-0",
        style={"color": "white", "fontSize": "22px"},
    )
    caption = html.Small(
        title,
        className="text-muted text-uppercase fw-bold",
        style={"fontSize": "9px"},
    )
    text_column = dbc.Col(
        [headline, caption],
        width=9,
        className="ps-4 d-flex flex-column justify-content-center",
    )
    accent_column = dbc.Col(
        style={"backgroundColor": color, "height": "100%"}, width=3
    )
    tile_row = dbc.Row(
        [text_column, accent_column],
        className="g-0 align-items-stretch",
        style={"height": "100%"},
    )
    return html.Div(
        [tile_row],
        style={
            "backgroundColor": "#111",
            "border": "1px solid #222",
            "borderRadius": "10px",
            "height": "80px",
            "overflow": "hidden",
        },
    )


def create_forensic_subplots(df_left, df_right, col_y, col_x, palette_left,
                             palette_right, title_left, title_right):
    """Draw two side-by-side horizontal bar charts on a transparent figure.

    Args:
        df_left: Data for the left panel.
        df_right: Data for the right panel.
        col_y: Column supplying the bar labels (y axis).
        col_x: Column supplying the bar lengths; also drives the bar colour.
        palette_left: Colorscale for the left panel.
        palette_right: Colorscale for the right panel.
        title_left: Subplot title of the left panel.
        title_right: Subplot title of the right panel.

    Returns:
        The configured Plotly figure.
    """
    figure = make_subplots(
        rows=1,
        cols=2,
        horizontal_spacing=0.22,
        subplot_titles=(f'{title_left}', f'{title_right}'),
    )

    panels = ((df_left, palette_left), (df_right, palette_right))
    for column_index, (panel_data, palette) in enumerate(panels, start=1):
        bars = go.Bar(
            x=panel_data[col_x],
            y=panel_data[col_y],
            orientation='h',
            marker=dict(color=panel_data[col_x], colorscale=palette),
        )
        figure.add_trace(bars, row=1, col=column_index)

    figure.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font_color="white",
        height=600,
        showlegend=False,
        margin=dict(t=100, b=50, l=150, r=20),
    )
    figure.update_yaxes(automargin=True)
    return figure


def _class_colors():
    """Return a fresh colour mapping for the two ``Fraud`` classes."""
    return {"yes": NEON_FUCHSIA, "no": NEON_GREEN}


def _apply_dark_theme(figure, height, **layout_overrides):
    """Give ``figure`` transparent backgrounds, white text and ``height``."""
    return figure.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font_color="white",
        height=height,
        **layout_overrides,
    )


def _class_box(column, title):
    """Box plot of ``column`` split by the ``Fraud`` class."""
    box = px.box(
        df,
        x="Fraud",
        y=column,
        color="Fraud",
        title=title,
        color_discrete_map=_class_colors(),
    )
    return _apply_dark_theme(box, 380)


def _section_card(heading, content):
    """Wrap ``content`` in the page's full-width dark card with a heading."""
    body = dbc.CardBody([
        html.H4(heading, className="text-white fw-bold mb-4"),
        content,
    ])
    card = dbc.Card(
        [body],
        style={"backgroundColor": DARK_BG, "border": "1px solid #222"},
    )
    return dbc.Row([dbc.Col(card, width=12)], className="mb-5")


def _lens_column(label, image_src, color):
    """One circular word-cloud frame with its coloured heading."""
    frame = html.Div(
        html.Img(src=image_src, style={"width": "100%"}),
        style={
            "width": "420px",
            "height": "420px",
            "borderRadius": "50%",
            "border": f"5px solid {color}",
            "margin": "0 auto",
            "overflow": "hidden",
        },
    )
    return dbc.Col(
        [html.H4(label, className="text-center", style={"color": color}), frame],
        lg=6,
    )


# --- LAYOUT ---

# 1. KPI Strip
_kpi_strip = dbc.Row(
    [
        dbc.Col(
            create_kpi_card(
                "Mean Unique Words",
                f"{df['unique_words'].mean():.1f}",
                NEON_GREEN,
            ),
            lg=3,
        ),
        dbc.Col(
            create_kpi_card(
                "Fraud Lexical Diversity",
                f"{df.loc[_is_fraud, 'lexical_diversity'].mean():.3f}",
                NEON_FUCHSIA,
            ),
            lg=3,
        ),
        dbc.Col(
            create_kpi_card(
                "Standard Lexical Diversity",
                f"{df.loc[_is_clean, 'lexical_diversity'].mean():.3f}",
                "#74B9FF",
            ),
            lg=3,
        ),
        dbc.Col(create_kpi_card("Total Reports", f"{len(df):,}", "#A29BFE"), lg=3),
    ],
    className="g-4 mb-5",
)

# 2. Target Distribution Section
_count_figure = _apply_dark_theme(
    px.histogram(
        df,
        x="Fraud",
        color="Fraud",
        text_auto=True,
        color_discrete_map=_class_colors(),
        title="Class Distribution: Count",
    ),
    350,
    showlegend=False,
)
_share_figure = _apply_dark_theme(
    px.pie(
        df,
        names="Fraud",
        hole=0.4,
        color="Fraud",
        color_discrete_map=_class_colors(),
        title="Class Distribution: Proportion",
    ),
    350,
)
_distribution_section = _section_card(
    "Target Distribution & Class Balance",
    dbc.Row([
        dbc.Col(dcc.Graph(figure=_count_figure), lg=7),
        dbc.Col(dcc.Graph(figure=_share_figure), lg=5),
    ]),
)

# 3. Lexical Diversity & Word Variance
_richness_section = _section_card(
    "Vocabulary Richness Analysis",
    dbc.Row([
        dbc.Col(
            dcc.Graph(figure=_class_box("unique_words", "Unique Word Variance")),
            lg=6,
        ),
        dbc.Col(
            dcc.Graph(
                figure=_class_box("lexical_diversity", "Lexical Diversity Ratio")
            ),
            lg=6,
        ),
    ]),
)

# 4. TF-IDF Terms
_tfidf_section = _section_card(
    "Top 20 TF-IDF Terms by Class",
    dcc.Graph(
        figure=create_forensic_subplots(
            top_tfidf_terms(df.loc[_is_fraud, "clean_text"]),
            top_tfidf_terms(df.loc[_is_clean, "clean_text"]),
            "term",
            "avg_tfidf",
            "Reds",
            "Greens",
            "Fraudulent Significance",
            "Non-Fraudulent Significance",
        )
    ),
)

# 5. N-Gram Analysis (Bigrams)
_bigram_section = _section_card(
    "Top 20 Bigrams by Class",
    dcc.Graph(
        figure=create_forensic_subplots(
            get_top_ngrams(df.loc[_is_fraud, "clean_text"]),
            get_top_ngrams(df.loc[_is_clean, "clean_text"]),
            "ngram",
            "count",
            "Reds",
            "Greens",
            "Top Bigrams — Fraudulent",
            "Top Bigrams — Non-Fraudulent",
        )
    ),
)

# 6. Word Clouds
_wordcloud_section = dbc.Card(
    [
        dbc.CardBody(
            [
                html.H2(
                    "Semantic Fingerprints (Word Clouds)",
                    className="text-white fw-bold text-center mb-5",
                ),
                dbc.Row([
                    _lens_column("Anomalous Lens", src_fraud, NEON_FUCHSIA),
                    _lens_column("Standard Lens", src_clean, NEON_GREEN),
                ]),
            ],
            className="py-5",
        )
    ],
    style={
        "backgroundColor": "#050505",
        "border": "1px solid #222",
        "borderRadius": "25px",
    },
    className="mb-5",
)

layout = dbc.Container(
    [
        html.H1(
            "Forensic Performance Engine",
            className="text-white fw-bold mt-4 mb-0",
        ),
        html.P(
            "Auditing target distribution and linguistic fingerprints.",
            className="text-muted mb-5",
        ),
        _kpi_strip,
        _distribution_section,
        _richness_section,
        _tfidf_section,
        _bigram_section,
        _wordcloud_section,
    ],
    fluid=True,
    style={"backgroundColor": "#000", "minHeight": "100vh", "padding": "0 5%"},
)