Spaces:

hellosara
/

fraud_financial

Sleeping

File size: 9,881 Bytes

ee94c4a

import dash
from dash import html, dcc
import dash_bootstrap_components as dbc
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import io
import base64
from datasets import load_dataset
from wordcloud import WordCloud

# Register this module with Dash's page registry under the URL path
# "/performance" and the display name "Forensic Performance".
dash.register_page(__name__, path='/performance', name="Forensic Performance")

# --- DATA ENGINE ---
def load_performance_data():
    """Load the fraud filings dataset and derive per-document text features.

    Fetches ``amitkedia/Financial-Fraud-Dataset`` through ``load_dataset``,
    takes the first split it exposes and converts it to a DataFrame.

    Added columns:
        clean_text: lower-cased filing text (a missing filing becomes "").
        word_count: number of whitespace-separated tokens in the filing.
        unique_words: number of distinct tokens in ``clean_text``.
        lexical_diversity: ``unique_words / (word_count + 1)``; the ``+ 1``
            keeps the ratio defined when a filing has no tokens.

    Returns:
        pandas.DataFrame: the dataset's columns plus the four derived ones.
    """
    raw_data = load_dataset("amitkedia/Financial-Fraud-Dataset")
    first_split = next(iter(raw_data))
    df = pd.DataFrame(raw_data[first_split])

    # Blank out missing filings *before* the string cast: astype(str) would
    # otherwise turn them into the literal tokens "nan"/"None", which then
    # count as a word and leak into the n-gram, TF-IDF and word-cloud inputs.
    filings = df["Fillings"].fillna("").astype(str)

    df["clean_text"] = filings.str.lower()
    df["word_count"] = filings.apply(lambda text: len(text.split()))
    # Vocabulary richness: distinct tokens relative to total tokens.
    df["unique_words"] = df["clean_text"].apply(lambda text: len(set(text.split())))
    df["lexical_diversity"] = df["unique_words"] / (df["word_count"] + 1)
    return df

# Loaded once at import time; the KPI cards, charts and word clouds below are
# all built from this module-level frame.
df = load_performance_data()
# Colour palette: green is used for the "no" class, fuchsia for the "yes"
# class, and DARK_BG as the background of the chart cards.
NEON_GREEN = "#55EFC4"
NEON_FUCHSIA = "#FF00FF"
DARK_BG = "#0c0c0c"

# --- WORD CLOUD ENGINE ---
def get_wc_base64(text, colormap):
    """Render *text* as a word cloud and return it as a PNG data URI.

    Args:
        text: the raw text the cloud is generated from.
        colormap: colormap name forwarded to ``WordCloud``.

    Returns:
        str: ``"data:image/png;base64,..."`` suitable for an ``<img src>``.
    """
    cloud = WordCloud(
        width=1000,
        height=1000,
        background_color="rgba(0,0,0,0)",
        mode="RGBA",
        colormap=colormap,
        max_words=250,
    )
    cloud.generate(text)

    # Serialise the rendered image into an in-memory PNG, then base64 it.
    png_buffer = io.BytesIO()
    cloud.to_image().save(png_buffer, format='PNG')
    encoded = base64.b64encode(png_buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"

# Pre-render one word cloud per class at import time: all lower-cased filings
# of the class are joined into a single string and rendered to a data URI that
# the layout embeds directly in an <img>.
src_fraud = get_wc_base64(" ".join(df[df["Fraud"]=="yes"]["clean_text"]), "spring")
src_clean = get_wc_base64(" ".join(df[df["Fraud"]=="no"]["clean_text"]), "summer")

# --- FORENSIC ENGINES (NLP Logic) ---
def get_top_ngrams(corpus, n=2, top_n=20):
    """Count the n-grams of *corpus* kept by a ``top_n``-feature vectorizer.

    Args:
        corpus: iterable of documents (strings).
        n: n-gram length; only n-grams of exactly this size are counted.
        top_n: ``max_features`` passed to the vectorizer.

    Returns:
        pandas.DataFrame: columns ``ngram`` and ``count``, sorted by ``count``
        ascending so a horizontal bar chart shows the largest bar on top.
    """
    vectorizer = CountVectorizer(
        stop_words="english",
        ngram_range=(n, n),
        max_features=top_n,
    )
    doc_term = vectorizer.fit_transform(corpus)
    # Column sums of the document-term matrix = corpus-wide count per n-gram.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    table = pd.DataFrame({
        "ngram": vectorizer.get_feature_names_out(),
        "count": totals,
    })
    return table.sort_values("count", ascending=True)

def top_tfidf_terms(corpus, n=20):
    """Return the *n* terms with the highest mean TF-IDF weight in *corpus*.

    Args:
        corpus: iterable of documents (strings).
        n: number of terms to keep.

    Returns:
        pandas.DataFrame: columns ``term`` and ``avg_tfidf``, sorted by
        ``avg_tfidf`` ascending (the strongest term is the last row).
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    weights = vectorizer.fit_transform(corpus)
    # Average each term's weight over all documents.
    mean_weights = np.asarray(weights.mean(axis=0)).ravel()
    ranked = pd.DataFrame({
        "term": vectorizer.get_feature_names_out(),
        "avg_tfidf": mean_weights,
    }).sort_values("avg_tfidf", ascending=True)
    # Ascending order means the top-n terms are the final n rows.
    return ranked.tail(n)

# --- UI HELPERS ---
def create_kpi_card(title, value, color):
    """Build a KPI tile: value and caption on the left, colour bar on the right.

    Args:
        title: caption rendered below the value.
        value: the figure to display (already formatted as text).
        color: CSS colour of the accent bar.

    Returns:
        html.Div: the assembled card component.
    """
    text_column = dbc.Col(
        [
            html.H2(value, className="fw-bold mb-0", style={"color": "white", "fontSize": "22px"}),
            html.Small(title, className="text-muted text-uppercase fw-bold", style={"fontSize": "9px"}),
        ],
        width=9,
        className="ps-4 d-flex flex-column justify-content-center",
    )
    # Child-less column used purely as a coloured accent strip.
    accent_column = dbc.Col(style={"backgroundColor": color, "height": "100%"}, width=3)

    card_row = dbc.Row(
        [text_column, accent_column],
        className="g-0 align-items-stretch",
        style={"height": "100%"},
    )
    card_style = {
        "backgroundColor": "#111",
        "border": "1px solid #222",
        "borderRadius": "10px",
        "height": "80px",
        "overflow": "hidden",
    }
    return html.Div([card_row], style=card_style)

def create_forensic_subplots(df_left, df_right, col_y, col_x, palette_left, palette_right, title_left, title_right):
    """Draw two side-by-side horizontal bar charts, one per data frame.

    Args:
        df_left, df_right: frames plotted in the left / right panel.
        col_y: column holding the bar labels (y axis).
        col_x: column holding the bar lengths (x axis); also drives bar colour.
        palette_left, palette_right: colorscale for each panel.
        title_left, title_right: panel titles (rendered in bold).

    Returns:
        plotly.graph_objects.Figure: the two-panel figure.
    """
    panel_titles = tuple(f'<b>{title}</b>' for title in (title_left, title_right))
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.22,
                        subplot_titles=panel_titles)

    panels = ((df_left, palette_left), (df_right, palette_right))
    for column, (frame, palette) in enumerate(panels, start=1):
        values = frame[col_x]
        bars = go.Bar(
            x=values,
            y=frame[col_y],
            orientation='h',
            marker=dict(color=values, colorscale=palette),
        )
        fig.add_trace(bars, row=1, col=column)

    # Transparent backgrounds so the figure inherits the dark card behind it.
    fig.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font_color="white",
        height=600,
        showlegend=False,
        margin=dict(t=100, b=50, l=150, r=20),
    )
    fig.update_yaxes(automargin=True)
    return fig

# --- LAYOUT ---
# --- LAYOUT ---
# Static page layout, built once at import time from the module-level `df`,
# `src_fraud` and `src_clean`. Every figure below is computed eagerly here;
# nothing in this tree is wired to a callback.
layout = dbc.Container([
    html.H1("Forensic Performance Engine", className="text-white fw-bold mt-4 mb-0"),
    html.P("Auditing target distribution and linguistic fingerprints.", className="text-muted mb-5"),

    # 1. KPI strip: mean unique-word count over all rows, mean lexical
    #    diversity for the "yes" and "no" classes, and the total row count.
    dbc.Row([
        dbc.Col(create_kpi_card("Mean Unique Words", f"{df['unique_words'].mean():.1f}", NEON_GREEN), lg=3),
        dbc.Col(create_kpi_card("Fraud Lexical Diversity", f"{df[df['Fraud']=='yes']['lexical_diversity'].mean():.3f}", NEON_FUCHSIA), lg=3),
        dbc.Col(create_kpi_card("Standard Lexical Diversity", f"{df[df['Fraud']=='no']['lexical_diversity'].mean():.3f}", "#74B9FF"), lg=3),
        dbc.Col(create_kpi_card("Total Reports", f"{len(df):,}", "#A29BFE"), lg=3),
    ], className="g-4 mb-5"),

    # 2. Target distribution: per-class row counts as a histogram (left) and
    #    class proportions as a donut chart (right).
    dbc.Row([
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Target Distribution & Class Balance", className="text-white fw-bold mb-4"),
                dbc.Row([
                    dbc.Col(dcc.Graph(figure=px.histogram(df, x="Fraud", color="Fraud", text_auto=True,
                                                        color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN},
                                                        title="Class Distribution: Count")
                                      .update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=350, showlegend=False)), lg=7),
                    dbc.Col(dcc.Graph(figure=px.pie(df, names="Fraud", hole=0.4,
                                                 color="Fraud", color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN},
                                                 title="Class Distribution: Proportion")
                                      .update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=350)), lg=5),
                ])
            ])
        ], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
    ], className="mb-5"),

    # 3. Vocabulary richness: box plots of `unique_words` (left) and
    #    `lexical_diversity` (right), each split by the Fraud label.
    dbc.Row([
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Vocabulary Richness Analysis", className="text-white fw-bold mb-4"),
                dbc.Row([
                    dbc.Col(dcc.Graph(figure=px.box(df, x="Fraud", y="unique_words", color="Fraud", title="Unique Word Variance",
                                                  color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN})
                                      .update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=380)), lg=6),
                    dbc.Col(dcc.Graph(figure=px.box(df, x="Fraud", y="lexical_diversity", color="Fraud", title="Lexical Diversity Ratio",
                                                  color_discrete_map={"yes": NEON_FUCHSIA, "no": NEON_GREEN})
                                      .update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)', font_color="white", height=380)), lg=6),
                ])
            ])
        ], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
    ], className="mb-5"),

    # 4. TF-IDF terms: the vectorizer is fitted separately on each class, so
    #    the two panels rank terms within their own class only.
    dbc.Row([
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Top 20 TF-IDF Terms by Class", className="text-white fw-bold mb-4"),
                dcc.Graph(figure=create_forensic_subplots(
                    top_tfidf_terms(df.loc[df["Fraud"] == "yes", "clean_text"]),
                    top_tfidf_terms(df.loc[df["Fraud"] == "no", "clean_text"]),
                    "term", "avg_tfidf", "Reds", "Greens", "Fraudulent Significance", "Non-Fraudulent Significance"
                ))
            ])
        ], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
    ], className="mb-5"),

    # 5. N-gram analysis: bigram counts (the default n=2 of get_top_ngrams),
    #    again computed separately per class.
    dbc.Row([
        dbc.Col(dbc.Card([
            dbc.CardBody([
                html.H4("Top 20 Bigrams by Class", className="text-white fw-bold mb-4"),
                dcc.Graph(figure=create_forensic_subplots(
                    get_top_ngrams(df.loc[df["Fraud"] == "yes", "clean_text"]),
                    get_top_ngrams(df.loc[df["Fraud"] == "no", "clean_text"]),
                    "ngram", "count", "Reds", "Greens", "Top Bigrams — Fraudulent", "Top Bigrams — Non-Fraudulent"
                ))
            ])
        ], style={"backgroundColor": DARK_BG, "border": "1px solid #222"}), width=12),
    ], className="mb-5"),

    # 6. Word clouds: the pre-rendered data-URI images, each clipped to a
    #    circle by its wrapper div (borderRadius 50% + overflow hidden).
    dbc.Card([
        dbc.CardBody([
            html.H2("Semantic Fingerprints (Word Clouds)", className="text-white fw-bold text-center mb-5"),
            dbc.Row([
                dbc.Col([
                    html.H4("Anomalous Lens", className="text-center", style={"color": NEON_FUCHSIA}),
                    html.Div(html.Img(src=src_fraud, style={"width": "100%"}),
                             style={"width": "420px", "height": "420px", "borderRadius": "50%", "border": f"5px solid {NEON_FUCHSIA}", "margin": "0 auto", "overflow": "hidden"})
                ], lg=6),
                dbc.Col([
                    html.H4("Standard Lens", className="text-center", style={"color": NEON_GREEN}),
                    html.Div(html.Img(src=src_clean, style={"width": "100%"}),
                             style={"width": "420px", "height": "420px", "borderRadius": "50%", "border": f"5px solid {NEON_GREEN}", "margin": "0 auto", "overflow": "hidden"})
                ], lg=6),
            ])
        ], className="py-5")
    ], style={"backgroundColor": "#050505", "border": "1px solid #222", "borderRadius": "25px"}, className="mb-5")

], fluid=True, style={"backgroundColor": "#000", "minHeight": "100vh", "padding": "0 5%"})