Spaces:
Sleeping
Sleeping
| import dash | |
| from dash import html, dcc | |
| import dash_bootstrap_components as dbc | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
| import io | |
| import base64 | |
| from datasets import load_dataset | |
| from wordcloud import WordCloud | |
# Register this module with Dash Pages: served at "/performance" and
# listed under the display name "Forensic Performance".
dash.register_page(
    __name__,
    path='/performance',
    name="Forensic Performance",
)
# --- DATA ENGINE ---
def load_performance_data():
    """Load the fraud dataset and derive per-document vocabulary features.

    Fetches "amitkedia/Financial-Fraud-Dataset" through ``load_dataset`` and
    converts its first split to a DataFrame. Four columns are added, all
    derived from the "Fillings" column:

    * ``clean_text``: the filing text cast to ``str`` and lower-cased.
    * ``word_count``: number of whitespace-separated tokens in the filing.
    * ``unique_words``: number of distinct tokens in ``clean_text``.
    * ``lexical_diversity``: ``unique_words / (word_count + 1)``.

    Returns:
        The DataFrame with the derived columns appended.
    """
    dataset = load_dataset("amitkedia/Financial-Fraud-Dataset")
    first_split = list(dataset.keys())[0]
    frame = pd.DataFrame(dataset[first_split])

    def _token_total(text):
        return len(text.split())

    def _distinct_token_total(text):
        return len(set(text.split()))

    filings = frame["Fillings"].astype(str)
    frame["clean_text"] = filings.str.lower()
    frame["word_count"] = filings.apply(_token_total)

    # Vocabulary Richness Logic
    frame["unique_words"] = frame["clean_text"].apply(_distinct_token_total)
    # The +1 keeps the denominator non-zero for filings with no tokens.
    denominator = frame["word_count"] + 1
    frame["lexical_diversity"] = frame["unique_words"] / denominator
    return frame
# Colour constants referenced by the charts, KPI cards and word-cloud frames.
NEON_GREEN = "#55EFC4"
NEON_FUCHSIA = "#FF00FF"
DARK_BG = "#0c0c0c"

# The dataset is loaded once, when this module is imported; everything below
# reads from this single DataFrame.
df = load_performance_data()
# --- WORD CLOUD ENGINE ---
def get_wc_base64(text, colormap):
    """Render a word cloud for ``text`` and return it as a PNG data URI.

    Args:
        text: The text the word cloud is generated from.
        colormap: Colormap name handed to ``WordCloud``.

    Returns:
        A ``data:image/png;base64,...`` string holding the rendered image.
    """
    cloud = WordCloud(
        width=1000,
        height=1000,
        background_color="rgba(0,0,0,0)",
        mode="RGBA",
        colormap=colormap,
        max_words=250,
    )
    cloud.generate(text)

    # Encode the PNG in memory; nothing is written to disk.
    png_buffer = io.BytesIO()
    cloud.to_image().save(png_buffer, format='PNG')
    encoded = base64.b64encode(png_buffer.getvalue()).decode()
    return "data:image/png;base64," + encoded
# Word-cloud images, rendered once at import time: one from the rows whose
# "Fraud" value is "yes" and one from the rows whose value is "no".
src_fraud = get_wc_base64(
    " ".join(df.loc[df["Fraud"] == "yes", "clean_text"]),
    "spring",
)
src_clean = get_wc_base64(
    " ".join(df.loc[df["Fraud"] == "no", "clean_text"]),
    "summer",
)
# --- FORENSIC ENGINES (NLP Logic) ---
def get_top_ngrams(corpus, n=2, top_n=20):
    """Count n-grams across ``corpus`` and return them as a DataFrame.

    Args:
        corpus: Iterable of documents (strings) to vectorize.
        n: N-gram length; the vectorizer uses ``ngram_range=(n, n)``.
        top_n: Passed to the vectorizer as ``max_features``, which caps the
            vocabulary size.

    Returns:
        A DataFrame with columns ``ngram`` and ``count``, sorted by ``count``
        in ascending order.
    """
    vectorizer = CountVectorizer(
        stop_words="english",
        ngram_range=(n, n),
        max_features=top_n,
    )
    doc_term = vectorizer.fit_transform(corpus)
    # Column sums give the total occurrences of each n-gram over all documents.
    totals = np.asarray(doc_term.sum(axis=0)).ravel()
    table = pd.DataFrame({
        "ngram": vectorizer.get_feature_names_out(),
        "count": totals,
    })
    # Sorted ascending for Plotly 'h' orientation to match top-down visual priority
    return table.sort_values("count", ascending=True)
def top_tfidf_terms(corpus, n=20):
    """Return the ``n`` terms with the highest mean TF-IDF weight in ``corpus``.

    Args:
        corpus: Iterable of documents (strings) to vectorize.
        n: Number of terms to keep.

    Returns:
        A DataFrame with columns ``term`` and ``avg_tfidf`` holding the last
        ``n`` rows of the ascending sort, so the highest-scoring term is the
        final row.
    """
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    weights = vectorizer.fit_transform(corpus)
    # Average each term's weight over all documents.
    mean_weights = np.asarray(weights.mean(axis=0)).ravel()
    ranking = pd.DataFrame({
        "term": vectorizer.get_feature_names_out(),
        "avg_tfidf": mean_weights,
    })
    ranking = ranking.sort_values("avg_tfidf", ascending=True)
    return ranking.tail(n)
# --- UI HELPERS ---
def create_kpi_card(title, value, color):
    """Build a KPI tile: a value and caption beside a solid colour block.

    Args:
        title: Caption rendered in small text under the value.
        value: Text rendered as the tile's headline.
        color: Background colour of the narrow block on the right.

    Returns:
        An ``html.Div`` wrapping a two-column row (text 9/12, colour 3/12).
    """
    headline = html.H2(
        value,
        className="fw-bold mb-0",
        style={"color": "white", "fontSize": "22px"},
    )
    caption = html.Small(
        title,
        className="text-muted text-uppercase fw-bold",
        style={"fontSize": "9px"},
    )
    text_column = dbc.Col(
        [headline, caption],
        width=9,
        className="ps-4 d-flex flex-column justify-content-center",
    )
    accent_column = dbc.Col(
        style={"backgroundColor": color, "height": "100%"},
        width=3,
    )
    tile_row = dbc.Row(
        [text_column, accent_column],
        className="g-0 align-items-stretch",
        style={"height": "100%"},
    )
    tile_style = {
        "backgroundColor": "#111",
        "border": "1px solid #222",
        "borderRadius": "10px",
        "height": "80px",
        "overflow": "hidden",
    }
    return html.Div([tile_row], style=tile_style)
def create_forensic_subplots(df_left, df_right, col_y, col_x, palette_left, palette_right, title_left, title_right):
    """Build a figure with two side-by-side horizontal bar charts.

    Args:
        df_left: Data for the left chart.
        df_right: Data for the right chart.
        col_y: Column supplying each bar's y value.
        col_x: Column supplying each bar's x value; the same values also
            drive the bar colour through the colorscale.
        palette_left: Colorscale for the left chart.
        palette_right: Colorscale for the right chart.
        title_left: Left subplot title (rendered bold).
        title_right: Right subplot title (rendered bold).

    Returns:
        The configured figure: transparent backgrounds, white font, height
        600 and no legend.
    """
    fig = make_subplots(
        rows=1,
        cols=2,
        horizontal_spacing=0.22,
        subplot_titles=(f'<b>{title_left}</b>', f'<b>{title_right}</b>'),
    )

    # Both panels are built the same way; only the data and palette differ.
    panels = ((df_left, palette_left), (df_right, palette_right))
    for column_index, (frame, palette) in enumerate(panels, start=1):
        bars = go.Bar(
            x=frame[col_x],
            y=frame[col_y],
            orientation='h',
            marker=dict(color=frame[col_x], colorscale=palette),
        )
        fig.add_trace(bars, 1, column_index)

    fig.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font_color="white",
        height=600,
        showlegend=False,
        margin=dict(t=100, b=50, l=150, r=20),
    )
    fig.update_yaxes(automargin=True)
    return fig
# --- LAYOUT ---
def _build_layout():
    """Assemble the page's component tree.

    Reads the module-level ``df``, colour constants and pre-rendered word-cloud
    images. Sections appear in this order: KPI strip, class distribution,
    vocabulary richness, TF-IDF terms, bigrams, word clouds.
    """
    transparent = 'rgba(0,0,0,0)'
    class_colors = {"yes": NEON_FUCHSIA, "no": NEON_GREEN}
    is_fraud = df["Fraud"] == "yes"
    is_clean = df["Fraud"] == "no"

    def themed_graph(fig, **layout_options):
        # Transparent backgrounds and white text, plus any per-chart options.
        fig.update_layout(
            paper_bgcolor=transparent,
            plot_bgcolor=transparent,
            font_color="white",
            **layout_options,
        )
        return dcc.Graph(figure=fig)

    def section(heading, content):
        # A full-width dark card holding a heading followed by one component.
        card_body = dbc.CardBody([
            html.H4(heading, className="text-white fw-bold mb-4"),
            content,
        ])
        card = dbc.Card(
            [card_body],
            style={"backgroundColor": DARK_BG, "border": "1px solid #222"},
        )
        return dbc.Row([dbc.Col(card, width=12)], className="mb-5")

    def lens(caption, accent, image_src):
        # A captioned word-cloud image clipped to a circular, bordered frame.
        frame_style = {
            "width": "420px",
            "height": "420px",
            "borderRadius": "50%",
            "border": f"5px solid {accent}",
            "margin": "0 auto",
            "overflow": "hidden",
        }
        return dbc.Col([
            html.H4(caption, className="text-center", style={"color": accent}),
            html.Div(html.Img(src=image_src, style={"width": "100%"}), style=frame_style),
        ], lg=6)

    def class_box(metric, chart_title):
        # Box plot of one metric, split and coloured by the "Fraud" column.
        return px.box(df, x="Fraud", y=metric, color="Fraud", title=chart_title,
                      color_discrete_map=class_colors)

    # 1. KPI Strip
    kpi_specs = (
        ("Mean Unique Words", f"{df['unique_words'].mean():.1f}", NEON_GREEN),
        ("Fraud Lexical Diversity", f"{df.loc[is_fraud, 'lexical_diversity'].mean():.3f}", NEON_FUCHSIA),
        ("Standard Lexical Diversity", f"{df.loc[is_clean, 'lexical_diversity'].mean():.3f}", "#74B9FF"),
        ("Total Reports", f"{len(df):,}", "#A29BFE"),
    )
    kpi_strip = dbc.Row(
        [dbc.Col(create_kpi_card(*spec), lg=3) for spec in kpi_specs],
        className="g-4 mb-5",
    )

    # 2. Target Distribution Section
    count_fig = px.histogram(
        df, x="Fraud", color="Fraud", text_auto=True,
        color_discrete_map=class_colors,
        title="Class Distribution: Count",
    )
    share_fig = px.pie(
        df, names="Fraud", hole=0.4,
        color="Fraud", color_discrete_map=class_colors,
        title="Class Distribution: Proportion",
    )
    distribution = section("Target Distribution & Class Balance", dbc.Row([
        dbc.Col(themed_graph(count_fig, height=350, showlegend=False), lg=7),
        dbc.Col(themed_graph(share_fig, height=350), lg=5),
    ]))

    # 3. Lexical Diversity & Word Variance
    vocabulary = section("Vocabulary Richness Analysis", dbc.Row([
        dbc.Col(themed_graph(class_box("unique_words", "Unique Word Variance"), height=380), lg=6),
        dbc.Col(themed_graph(class_box("lexical_diversity", "Lexical Diversity Ratio"), height=380), lg=6),
    ]))

    fraud_text = df.loc[is_fraud, "clean_text"]
    clean_text = df.loc[is_clean, "clean_text"]

    # 4. TF-IDF Terms
    tfidf_fig = create_forensic_subplots(
        top_tfidf_terms(fraud_text),
        top_tfidf_terms(clean_text),
        "term", "avg_tfidf", "Reds", "Greens",
        "Fraudulent Significance", "Non-Fraudulent Significance",
    )
    tfidf_terms = section("Top 20 TF-IDF Terms by Class", dcc.Graph(figure=tfidf_fig))

    # 5. N-Gram Analysis (Bigrams)
    bigram_fig = create_forensic_subplots(
        get_top_ngrams(fraud_text),
        get_top_ngrams(clean_text),
        "ngram", "count", "Reds", "Greens",
        "Top Bigrams — Fraudulent", "Top Bigrams — Non-Fraudulent",
    )
    bigrams = section("Top 20 Bigrams by Class", dcc.Graph(figure=bigram_fig))

    # 6. Word Clouds
    word_clouds = dbc.Card(
        [
            dbc.CardBody([
                html.H2("Semantic Fingerprints (Word Clouds)",
                        className="text-white fw-bold text-center mb-5"),
                dbc.Row([
                    lens("Anomalous Lens", NEON_FUCHSIA, src_fraud),
                    lens("Standard Lens", NEON_GREEN, src_clean),
                ]),
            ], className="py-5"),
        ],
        style={"backgroundColor": "#050505", "border": "1px solid #222", "borderRadius": "25px"},
        className="mb-5",
    )

    return dbc.Container(
        [
            html.H1("Forensic Performance Engine", className="text-white fw-bold mt-4 mb-0"),
            html.P("Auditing target distribution and linguistic fingerprints.", className="text-muted mb-5"),
            kpi_strip,
            distribution,
            vocabulary,
            tfidf_terms,
            bigrams,
            word_clouds,
        ],
        fluid=True,
        style={"backgroundColor": "#000", "minHeight": "100vh", "padding": "0 5%"},
    )


# Dash Pages picks up the module-level ``layout`` attribute for this page.
layout = _build_layout()