Spaces:

hellosara
/

fraud_financial

Sleeping

App Files Files Community

fraud_financial / pages /analytics.py

hellosara

Update pages/analytics.py

ee94c4a verified 16 days ago

raw

history blame contribute delete

9.88 kB

	import dash
	from dash import html, dcc
	import dash_bootstrap_components as dbc
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
	import io
	import base64
	from datasets import load_dataset
	from wordcloud import WordCloud

	# Register this module with Dash's multi-page system under the URL path
	# "/performance" and the page name "Forensic Performance".
	dash.register_page(__name__, path='/performance', name="Forensic Performance")

	# --- DATA ENGINE ---
	def load_performance_data():
	    """Load the fraud filings dataset and derive per-document lexical features.

	    Fetches ``amitkedia/Financial-Fraud-Dataset`` through ``load_dataset``,
	    converts the first split it returns into a DataFrame, and adds:

	    * ``clean_text`` - lower-cased ``Fillings`` text ("" when the filing is
	      missing).
	    * ``word_count`` - number of whitespace-separated tokens.
	    * ``unique_words`` - number of distinct lower-cased tokens.
	    * ``lexical_diversity`` - ``unique_words / (word_count + 1)``.

	    Returns:
	        pandas.DataFrame: the dataset's own columns plus the four derived ones.
	    """
	    raw_data = load_dataset("amitkedia/Financial-Fraud-Dataset")
	    # No split name is requested, so use whichever split is listed first.
	    first_split = next(iter(raw_data))
	    df = pd.DataFrame(raw_data[first_split])

	    # astype(str) alone would turn a missing filing into the literal text
	    # "nan"/"None", which would then be counted as a word and show up in
	    # every text-derived statistic; blank those rows out instead.
	    filings = df["Fillings"]
	    df["clean_text"] = filings.astype(str).where(filings.notna(), "").str.lower()

	    # Tokenise once and derive both counts from the same token lists
	    # (lower-casing does not change where whitespace splits occur).
	    tokens = df["clean_text"].str.split()
	    df["word_count"] = tokens.str.len()
	    # Vocabulary Richness Logic
	    df["unique_words"] = tokens.apply(lambda words: len(set(words)))
	    # The +1 keeps the ratio defined when a document has no tokens.
	    df["lexical_diversity"] = df["unique_words"] / (df["word_count"] + 1)
	    return df

	# Loaded once when the module is imported; the word clouds and the page
	# layout below are all computed from this frame.
	df = load_performance_data()
	# Shared palette for the page.
	NEON_GREEN = "#55EFC4"  # colour mapped to the "no" class in the charts below
	NEON_FUCHSIA = "#FF00FF"  # colour mapped to the "yes" class in the charts below
	DARK_BG = "#0c0c0c"  # background colour of the section cards

	# --- WORD CLOUD ENGINE ---
	def get_wc_base64(text, colormap):
	    """Render *text* as a word cloud and return it as a PNG data URI.

	    Args:
	        text: Source text handed to ``WordCloud.generate``.
	        colormap: Colormap name passed through to ``WordCloud``.

	    Returns:
	        str: ``"data:image/png;base64,"`` followed by the base64-encoded PNG,
	        suitable for use as an image ``src``.
	    """
	    cloud = WordCloud(
	        width=1000,
	        height=1000,
	        background_color="rgba(0,0,0,0)",
	        mode="RGBA",
	        colormap=colormap,
	        max_words=250,
	    )
	    cloud.generate(text)

	    # Serialise the rendered image into memory rather than onto disk.
	    png_buffer = io.BytesIO()
	    cloud.to_image().save(png_buffer, format="PNG")
	    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
	    return "data:image/png;base64," + encoded_png

	# Pre-rendered word-cloud images, one per class, built from all of that
	# class's lower-cased filings joined into a single string.
	src_fraud = get_wc_base64(
	    " ".join(df.loc[df["Fraud"] == "yes", "clean_text"]),
	    "spring",
	)
	src_clean = get_wc_base64(
	    " ".join(df.loc[df["Fraud"] == "no", "clean_text"]),
	    "summer",
	)

	# --- FORENSIC ENGINES (NLP Logic) ---
	def get_top_ngrams(corpus, n=2, top_n=20):
	    """Count n-grams of length *n* across *corpus*.

	    Args:
	        corpus: Iterable of documents passed to ``CountVectorizer.fit_transform``.
	        n: N-gram length; used as both ends of ``ngram_range``.
	        top_n: Value given to ``max_features``, capping the vocabulary size.

	    Returns:
	        pandas.DataFrame: columns ``ngram`` and ``count`` (occurrences summed
	        over all documents), sorted by ``count`` ascending.
	    """
	    vectorizer = CountVectorizer(
	        stop_words="english",
	        ngram_range=(n, n),
	        max_features=top_n,
	    )
	    doc_term = vectorizer.fit_transform(corpus)
	    # Column sums of the document-term matrix, flattened to 1-D.
	    totals = np.asarray(doc_term.sum(axis=0)).ravel()
	    table = pd.DataFrame(
	        {"ngram": vectorizer.get_feature_names_out(), "count": totals}
	    )
	    # Ascending order is deliberate: with Plotly's horizontal ('h') bars it
	    # gives the intended top-down visual priority.
	    return table.sort_values("count", ascending=True)

	def top_tfidf_terms(corpus, n=20):
	    """Rank terms in *corpus* by their mean TF-IDF weight.

	    Args:
	        corpus: Iterable of documents passed to ``TfidfVectorizer.fit_transform``.
	        n: Number of highest-scoring rows to keep.

	    Returns:
	        pandas.DataFrame: columns ``term`` and ``avg_tfidf`` (weight averaged
	        over all documents), sorted ascending and cut to the last *n* rows.
	    """
	    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
	    weights = vectorizer.fit_transform(corpus)
	    # Per-term mean over documents, flattened to 1-D.
	    mean_scores = np.asarray(weights.mean(axis=0)).ravel()
	    ranked = pd.DataFrame(
	        {"term": vectorizer.get_feature_names_out(), "avg_tfidf": mean_scores}
	    ).sort_values("avg_tfidf", ascending=True)
	    # After an ascending sort the best-scoring terms are at the end.
	    return ranked.tail(n)

	# --- UI HELPERS ---
	def create_kpi_card(title, value, color):
	    """Build a KPI tile: value and caption on the left, colour bar on the right.

	    Args:
	        title: Caption rendered in small text under the value.
	        value: Headline content rendered in the ``H2``.
	        color: Background colour of the narrow right-hand column.

	    Returns:
	        html.Div: an 80px-high container holding a single two-column row.
	    """
	    headline = html.H2(
	        value,
	        className="fw-bold mb-0",
	        style={"color": "white", "fontSize": "22px"},
	    )
	    caption = html.Small(
	        title,
	        className="text-muted text-uppercase fw-bold",
	        style={"fontSize": "9px"},
	    )
	    text_column = dbc.Col(
	        [headline, caption],
	        width=9,
	        className="ps-4 d-flex flex-column justify-content-center",
	    )
	    # Child-less column that only contributes its background colour.
	    accent_column = dbc.Col(
	        style={"backgroundColor": color, "height": "100%"},
	        width=3,
	    )
	    content_row = dbc.Row(
	        [text_column, accent_column],
	        className="g-0 align-items-stretch",
	        style={"height": "100%"},
	    )
	    tile_style = {
	        "backgroundColor": "#111",
	        "border": "1px solid #222",
	        "borderRadius": "10px",
	        "height": "80px",
	        "overflow": "hidden",
	    }
	    return html.Div([content_row], style=tile_style)

	def create_forensic_subplots(df_left, df_right, col_y, col_x, palette_left, palette_right, title_left, title_right):
	    """Draw two side-by-side horizontal bar charts, one per input frame.

	    Args:
	        df_left: Frame plotted in the left subplot.
	        df_right: Frame plotted in the right subplot.
	        col_y: Column supplying the bar labels (y axis).
	        col_x: Column supplying the bar lengths (x axis) and the bar colours.
	        palette_left: Colorscale for the left bars.
	        palette_right: Colorscale for the right bars.
	        title_left: Left subplot title (wrapped in ``<b>`` tags).
	        title_right: Right subplot title (wrapped in ``<b>`` tags).

	    Returns:
	        The figure created by ``make_subplots`` with both bar traces added.
	    """
	    bold_titles = tuple(f'<b>{title}</b>' for title in (title_left, title_right))
	    fig = make_subplots(
	        rows=1,
	        cols=2,
	        horizontal_spacing=0.22,
	        subplot_titles=bold_titles,
	    )

	    # Same trace recipe for both panels; only the data, palette and target
	    # column differ.
	    panels = ((df_left, palette_left), (df_right, palette_right))
	    for col_index, (frame, palette) in enumerate(panels, start=1):
	        bars = go.Bar(
	            x=frame[col_x],
	            y=frame[col_y],
	            orientation='h',
	            marker=dict(color=frame[col_x], colorscale=palette),
	        )
	        fig.add_trace(bars, row=1, col=col_index)

	    fig.update_layout(
	        paper_bgcolor='rgba(0,0,0,0)',
	        plot_bgcolor='rgba(0,0,0,0)',
	        font_color="white",
	        height=600,
	        showlegend=False,
	        margin=dict(t=100, b=50, l=150, r=20),
	    )
	    fig.update_yaxes(automargin=True)
	    return fig

	# --- LAYOUT ---
	# Colour assigned to each value of the "Fraud" column in the class charts.
	_CLASS_COLORS = {"yes": NEON_FUCHSIA, "no": NEON_GREEN}


	def _themed_graph(fig, **layout_overrides):
	    """Apply the transparent-background, white-font theme and wrap in a Graph."""
	    fig.update_layout(
	        paper_bgcolor='rgba(0,0,0,0)',
	        plot_bgcolor='rgba(0,0,0,0)',
	        font_color="white",
	        **layout_overrides,
	    )
	    return dcc.Graph(figure=fig)


	def _section_card(heading, *body):
	    """Return a full-width row holding one dark card: an H4 heading, then *body*."""
	    card = dbc.Card(
	        [
	            dbc.CardBody([
	                html.H4(heading, className="text-white fw-bold mb-4"),
	                *body,
	            ])
	        ],
	        style={"backgroundColor": DARK_BG, "border": "1px solid #222"},
	    )
	    return dbc.Row([dbc.Col(card, width=12)], className="mb-5")


	def _cloud_lens(label, accent, image_src):
	    """Return a half-width column showing one word cloud in a circular frame."""
	    frame_style = {
	        "width": "420px",
	        "height": "420px",
	        "borderRadius": "50%",
	        "border": f"5px solid {accent}",
	        "margin": "0 auto",
	        "overflow": "hidden",
	    }
	    return dbc.Col(
	        [
	            html.H4(label, className="text-center", style={"color": accent}),
	            html.Div(
	                html.Img(src=image_src, style={"width": "100%"}),
	                style=frame_style,
	            ),
	        ],
	        lg=6,
	    )


	# 1. KPI Strip
	_kpi_specs = [
	    ("Mean Unique Words", f"{df['unique_words'].mean():.1f}", NEON_GREEN),
	    ("Fraud Lexical Diversity", f"{df[df['Fraud']=='yes']['lexical_diversity'].mean():.3f}", NEON_FUCHSIA),
	    ("Standard Lexical Diversity", f"{df[df['Fraud']=='no']['lexical_diversity'].mean():.3f}", "#74B9FF"),
	    ("Total Reports", f"{len(df):,}", "#A29BFE"),
	]
	_kpi_strip = dbc.Row(
	    [
	        dbc.Col(create_kpi_card(kpi_title, kpi_value, kpi_color), lg=3)
	        for kpi_title, kpi_value, kpi_color in _kpi_specs
	    ],
	    className="g-4 mb-5",
	)

	# 2. Target Distribution Section
	_distribution_section = _section_card(
	    "Target Distribution & Class Balance",
	    dbc.Row([
	        dbc.Col(
	            _themed_graph(
	                px.histogram(
	                    df,
	                    x="Fraud",
	                    color="Fraud",
	                    text_auto=True,
	                    color_discrete_map=_CLASS_COLORS,
	                    title="Class Distribution: Count",
	                ),
	                height=350,
	                showlegend=False,
	            ),
	            lg=7,
	        ),
	        dbc.Col(
	            _themed_graph(
	                px.pie(
	                    df,
	                    names="Fraud",
	                    hole=0.4,
	                    color="Fraud",
	                    color_discrete_map=_CLASS_COLORS,
	                    title="Class Distribution: Proportion",
	                ),
	                height=350,
	            ),
	            lg=5,
	        ),
	    ]),
	)

	# 3. Lexical Diversity & Word Variance
	_richness_section = _section_card(
	    "Vocabulary Richness Analysis",
	    dbc.Row([
	        dbc.Col(
	            _themed_graph(
	                px.box(
	                    df,
	                    x="Fraud",
	                    y=box_column,
	                    color="Fraud",
	                    title=box_title,
	                    color_discrete_map=_CLASS_COLORS,
	                ),
	                height=380,
	            ),
	            lg=6,
	        )
	        for box_column, box_title in (
	            ("unique_words", "Unique Word Variance"),
	            ("lexical_diversity", "Lexical Diversity Ratio"),
	        )
	    ]),
	)

	# 4. TF-IDF Terms
	_tfidf_section = _section_card(
	    "Top 20 TF-IDF Terms by Class",
	    dcc.Graph(
	        figure=create_forensic_subplots(
	            top_tfidf_terms(df.loc[df["Fraud"] == "yes", "clean_text"]),
	            top_tfidf_terms(df.loc[df["Fraud"] == "no", "clean_text"]),
	            "term",
	            "avg_tfidf",
	            "Reds",
	            "Greens",
	            "Fraudulent Significance",
	            "Non-Fraudulent Significance",
	        )
	    ),
	)

	# 5. N-Gram Analysis (Bigrams)
	_bigram_section = _section_card(
	    "Top 20 Bigrams by Class",
	    dcc.Graph(
	        figure=create_forensic_subplots(
	            get_top_ngrams(df.loc[df["Fraud"] == "yes", "clean_text"]),
	            get_top_ngrams(df.loc[df["Fraud"] == "no", "clean_text"]),
	            "ngram",
	            "count",
	            "Reds",
	            "Greens",
	            "Top Bigrams — Fraudulent",
	            "Top Bigrams — Non-Fraudulent",
	        )
	    ),
	)

	# 6. Word Clouds
	_wordcloud_section = dbc.Card(
	    [
	        dbc.CardBody(
	            [
	                html.H2(
	                    "Semantic Fingerprints (Word Clouds)",
	                    className="text-white fw-bold text-center mb-5",
	                ),
	                dbc.Row([
	                    _cloud_lens("Anomalous Lens", NEON_FUCHSIA, src_fraud),
	                    _cloud_lens("Standard Lens", NEON_GREEN, src_clean),
	                ]),
	            ],
	            className="py-5",
	        )
	    ],
	    style={"backgroundColor": "#050505", "border": "1px solid #222", "borderRadius": "25px"},
	    className="mb-5",
	)

	# Page root: title, subtitle, then the six sections in display order.
	layout = dbc.Container(
	    [
	        html.H1("Forensic Performance Engine", className="text-white fw-bold mt-4 mb-0"),
	        html.P("Auditing target distribution and linguistic fingerprints.", className="text-muted mb-5"),
	        _kpi_strip,
	        _distribution_section,
	        _richness_section,
	        _tfidf_section,
	        _bigram_section,
	        _wordcloud_section,
	    ],
	    fluid=True,
	    style={"backgroundColor": "#000", "minHeight": "100vh", "padding": "0 5%"},
	)