Spaces:

LeonceNsh
/

hones

Sleeping

App Files Files Community

hones / app.py

LeonceNsh

Upload folder using huggingface_hub

1885ec3 verified 10 months ago

raw

history blame contribute delete

15.3 kB

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots
	import duckdb
	import numpy as np
	from datetime import datetime
	import os

	# Database connection
	DATABASE_PATH = "./data/h1bs_analytics.duckdb"

	def get_db_connection():
	"""Create a connection to the DuckDB database"""
	if os.path.exists(DATABASE_PATH):
	return duckdb.connect(DATABASE_PATH, read_only=True)
	else:
	# Create sample data if database doesn't exist
	return create_sample_data()

	def create_sample_data():
	"""Create sample H1B facts data for demonstration"""
	conn = duckdb.connect(":memory:")

	# Sample fact table based on H1B schema
	np.random.seed(42)
	n_records = 5000

	sample_facts = pd.DataFrame({
	'record_id': range(1, n_records + 1),
	'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
	'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
	'country_of_birth': np.random.choice([
	'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM',
	'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL'
	], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]),
	'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0), # Log-normal for realistic wage distribution
	'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]),
	'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45),
	'years_since_application': np.random.choice([0, 1, 2, 3], n_records),
	'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]),
	'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]),
	'employer_sk': [f'EMP_{i%500}' for i in range(n_records)],
	'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)],
	'job_sk': [f'JOB_{i%300}' for i in range(n_records)]
	})

	conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts")

	return conn

	# Load data
	conn = get_db_connection()

	def load_facts_data():
	"""Load H1B applications fact table"""
	try:
	query = """
	SELECT * FROM fct_h1b_applications
	WHERE wage_amt IS NOT NULL
	LIMIT 10000
	"""
	return conn.execute(query).df()
	except Exception as e:
	print(f"Error loading facts data: {e}")
	return pd.DataFrame()

	# Load the facts data
	facts_df = load_facts_data()

	# ---- FACTS TABLE VISUALIZATIONS ----

	def facts_overview():
	"""Overview of the facts table with key metrics"""
	if facts_df.empty:
	return go.Figure().update_layout(title="No facts data available")

	# Key metrics
	total_records = len(facts_df)
	avg_wage = facts_df['wage_amt'].mean()
	median_wage = facts_df['wage_amt'].median()
	multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100

	# Create metrics dashboard
	fig = make_subplots(
	rows=2, cols=2,
	specs=[[{"type": "indicator"}, {"type": "indicator"}],
	[{"type": "indicator"}, {"type": "indicator"}]],
	subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %")
	)

	fig.add_trace(
	go.Indicator(
	mode="number",
	value=total_records,
	number={"valueformat": ","},
	title={"text": "Total Records"}
	),
	row=1, col=1
	)

	fig.add_trace(
	go.Indicator(
	mode="number",
	value=avg_wage,
	number={"prefix": "$", "valueformat": ",.0f"},
	title={"text": "Average Wage"}
	),
	row=1, col=2
	)

	fig.add_trace(
	go.Indicator(
	mode="number",
	value=median_wage,
	number={"prefix": "$", "valueformat": ",.0f"},
	title={"text": "Median Wage"}
	),
	row=2, col=1
	)

	fig.add_trace(
	go.Indicator(
	mode="number",
	value=multiple_reg_pct,
	number={"suffix": "%", "valueformat": ".1f"},
	title={"text": "Multiple Registrations"}
	),
	row=2, col=2
	)

	fig.update_layout(
	height=400,
	title_text="H1B Facts Table - Key Metrics"
	)

	return fig

	def wage_distribution():
	"""Visualize wage distribution from facts table"""
	if facts_df.empty:
	return go.Figure().update_layout(title="No data available")

	fig = make_subplots(
	rows=1, cols=2,
	specs=[[{"type": "histogram"}, {"type": "box"}]],
	subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)")
	)

	# Histogram
	fig.add_trace(
	go.Histogram(
	x=facts_df['wage_amt'],
	nbinsx=50,
	marker_color='skyblue',
	opacity=0.7,
	name='Wage Distribution'
	),
	row=1, col=1
	)

	# Box plot
	fig.add_trace(
	go.Box(
	y=facts_df['wage_amt'],
	marker_color='lightcoral',
	name='Wage Box Plot'
	),
	row=1, col=2
	)

	fig.update_layout(
	height=500,
	title_text="Wage Analysis from Facts Table",
	showlegend=False
	)

	fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1)
	fig.update_yaxes(title_text="Frequency", row=1, col=1)
	fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2)

	return fig

	def country_analysis():
	"""Analyze country distribution from facts table"""
	if facts_df.empty:
	return go.Figure().update_layout(title="No data available")

	# Country counts
	country_counts = facts_df['country_of_birth'].value_counts().head(10)

	# Average wage by country
	country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index()
	country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean') # Min 50 applications

	fig = make_subplots(
	rows=1, cols=2,
	specs=[[{"type": "bar"}, {"type": "bar"}]],
	subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)")
	)

	# Applications by country
	fig.add_trace(
	go.Bar(
	x=country_counts.index,
	y=country_counts.values,
	marker_color='teal',
	text=country_counts.values,
	textposition='auto',
	name='Application Count'
	),
	row=1, col=1
	)

	# Average wage by country
	fig.add_trace(
	go.Bar(
	x=country_wages['country_of_birth'],
	y=country_wages['mean'],
	marker_color='orange',
	text=['$' + f"{x:,.0f}" for x in country_wages['mean']],
	textposition='auto',
	name='Average Wage'
	),
	row=1, col=2
	)

	fig.update_layout(
	height=500,
	title_text="Country Analysis from Facts Table",
	showlegend=False
	)

	fig.update_xaxes(tickangle=45, row=1, col=1)
	fig.update_xaxes(tickangle=45, row=1, col=2)
	fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
	fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2)

	return fig

	def temporal_analysis():
	"""Analyze temporal patterns from facts table"""
	if facts_df.empty:
	return go.Figure().update_layout(title="No data available")

	# Yearly trends
	yearly_stats = facts_df.groupby('fiscal_year').agg({
	'record_id': 'count',
	'wage_amt': 'mean',
	'is_multiple_registration': 'mean'
	}).reset_index()

	yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100

	fig = make_subplots(
	rows=2, cols=1,
	specs=[[{"secondary_y": True}], [{"type": "bar"}]],
	subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year")
	)

	# Applications count
	fig.add_trace(
	go.Scatter(
	x=yearly_stats['fiscal_year'],
	y=yearly_stats['record_id'],
	mode='lines+markers',
	name='Applications',
	line=dict(color='blue', width=3),
	marker=dict(size=8)
	),
	row=1, col=1
	)

	# Average wage (secondary y-axis)
	fig.add_trace(
	go.Scatter(
	x=yearly_stats['fiscal_year'],
	y=yearly_stats['wage_amt'],
	mode='lines+markers',
	name='Average Wage',
	line=dict(color='red', width=3),
	marker=dict(size=8),
	yaxis='y2'
	),
	row=1, col=1
	)

	# Multiple registration percentage
	fig.add_trace(
	go.Bar(
	x=yearly_stats['fiscal_year'],
	y=yearly_stats['multiple_reg_pct'],
	marker_color='green',
	text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']],
	textposition='auto',
	name='Multiple Registration %'
	),
	row=2, col=1
	)

	# Update layout
	fig.update_layout(
	height=600,
	title_text="Temporal Analysis from Facts Table"
	)

	# Update y-axes
	fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
	fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1)
	fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1)
	fig.update_xaxes(title_text="Fiscal Year", row=2, col=1)

	return fig

	def demographic_analysis():
	"""Analyze demographic patterns from facts table"""
	if facts_df.empty:
	return go.Figure().update_layout(title="No data available")

	# Age distribution
	age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False)
	age_counts = age_bins.value_counts().sort_index()

	# Full-time vs Part-time
	employment_type = facts_df['full_time_ind'].value_counts()
	employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index]

	# Same state employment
	same_state = facts_df['employer_worksite_same_state'].value_counts()
	same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index]

	fig = make_subplots(
	rows=2, cols=2,
	specs=[[{"type": "bar"}, {"type": "pie"}],
	[{"type": "pie"}, {"type": "histogram"}]],
	subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application")
	)

	# Age distribution
	fig.add_trace(
	go.Bar(
	x=[str(interval) for interval in age_counts.index],
	y=age_counts.values,
	marker_color='lightblue',
	name='Age Distribution'
	),
	row=1, col=1
	)

	# Employment type pie chart
	fig.add_trace(
	go.Pie(
	labels=employment_labels,
	values=employment_type.values,
	name="Employment Type"
	),
	row=1, col=2
	)

	# Same state pie chart
	fig.add_trace(
	go.Pie(
	labels=same_state_labels,
	values=same_state.values,
	name="Location"
	),
	row=2, col=1
	)

	# Years since application
	years_since = facts_df['years_since_application'].value_counts().sort_index()
	fig.add_trace(
	go.Histogram(
	x=facts_df['years_since_application'],
	nbinsx=10,
	marker_color='lightgreen',
	name='Years Since Application'
	),
	row=2, col=2
	)

	fig.update_layout(
	height=600,
	title_text="Demographic Analysis from Facts Table",
	showlegend=False
	)

	return fig

	def facts_data_table():
	"""Display sample of facts table data"""
	if facts_df.empty:
	return pd.DataFrame()

	# Return first 100 rows with key columns
	display_cols = [
	'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth',
	'wage_amt', 'age_at_application', 'is_multiple_registration',
	'full_time_ind', 'employer_worksite_same_state'
	]

	sample_data = facts_df[display_cols].head(100).copy()

	# Format wage column
	sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}")

	return sample_data

	# ---- GRADIO INTERFACE ----

	with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo:
	gr.Markdown("# 📊 H1B Facts Table Analytics Dashboard")
	gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data")

	with gr.Tab("📈 Facts Overview"):
	gr.Markdown("### Key Metrics from Facts Table")
	facts_overview_plot = gr.Plot()
	gr.Button("Load Facts Overview", variant="primary").click(
	fn=facts_overview,
	outputs=facts_overview_plot
	)

	with gr.Tab("💰 Wage Analysis"):
	gr.Markdown("### Wage Distribution from Facts Table")
	wage_plot = gr.Plot()
	gr.Button("Analyze Wages", variant="primary").click(
	fn=wage_distribution,
	outputs=wage_plot
	)

	with gr.Tab("🌍 Country Analysis"):
	gr.Markdown("### Country-wise Analysis from Facts Table")
	country_plot = gr.Plot()
	gr.Button("Analyze Countries", variant="primary").click(
	fn=country_analysis,
	outputs=country_plot
	)

	with gr.Tab("📅 Temporal Analysis"):
	gr.Markdown("### Time-based Trends from Facts Table")
	temporal_plot = gr.Plot()
	gr.Button("Analyze Trends", variant="primary").click(
	fn=temporal_analysis,
	outputs=temporal_plot
	)

	with gr.Tab("👥 Demographics"):
	gr.Markdown("### Demographic Patterns from Facts Table")
	demo_plot = gr.Plot()
	gr.Button("Analyze Demographics", variant="primary").click(
	fn=demographic_analysis,
	outputs=demo_plot
	)

	with gr.Tab("📋 Raw Data"):
	gr.Markdown("### Sample Facts Table Data (First 100 rows)")
	data_table = gr.DataFrame()
	gr.Button("Load Sample Data", variant="primary").click(
	fn=facts_data_table,
	outputs=data_table
	)

	# Footer
	gr.Markdown("---")
	gr.Markdown("### Facts Table Schema")
	gr.Markdown("""
	Table: `fct_h1b_applications`

	Key Columns:
	- `record_id`: Unique identifier for each application
	- `lottery_year`, `fiscal_year`: Temporal dimensions
	- `country_of_birth`: Beneficiary country
	- `wage_amt`: Offered wage amount
	- `age_at_application`: Beneficiary age
	- `is_multiple_registration`: Multiple lottery entries flag
	- `full_time_ind`: Full-time employment indicator
	- `employer_worksite_same_state`: Location alignment flag
	- Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk`
	""")

	# Launch the app
	if __name__ == "__main__":
	demo.launch(
	server_name="0.0.0.0",
	server_port=7860,
	share=True,
	show_error=True
	)