| import gradio as gr |
| import pandas as pd |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from plotly.subplots import make_subplots |
| import duckdb |
| import numpy as np |
| from datetime import datetime |
| import os |
|
|
| |
| DATABASE_PATH = "./data/h1bs_analytics.duckdb" |
|
|
| def get_db_connection(): |
| """Create a connection to the DuckDB database""" |
| if os.path.exists(DATABASE_PATH): |
| return duckdb.connect(DATABASE_PATH, read_only=True) |
| else: |
| |
| return create_sample_data() |
|
|
| def create_sample_data(): |
| """Create sample H1B facts data for demonstration""" |
| conn = duckdb.connect(":memory:") |
| |
| |
| np.random.seed(42) |
| n_records = 5000 |
| |
| sample_facts = pd.DataFrame({ |
| 'record_id': range(1, n_records + 1), |
| 'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records), |
| 'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records), |
| 'country_of_birth': np.random.choice([ |
| 'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM', |
| 'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL' |
| ], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]), |
| 'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0), |
| 'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]), |
| 'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45), |
| 'years_since_application': np.random.choice([0, 1, 2, 3], n_records), |
| 'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]), |
| 'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]), |
| 'employer_sk': [f'EMP_{i%500}' for i in range(n_records)], |
| 'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)], |
| 'job_sk': [f'JOB_{i%300}' for i in range(n_records)] |
| }) |
| |
| conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts") |
| |
| return conn |
|
|
| |
| conn = get_db_connection() |
|
|
| def load_facts_data(): |
| """Load H1B applications fact table""" |
| try: |
| query = """ |
| SELECT * FROM fct_h1b_applications |
| WHERE wage_amt IS NOT NULL |
| LIMIT 10000 |
| """ |
| return conn.execute(query).df() |
| except Exception as e: |
| print(f"Error loading facts data: {e}") |
| return pd.DataFrame() |
|
|
| |
| facts_df = load_facts_data() |
|
|
| |
|
|
| def facts_overview(): |
| """Overview of the facts table with key metrics""" |
| if facts_df.empty: |
| return go.Figure().update_layout(title="No facts data available") |
| |
| |
| total_records = len(facts_df) |
| avg_wage = facts_df['wage_amt'].mean() |
| median_wage = facts_df['wage_amt'].median() |
| multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100 |
| |
| |
| fig = make_subplots( |
| rows=2, cols=2, |
| specs=[[{"type": "indicator"}, {"type": "indicator"}], |
| [{"type": "indicator"}, {"type": "indicator"}]], |
| subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %") |
| ) |
| |
| fig.add_trace( |
| go.Indicator( |
| mode="number", |
| value=total_records, |
| number={"valueformat": ","}, |
| title={"text": "Total Records"} |
| ), |
| row=1, col=1 |
| ) |
| |
| fig.add_trace( |
| go.Indicator( |
| mode="number", |
| value=avg_wage, |
| number={"prefix": "$", "valueformat": ",.0f"}, |
| title={"text": "Average Wage"} |
| ), |
| row=1, col=2 |
| ) |
| |
| fig.add_trace( |
| go.Indicator( |
| mode="number", |
| value=median_wage, |
| number={"prefix": "$", "valueformat": ",.0f"}, |
| title={"text": "Median Wage"} |
| ), |
| row=2, col=1 |
| ) |
| |
| fig.add_trace( |
| go.Indicator( |
| mode="number", |
| value=multiple_reg_pct, |
| number={"suffix": "%", "valueformat": ".1f"}, |
| title={"text": "Multiple Registrations"} |
| ), |
| row=2, col=2 |
| ) |
| |
| fig.update_layout( |
| height=400, |
| title_text="H1B Facts Table - Key Metrics" |
| ) |
| |
| return fig |
|
|
| def wage_distribution(): |
| """Visualize wage distribution from facts table""" |
| if facts_df.empty: |
| return go.Figure().update_layout(title="No data available") |
| |
| fig = make_subplots( |
| rows=1, cols=2, |
| specs=[[{"type": "histogram"}, {"type": "box"}]], |
| subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)") |
| ) |
| |
| |
| fig.add_trace( |
| go.Histogram( |
| x=facts_df['wage_amt'], |
| nbinsx=50, |
| marker_color='skyblue', |
| opacity=0.7, |
| name='Wage Distribution' |
| ), |
| row=1, col=1 |
| ) |
| |
| |
| fig.add_trace( |
| go.Box( |
| y=facts_df['wage_amt'], |
| marker_color='lightcoral', |
| name='Wage Box Plot' |
| ), |
| row=1, col=2 |
| ) |
| |
| fig.update_layout( |
| height=500, |
| title_text="Wage Analysis from Facts Table", |
| showlegend=False |
| ) |
| |
| fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1) |
| fig.update_yaxes(title_text="Frequency", row=1, col=1) |
| fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2) |
| |
| return fig |
|
|
| def country_analysis(): |
| """Analyze country distribution from facts table""" |
| if facts_df.empty: |
| return go.Figure().update_layout(title="No data available") |
| |
| |
| country_counts = facts_df['country_of_birth'].value_counts().head(10) |
| |
| |
| country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index() |
| country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean') |
| |
| fig = make_subplots( |
| rows=1, cols=2, |
| specs=[[{"type": "bar"}, {"type": "bar"}]], |
| subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)") |
| ) |
| |
| |
| fig.add_trace( |
| go.Bar( |
| x=country_counts.index, |
| y=country_counts.values, |
| marker_color='teal', |
| text=country_counts.values, |
| textposition='auto', |
| name='Application Count' |
| ), |
| row=1, col=1 |
| ) |
| |
| |
| fig.add_trace( |
| go.Bar( |
| x=country_wages['country_of_birth'], |
| y=country_wages['mean'], |
| marker_color='orange', |
| text=['$' + f"{x:,.0f}" for x in country_wages['mean']], |
| textposition='auto', |
| name='Average Wage' |
| ), |
| row=1, col=2 |
| ) |
| |
| fig.update_layout( |
| height=500, |
| title_text="Country Analysis from Facts Table", |
| showlegend=False |
| ) |
| |
| fig.update_xaxes(tickangle=45, row=1, col=1) |
| fig.update_xaxes(tickangle=45, row=1, col=2) |
| fig.update_yaxes(title_text="Number of Applications", row=1, col=1) |
| fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2) |
| |
| return fig |
|
|
| def temporal_analysis(): |
| """Analyze temporal patterns from facts table""" |
| if facts_df.empty: |
| return go.Figure().update_layout(title="No data available") |
| |
| |
| yearly_stats = facts_df.groupby('fiscal_year').agg({ |
| 'record_id': 'count', |
| 'wage_amt': 'mean', |
| 'is_multiple_registration': 'mean' |
| }).reset_index() |
| |
| yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100 |
| |
| fig = make_subplots( |
| rows=2, cols=1, |
| specs=[[{"secondary_y": True}], [{"type": "bar"}]], |
| subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year") |
| ) |
| |
| |
| fig.add_trace( |
| go.Scatter( |
| x=yearly_stats['fiscal_year'], |
| y=yearly_stats['record_id'], |
| mode='lines+markers', |
| name='Applications', |
| line=dict(color='blue', width=3), |
| marker=dict(size=8) |
| ), |
| row=1, col=1 |
| ) |
| |
| |
| fig.add_trace( |
| go.Scatter( |
| x=yearly_stats['fiscal_year'], |
| y=yearly_stats['wage_amt'], |
| mode='lines+markers', |
| name='Average Wage', |
| line=dict(color='red', width=3), |
| marker=dict(size=8), |
| yaxis='y2' |
| ), |
| row=1, col=1 |
| ) |
| |
| |
| fig.add_trace( |
| go.Bar( |
| x=yearly_stats['fiscal_year'], |
| y=yearly_stats['multiple_reg_pct'], |
| marker_color='green', |
| text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']], |
| textposition='auto', |
| name='Multiple Registration %' |
| ), |
| row=2, col=1 |
| ) |
| |
| |
| fig.update_layout( |
| height=600, |
| title_text="Temporal Analysis from Facts Table" |
| ) |
| |
| |
| fig.update_yaxes(title_text="Number of Applications", row=1, col=1) |
| fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1) |
| fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1) |
| fig.update_xaxes(title_text="Fiscal Year", row=2, col=1) |
| |
| return fig |
|
|
| def demographic_analysis(): |
| """Analyze demographic patterns from facts table""" |
| if facts_df.empty: |
| return go.Figure().update_layout(title="No data available") |
| |
| |
| age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False) |
| age_counts = age_bins.value_counts().sort_index() |
| |
| |
| employment_type = facts_df['full_time_ind'].value_counts() |
| employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index] |
| |
| |
| same_state = facts_df['employer_worksite_same_state'].value_counts() |
| same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index] |
| |
| fig = make_subplots( |
| rows=2, cols=2, |
| specs=[[{"type": "bar"}, {"type": "pie"}], |
| [{"type": "pie"}, {"type": "histogram"}]], |
| subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application") |
| ) |
| |
| |
| fig.add_trace( |
| go.Bar( |
| x=[str(interval) for interval in age_counts.index], |
| y=age_counts.values, |
| marker_color='lightblue', |
| name='Age Distribution' |
| ), |
| row=1, col=1 |
| ) |
| |
| |
| fig.add_trace( |
| go.Pie( |
| labels=employment_labels, |
| values=employment_type.values, |
| name="Employment Type" |
| ), |
| row=1, col=2 |
| ) |
| |
| |
| fig.add_trace( |
| go.Pie( |
| labels=same_state_labels, |
| values=same_state.values, |
| name="Location" |
| ), |
| row=2, col=1 |
| ) |
| |
| |
| years_since = facts_df['years_since_application'].value_counts().sort_index() |
| fig.add_trace( |
| go.Histogram( |
| x=facts_df['years_since_application'], |
| nbinsx=10, |
| marker_color='lightgreen', |
| name='Years Since Application' |
| ), |
| row=2, col=2 |
| ) |
| |
| fig.update_layout( |
| height=600, |
| title_text="Demographic Analysis from Facts Table", |
| showlegend=False |
| ) |
| |
| return fig |
|
|
| def facts_data_table(): |
| """Display sample of facts table data""" |
| if facts_df.empty: |
| return pd.DataFrame() |
| |
| |
| display_cols = [ |
| 'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth', |
| 'wage_amt', 'age_at_application', 'is_multiple_registration', |
| 'full_time_ind', 'employer_worksite_same_state' |
| ] |
| |
| sample_data = facts_df[display_cols].head(100).copy() |
| |
| |
| sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}") |
| |
| return sample_data |
|
|
| |
|
|
| with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo: |
| gr.Markdown("# π H1B Facts Table Analytics Dashboard") |
| gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data") |
| |
| with gr.Tab("π Facts Overview"): |
| gr.Markdown("### Key Metrics from Facts Table") |
| facts_overview_plot = gr.Plot() |
| gr.Button("Load Facts Overview", variant="primary").click( |
| fn=facts_overview, |
| outputs=facts_overview_plot |
| ) |
| |
| with gr.Tab("π° Wage Analysis"): |
| gr.Markdown("### Wage Distribution from Facts Table") |
| wage_plot = gr.Plot() |
| gr.Button("Analyze Wages", variant="primary").click( |
| fn=wage_distribution, |
| outputs=wage_plot |
| ) |
| |
| with gr.Tab("π Country Analysis"): |
| gr.Markdown("### Country-wise Analysis from Facts Table") |
| country_plot = gr.Plot() |
| gr.Button("Analyze Countries", variant="primary").click( |
| fn=country_analysis, |
| outputs=country_plot |
| ) |
| |
| with gr.Tab("π
Temporal Analysis"): |
| gr.Markdown("### Time-based Trends from Facts Table") |
| temporal_plot = gr.Plot() |
| gr.Button("Analyze Trends", variant="primary").click( |
| fn=temporal_analysis, |
| outputs=temporal_plot |
| ) |
| |
| with gr.Tab("π₯ Demographics"): |
| gr.Markdown("### Demographic Patterns from Facts Table") |
| demo_plot = gr.Plot() |
| gr.Button("Analyze Demographics", variant="primary").click( |
| fn=demographic_analysis, |
| outputs=demo_plot |
| ) |
| |
| with gr.Tab("π Raw Data"): |
| gr.Markdown("### Sample Facts Table Data (First 100 rows)") |
| data_table = gr.DataFrame() |
| gr.Button("Load Sample Data", variant="primary").click( |
| fn=facts_data_table, |
| outputs=data_table |
| ) |
| |
| |
| gr.Markdown("---") |
| gr.Markdown("### Facts Table Schema") |
| gr.Markdown(""" |
| **Table**: `fct_h1b_applications` |
| |
| **Key Columns**: |
| - `record_id`: Unique identifier for each application |
| - `lottery_year`, `fiscal_year`: Temporal dimensions |
| - `country_of_birth`: Beneficiary country |
| - `wage_amt`: Offered wage amount |
| - `age_at_application`: Beneficiary age |
| - `is_multiple_registration`: Multiple lottery entries flag |
| - `full_time_ind`: Full-time employment indicator |
| - `employer_worksite_same_state`: Location alignment flag |
| - Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk` |
| """) |
|
|
| |
| if __name__ == "__main__": |
| demo.launch( |
| server_name="0.0.0.0", |
| server_port=7860, |
| share=True, |
| show_error=True |
| ) |