Upload folder using huggingface_hub

- .gradio/certificate.pem +31 -0
- README.md +2 -8
- __pycache__/model_and_load_toduckdb.cpython-311.pyc +0 -0
- app.py +489 -0
- model_and_load_toduckdb.py +973 -0
.gradio/certificate.pem
ADDED

@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
README.md
CHANGED

@@ -1,12 +1,6 @@
 ---
-title:
-
-colorFrom: blue
-colorTo: gray
+title: hones
+app_file: app.py
 sdk: gradio
 sdk_version: 5.34.2
-app_file: app.py
-pinned: false
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
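The trimmed front matter still validates against the Spaces config reference linked in the removed line above. For completeness, a header combining every field that appears in either version of this README would read:

    ---
    title: hones
    colorFrom: blue
    colorTo: gray
    sdk: gradio
    sdk_version: 5.34.2
    app_file: app.py
    pinned: false
    ---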
__pycache__/model_and_load_toduckdb.cpython-311.pyc
ADDED

Binary file (50.4 kB)
app.py
ADDED

@@ -0,0 +1,489 @@
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import duckdb
import numpy as np
from datetime import datetime
import os

# Database connection
DATABASE_PATH = "./data/h1bs_analytics.duckdb"

def get_db_connection():
    """Create a connection to the DuckDB database"""
    if os.path.exists(DATABASE_PATH):
        return duckdb.connect(DATABASE_PATH, read_only=True)
    else:
        # Create sample data if the database doesn't exist
        return create_sample_data()

def create_sample_data():
    """Create sample H1B facts data for demonstration"""
    conn = duckdb.connect(":memory:")

    # Sample fact table based on the H1B schema
    np.random.seed(42)
    n_records = 5000

    sample_facts = pd.DataFrame({
        'record_id': range(1, n_records + 1),
        'lottery_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
        'fiscal_year': np.random.choice([2021, 2022, 2023, 2024], n_records),
        'country_of_birth': np.random.choice([
            'INDIA', 'CHINA', 'SOUTH KOREA', 'CANADA', 'UNITED KINGDOM',
            'PHILIPPINES', 'TAIWAN', 'JAPAN', 'MEXICO', 'BRAZIL'
        ], n_records, p=[0.4, 0.15, 0.1, 0.08, 0.07, 0.05, 0.05, 0.04, 0.03, 0.03]),
        'wage_amt': np.random.lognormal(11.2, 0.5, n_records).round(0),  # Log-normal for a realistic wage distribution
        'is_multiple_registration': np.random.choice([True, False], n_records, p=[0.3, 0.7]),
        'age_at_application': np.random.normal(28, 4, n_records).round(0).clip(22, 45),
        'years_since_application': np.random.choice([0, 1, 2, 3], n_records),
        'full_time_ind': np.random.choice([True, False], n_records, p=[0.85, 0.15]),
        'employer_worksite_same_state': np.random.choice([True, False], n_records, p=[0.7, 0.3]),
        'employer_sk': [f'EMP_{i%500}' for i in range(n_records)],
        'beneficiary_sk': [f'BEN_{i}' for i in range(n_records)],
        'job_sk': [f'JOB_{i%300}' for i in range(n_records)]
    })

    # DuckDB's replacement scan picks up the local DataFrame by name
    conn.execute("CREATE TABLE fct_h1b_applications AS SELECT * FROM sample_facts")

    return conn

# Load data
conn = get_db_connection()

def load_facts_data():
    """Load H1B applications fact table"""
    try:
        query = """
            SELECT * FROM fct_h1b_applications
            WHERE wage_amt IS NOT NULL
            LIMIT 10000
        """
        return conn.execute(query).df()
    except Exception as e:
        print(f"Error loading facts data: {e}")
        return pd.DataFrame()

# Load the facts data
facts_df = load_facts_data()

# ---- FACTS TABLE VISUALIZATIONS ----

def facts_overview():
    """Overview of the facts table with key metrics"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No facts data available")

    # Key metrics
    total_records = len(facts_df)
    avg_wage = facts_df['wage_amt'].mean()
    median_wage = facts_df['wage_amt'].median()
    multiple_reg_pct = (facts_df['is_multiple_registration'].sum() / len(facts_df)) * 100

    # Create metrics dashboard
    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "indicator"}, {"type": "indicator"}],
               [{"type": "indicator"}, {"type": "indicator"}]],
        subplot_titles=("Total Records", "Average Wage", "Median Wage", "Multiple Registration %")
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=total_records,
            number={"valueformat": ","},
            title={"text": "Total Records"}
        ),
        row=1, col=1
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=avg_wage,
            number={"prefix": "$", "valueformat": ",.0f"},
            title={"text": "Average Wage"}
        ),
        row=1, col=2
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=median_wage,
            number={"prefix": "$", "valueformat": ",.0f"},
            title={"text": "Median Wage"}
        ),
        row=2, col=1
    )

    fig.add_trace(
        go.Indicator(
            mode="number",
            value=multiple_reg_pct,
            number={"suffix": "%", "valueformat": ".1f"},
            title={"text": "Multiple Registrations"}
        ),
        row=2, col=2
    )

    fig.update_layout(
        height=400,
        title_text="H1B Facts Table - Key Metrics"
    )

    return fig

def wage_distribution():
    """Visualize wage distribution from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "histogram"}, {"type": "box"}]],
        subplot_titles=("Wage Distribution", "Wage Distribution (Box Plot)")
    )

    # Histogram
    fig.add_trace(
        go.Histogram(
            x=facts_df['wage_amt'],
            nbinsx=50,
            marker_color='skyblue',
            opacity=0.7,
            name='Wage Distribution'
        ),
        row=1, col=1
    )

    # Box plot
    fig.add_trace(
        go.Box(
            y=facts_df['wage_amt'],
            marker_color='lightcoral',
            name='Wage Box Plot'
        ),
        row=1, col=2
    )

    fig.update_layout(
        height=500,
        title_text="Wage Analysis from Facts Table",
        showlegend=False
    )

    fig.update_xaxes(title_text="Wage Amount ($)", row=1, col=1)
    fig.update_yaxes(title_text="Frequency", row=1, col=1)
    fig.update_yaxes(title_text="Wage Amount ($)", row=1, col=2)

    return fig

def country_analysis():
    """Analyze country distribution from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Country counts
    country_counts = facts_df['country_of_birth'].value_counts().head(10)

    # Average wage by country
    country_wages = facts_df.groupby('country_of_birth')['wage_amt'].agg(['mean', 'count']).reset_index()
    country_wages = country_wages[country_wages['count'] >= 50].nlargest(8, 'mean')  # Min 50 applications

    fig = make_subplots(
        rows=1, cols=2,
        specs=[[{"type": "bar"}, {"type": "bar"}]],
        subplot_titles=("Applications by Country", "Average Wage by Country (Min 50 apps)")
    )

    # Applications by country
    fig.add_trace(
        go.Bar(
            x=country_counts.index,
            y=country_counts.values,
            marker_color='teal',
            text=country_counts.values,
            textposition='auto',
            name='Application Count'
        ),
        row=1, col=1
    )

    # Average wage by country
    fig.add_trace(
        go.Bar(
            x=country_wages['country_of_birth'],
            y=country_wages['mean'],
            marker_color='orange',
            text=['$' + f"{x:,.0f}" for x in country_wages['mean']],
            textposition='auto',
            name='Average Wage'
        ),
        row=1, col=2
    )

    fig.update_layout(
        height=500,
        title_text="Country Analysis from Facts Table",
        showlegend=False
    )

    fig.update_xaxes(tickangle=45, row=1, col=1)
    fig.update_xaxes(tickangle=45, row=1, col=2)
    fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
    fig.update_yaxes(title_text="Average Wage ($)", row=1, col=2)

    return fig

def temporal_analysis():
    """Analyze temporal patterns from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Yearly trends
    yearly_stats = facts_df.groupby('fiscal_year').agg({
        'record_id': 'count',
        'wage_amt': 'mean',
        'is_multiple_registration': 'mean'
    }).reset_index()

    yearly_stats['multiple_reg_pct'] = yearly_stats['is_multiple_registration'] * 100

    fig = make_subplots(
        rows=2, cols=1,
        specs=[[{"secondary_y": True}], [{"type": "bar"}]],
        subplot_titles=("Applications and Average Wage by Year", "Multiple Registration Percentage by Year")
    )

    # Applications count
    fig.add_trace(
        go.Scatter(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['record_id'],
            mode='lines+markers',
            name='Applications',
            line=dict(color='blue', width=3),
            marker=dict(size=8)
        ),
        row=1, col=1
    )

    # Average wage, routed to the secondary y-axis declared in specs
    fig.add_trace(
        go.Scatter(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['wage_amt'],
            mode='lines+markers',
            name='Average Wage',
            line=dict(color='red', width=3),
            marker=dict(size=8)
        ),
        row=1, col=1,
        secondary_y=True
    )

    # Multiple registration percentage
    fig.add_trace(
        go.Bar(
            x=yearly_stats['fiscal_year'],
            y=yearly_stats['multiple_reg_pct'],
            marker_color='green',
            text=[f"{x:.1f}%" for x in yearly_stats['multiple_reg_pct']],
            textposition='auto',
            name='Multiple Registration %'
        ),
        row=2, col=1
    )

    # Update layout
    fig.update_layout(
        height=600,
        title_text="Temporal Analysis from Facts Table"
    )

    # Update axes
    fig.update_yaxes(title_text="Number of Applications", row=1, col=1)
    fig.update_yaxes(title_text="Average Wage ($)", secondary_y=True, row=1, col=1)
    fig.update_yaxes(title_text="Multiple Registration (%)", row=2, col=1)
    fig.update_xaxes(title_text="Fiscal Year", row=2, col=1)

    return fig

def demographic_analysis():
    """Analyze demographic patterns from facts table"""
    if facts_df.empty:
        return go.Figure().update_layout(title="No data available")

    # Age distribution
    age_bins = pd.cut(facts_df['age_at_application'], bins=range(20, 50, 5), right=False)
    age_counts = age_bins.value_counts().sort_index()

    # Full-time vs Part-time
    employment_type = facts_df['full_time_ind'].value_counts()
    employment_labels = ['Full-time' if x else 'Part-time' for x in employment_type.index]

    # Same state employment
    same_state = facts_df['employer_worksite_same_state'].value_counts()
    same_state_labels = ['Same State' if x else 'Different State' for x in same_state.index]

    fig = make_subplots(
        rows=2, cols=2,
        specs=[[{"type": "bar"}, {"type": "pie"}],
               [{"type": "pie"}, {"type": "histogram"}]],
        subplot_titles=("Age Distribution", "Employment Type", "Employer-Worksite Location", "Years Since Application")
    )

    # Age distribution
    fig.add_trace(
        go.Bar(
            x=[str(interval) for interval in age_counts.index],
            y=age_counts.values,
            marker_color='lightblue',
            name='Age Distribution'
        ),
        row=1, col=1
    )

    # Employment type pie chart
    fig.add_trace(
        go.Pie(
            labels=employment_labels,
            values=employment_type.values,
            name="Employment Type"
        ),
        row=1, col=2
    )

    # Same state pie chart
    fig.add_trace(
        go.Pie(
            labels=same_state_labels,
            values=same_state.values,
            name="Location"
        ),
        row=2, col=1
    )

    # Years since application
    fig.add_trace(
        go.Histogram(
            x=facts_df['years_since_application'],
            nbinsx=10,
            marker_color='lightgreen',
            name='Years Since Application'
        ),
        row=2, col=2
    )

    fig.update_layout(
        height=600,
        title_text="Demographic Analysis from Facts Table",
        showlegend=False
    )

    return fig

def facts_data_table():
    """Display a sample of the facts table data"""
    if facts_df.empty:
        return pd.DataFrame()

    # Return first 100 rows with key columns
    display_cols = [
        'record_id', 'lottery_year', 'fiscal_year', 'country_of_birth',
        'wage_amt', 'age_at_application', 'is_multiple_registration',
        'full_time_ind', 'employer_worksite_same_state'
    ]

    sample_data = facts_df[display_cols].head(100).copy()

    # Format wage column
    sample_data['wage_amt'] = sample_data['wage_amt'].apply(lambda x: f"${x:,.0f}")

    return sample_data

# ---- GRADIO INTERFACE ----

with gr.Blocks(theme=gr.themes.Soft(), title="H1B Facts Table Analytics") as demo:
    gr.Markdown("# 📊 H1B Facts Table Analytics Dashboard")
    gr.Markdown("### Comprehensive Analysis of H1B Applications Facts Data")

    with gr.Tab("📈 Facts Overview"):
        gr.Markdown("### Key Metrics from Facts Table")
        facts_overview_plot = gr.Plot()
        gr.Button("Load Facts Overview", variant="primary").click(
            fn=facts_overview,
            outputs=facts_overview_plot
        )

    with gr.Tab("💰 Wage Analysis"):
        gr.Markdown("### Wage Distribution from Facts Table")
        wage_plot = gr.Plot()
        gr.Button("Analyze Wages", variant="primary").click(
            fn=wage_distribution,
            outputs=wage_plot
        )

    with gr.Tab("🌍 Country Analysis"):
        gr.Markdown("### Country-wise Analysis from Facts Table")
        country_plot = gr.Plot()
        gr.Button("Analyze Countries", variant="primary").click(
            fn=country_analysis,
            outputs=country_plot
        )

    with gr.Tab("📅 Temporal Analysis"):
        gr.Markdown("### Time-based Trends from Facts Table")
        temporal_plot = gr.Plot()
        gr.Button("Analyze Trends", variant="primary").click(
            fn=temporal_analysis,
            outputs=temporal_plot
        )

    with gr.Tab("👥 Demographics"):
        gr.Markdown("### Demographic Patterns from Facts Table")
        demo_plot = gr.Plot()
        gr.Button("Analyze Demographics", variant="primary").click(
            fn=demographic_analysis,
            outputs=demo_plot
        )

    with gr.Tab("📋 Raw Data"):
        gr.Markdown("### Sample Facts Table Data (First 100 rows)")
        data_table = gr.DataFrame()
        gr.Button("Load Sample Data", variant="primary").click(
            fn=facts_data_table,
            outputs=data_table
        )

    # Footer
    gr.Markdown("---")
    gr.Markdown("### Facts Table Schema")
    gr.Markdown("""
    **Table**: `fct_h1b_applications`

    **Key Columns**:
    - `record_id`: Unique identifier for each application
    - `lottery_year`, `fiscal_year`: Temporal dimensions
    - `country_of_birth`: Beneficiary country
    - `wage_amt`: Offered wage amount
    - `age_at_application`: Beneficiary age
    - `is_multiple_registration`: Multiple lottery entries flag
    - `full_time_ind`: Full-time employment indicator
    - `employer_worksite_same_state`: Location alignment flag
    - Foreign keys: `employer_sk`, `beneficiary_sk`, `job_sk`
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        show_error=True
    )
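Since load_facts_data caps results at 10,000 rows, the same facts table can be sanity-checked directly from a Python shell. A minimal sketch, assuming ./data/h1bs_analytics.duckdb exists (otherwise create_sample_data() from app.py can stand in):

    import duckdb

    # Read-only, matching how app.py opens the file
    con = duckdb.connect("./data/h1bs_analytics.duckdb", read_only=True)
    top_countries = con.execute("""
        SELECT country_of_birth,
               COUNT(*)      AS applications,
               AVG(wage_amt) AS avg_wage
        FROM fct_h1b_applications
        WHERE wage_amt IS NOT NULL
        GROUP BY country_of_birth
        ORDER BY applications DESC
        LIMIT 5
    """).df()
    print(top_countries)
    con.close()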
model_and_load_toduckdb.py
ADDED
|
@@ -0,0 +1,973 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
H1B Data Analytics Pipeline
|
| 3 |
+
|
| 4 |
+
This module provides a comprehensive ETL pipeline for processing H1B visa application data.
|
| 5 |
+
It loads CSV files into DuckDB, creates dimensional models, and performs data quality checks.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import gc
|
| 10 |
+
import logging
|
| 11 |
+
import hashlib
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from typing import List, Optional, Tuple
|
| 14 |
+
import traceback
|
| 15 |
+
|
| 16 |
+
import duckdb
|
| 17 |
+
import pandas as pd
|
| 18 |
+
import numpy as np
|
| 19 |
+
import psutil
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class H1BDataPipeline:
|
| 23 |
+
"""
|
| 24 |
+
Main pipeline class for processing H1B visa application data.
|
| 25 |
+
|
| 26 |
+
This class handles the complete ETL process including:
|
| 27 |
+
- Loading CSV files into DuckDB
|
| 28 |
+
- Creating dimensional models
|
| 29 |
+
- Data quality checks
|
| 30 |
+
- Database persistence
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(self, db_path: str = ':memory:', log_level: int = logging.INFO):
|
| 34 |
+
"""
|
| 35 |
+
Initialize the H1B data pipeline.
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
db_path: Path to DuckDB database file. Use ':memory:' for in-memory database.
|
| 39 |
+
log_level: Logging level for the pipeline.
|
| 40 |
+
"""
|
| 41 |
+
self.db_path = db_path
|
| 42 |
+
self.conn = None
|
| 43 |
+
self.logger = self._setup_logging(log_level)
|
| 44 |
+
self._setup_database()
|
| 45 |
+
|
| 46 |
+
def _setup_logging(self, log_level: int) -> logging.Logger:
|
| 47 |
+
"""Set up logging configuration for the pipeline."""
|
| 48 |
+
logger = logging.getLogger(__name__)
|
| 49 |
+
logging.basicConfig(
|
| 50 |
+
level=log_level,
|
| 51 |
+
format="{asctime} - {name} - {levelname} - {message}",
|
| 52 |
+
style="{",
|
| 53 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
| 54 |
+
)
|
| 55 |
+
return logger
|
| 56 |
+
|
| 57 |
+
def _setup_database(self) -> None:
|
| 58 |
+
"""Initialize DuckDB connection."""
|
| 59 |
+
try:
|
| 60 |
+
self.conn = duckdb.connect(self.db_path)
|
| 61 |
+
self.logger.info(f"DuckDB connection established to {self.db_path}")
|
| 62 |
+
self.logger.info(f"DuckDB version: {duckdb.__version__}")
|
| 63 |
+
|
| 64 |
+
# Test connection
|
| 65 |
+
test_result = self.conn.execute("SELECT 'Hello DuckDB!' as message").fetchone()
|
| 66 |
+
self.logger.info(f"Connection test: {test_result[0]}")
|
| 67 |
+
|
| 68 |
+
except Exception as e:
|
| 69 |
+
self.logger.error(f"Failed to establish database connection: {e}")
|
| 70 |
+
raise
|
| 71 |
+
|
| 72 |
+
def __enter__(self):
|
| 73 |
+
"""Context manager entry."""
|
| 74 |
+
return self
|
| 75 |
+
|
| 76 |
+
def close(self) -> None:
|
| 77 |
+
"""Close database connection and cleanup resources."""
|
| 78 |
+
if self.conn:
|
| 79 |
+
self.conn.close()
|
| 80 |
+
self.logger.info("Database connection closed")
|
| 81 |
+
|
| 82 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
| 83 |
+
"""Context manager exit with cleanup."""
|
| 84 |
+
self.close()
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class MemoryManager:
|
| 88 |
+
"""Utility class for monitoring and managing memory usage."""
|
| 89 |
+
|
| 90 |
+
@staticmethod
|
| 91 |
+
def check_memory_usage() -> float:
|
| 92 |
+
"""
|
| 93 |
+
Check current memory usage of the process.
|
| 94 |
+
|
| 95 |
+
Returns:
|
| 96 |
+
Memory usage in MB.
|
| 97 |
+
"""
|
| 98 |
+
process = psutil.Process(os.getpid())
|
| 99 |
+
memory_mb = process.memory_info().rss / 1024 / 1024
|
| 100 |
+
print(f"Current memory usage: {memory_mb:.1f} MB")
|
| 101 |
+
return memory_mb
|
| 102 |
+
|
| 103 |
+
@staticmethod
|
| 104 |
+
def clear_memory() -> None:
|
| 105 |
+
"""Force garbage collection to clear memory."""
|
| 106 |
+
gc.collect()
|
| 107 |
+
print("Memory cleared")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class FileValidator:
|
| 111 |
+
"""Utility class for validating file existence and accessibility."""
|
| 112 |
+
|
| 113 |
+
@staticmethod
|
| 114 |
+
def validate_files(file_paths: List[str]) -> Tuple[List[str], List[str]]:
|
| 115 |
+
"""
|
| 116 |
+
Validate that files exist and are accessible.
|
| 117 |
+
|
| 118 |
+
Args:
|
| 119 |
+
file_paths: List of file paths to validate.
|
| 120 |
+
|
| 121 |
+
Returns:
|
| 122 |
+
Tuple of (existing_files, missing_files).
|
| 123 |
+
"""
|
| 124 |
+
existing_files = []
|
| 125 |
+
missing_files = []
|
| 126 |
+
|
| 127 |
+
for file_path in file_paths:
|
| 128 |
+
if os.path.exists(file_path):
|
| 129 |
+
existing_files.append(file_path)
|
| 130 |
+
print(f"✓ Found: {file_path}")
|
| 131 |
+
else:
|
| 132 |
+
missing_files.append(file_path)
|
| 133 |
+
print(f"✗ Missing: {file_path}")
|
| 134 |
+
|
| 135 |
+
return existing_files, missing_files
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class DataLoader:
|
| 139 |
+
"""Handles loading data from various sources into DuckDB."""
|
| 140 |
+
|
| 141 |
+
def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
|
| 142 |
+
"""
|
| 143 |
+
Initialize data loader.
|
| 144 |
+
|
| 145 |
+
Args:
|
| 146 |
+
conn: DuckDB connection object.
|
| 147 |
+
logger: Logger instance for tracking operations.
|
| 148 |
+
"""
|
| 149 |
+
self.conn = conn
|
| 150 |
+
self.logger = logger
|
| 151 |
+
|
| 152 |
+
def load_csv_files(self, file_paths: List[str]) -> None:
|
| 153 |
+
"""
|
| 154 |
+
Load CSV files directly into DuckDB without loading into pandas first.
|
| 155 |
+
|
| 156 |
+
Args:
|
| 157 |
+
file_paths: List of CSV file paths to load.
|
| 158 |
+
"""
|
| 159 |
+
self.logger.info("Loading CSV files directly into DuckDB...")
|
| 160 |
+
|
| 161 |
+
for file_path in file_paths:
|
| 162 |
+
try:
|
| 163 |
+
self._load_single_csv(file_path)
|
| 164 |
+
except Exception as e:
|
| 165 |
+
self.logger.error(f"Error loading {file_path}: {e}")
|
| 166 |
+
|
| 167 |
+
def _load_single_csv(self, file_path: str) -> None:
|
| 168 |
+
"""
|
| 169 |
+
Load a single CSV file into DuckDB.
|
| 170 |
+
|
| 171 |
+
Args:
|
| 172 |
+
file_path: Path to the CSV file.
|
| 173 |
+
"""
|
| 174 |
+
self.logger.info(f"Loading {file_path}")
|
| 175 |
+
|
| 176 |
+
# Extract metadata from filename
|
| 177 |
+
filename = file_path.split('/')[-1].replace('.csv', '')
|
| 178 |
+
table_name = f"raw_{filename}"
|
| 179 |
+
fiscal_year = self._extract_fiscal_year(filename)
|
| 180 |
+
|
| 181 |
+
# Load CSV directly into DuckDB
|
| 182 |
+
self.conn.execute(f"""
|
| 183 |
+
CREATE TABLE {table_name} AS
|
| 184 |
+
SELECT *,
|
| 185 |
+
'{file_path}' as source_file,
|
| 186 |
+
'{fiscal_year}' as fiscal_year
|
| 187 |
+
FROM read_csv_auto('{file_path}', header=true, normalize_names=true, ignore_errors=true)
|
| 188 |
+
""")
|
| 189 |
+
|
| 190 |
+
# Clean column names
|
| 191 |
+
self._clean_column_names(table_name)
|
| 192 |
+
|
| 193 |
+
# Log success
|
| 194 |
+
count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
|
| 195 |
+
self.logger.info(f"Loaded {count:,} records from {file_path} into {table_name}")
|
| 196 |
+
|
| 197 |
+
def _extract_fiscal_year(self, filename: str) -> str:
|
| 198 |
+
"""Extract fiscal year from filename."""
|
| 199 |
+
import re
|
| 200 |
+
match = re.search(r'FY(\d{4})', filename)
|
| 201 |
+
if match:
|
| 202 |
+
return match.group(1) # Return only the year digits
|
| 203 |
+
return "unknown"
|
| 204 |
+
|
| 205 |
+
def _clean_column_names(self, table_name: str) -> None:
|
| 206 |
+
"""
|
| 207 |
+
Clean column names in DuckDB table.
|
| 208 |
+
|
| 209 |
+
Args:
|
| 210 |
+
table_name: Name of the table to clean.
|
| 211 |
+
"""
|
| 212 |
+
columns_query = f"PRAGMA table_info('{table_name}')"
|
| 213 |
+
columns_info = self.conn.execute(columns_query).fetchall()
|
| 214 |
+
|
| 215 |
+
for col_info in columns_info:
|
| 216 |
+
old_name = col_info[1]
|
| 217 |
+
new_name = self._normalize_column_name(old_name)
|
| 218 |
+
|
| 219 |
+
if old_name != new_name:
|
| 220 |
+
self.conn.execute(f"""
|
| 221 |
+
ALTER TABLE {table_name}
|
| 222 |
+
RENAME COLUMN "{old_name}" TO {new_name}
|
| 223 |
+
""")
|
| 224 |
+
|
| 225 |
+
@staticmethod
|
| 226 |
+
def _normalize_column_name(column_name: str) -> str:
|
| 227 |
+
"""
|
| 228 |
+
Normalize column name to follow consistent naming convention.
|
| 229 |
+
|
| 230 |
+
Args:
|
| 231 |
+
column_name: Original column name.
|
| 232 |
+
|
| 233 |
+
Returns:
|
| 234 |
+
Normalized column name.
|
| 235 |
+
"""
|
| 236 |
+
import re
|
| 237 |
+
|
| 238 |
+
# Remove URLs and other problematic patterns
|
| 239 |
+
normalized = re.sub(r'https?://[^\s]+', '', str(column_name))
|
| 240 |
+
normalized = re.sub(r'[^\w\s]', '_', normalized) # Replace special chars with underscore
|
| 241 |
+
normalized = re.sub(r'\s+', '_', normalized) # Replace spaces with underscore
|
| 242 |
+
normalized = re.sub(r'_+', '_', normalized) # Replace multiple underscores with single
|
| 243 |
+
normalized = normalized.lower().strip('_') # Lowercase and trim underscores
|
| 244 |
+
|
| 245 |
+
# Ensure it starts with letter or underscore
|
| 246 |
+
if normalized and not (normalized[0].isalpha() or normalized[0] == '_'):
|
| 247 |
+
normalized = f'col_{normalized}'
|
| 248 |
+
|
| 249 |
+
return normalized if normalized else 'unnamed_column'
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
class DataTransformer:
|
| 253 |
+
"""Handles data transformation and dimensional modeling."""
|
| 254 |
+
|
| 255 |
+
def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
|
| 256 |
+
"""
|
| 257 |
+
Initialize data transformer.
|
| 258 |
+
|
| 259 |
+
Args:
|
| 260 |
+
conn: DuckDB connection object.
|
| 261 |
+
logger: Logger instance for tracking operations.
|
| 262 |
+
"""
|
| 263 |
+
self.conn = conn
|
| 264 |
+
self.logger = logger
|
| 265 |
+
|
| 266 |
+
def create_combined_table(self) -> None:
|
| 267 |
+
"""Create a combined table from all raw tables in DuckDB."""
|
| 268 |
+
self.logger.info("Creating combined table in DuckDB...")
|
| 269 |
+
|
| 270 |
+
# Get list of raw tables
|
| 271 |
+
raw_tables = self.conn.execute("""
|
| 272 |
+
SELECT table_name
|
| 273 |
+
FROM information_schema.tables
|
| 274 |
+
WHERE table_name LIKE 'raw_%'
|
| 275 |
+
""").fetchall()
|
| 276 |
+
|
| 277 |
+
if not raw_tables:
|
| 278 |
+
raise ValueError("No raw tables found")
|
| 279 |
+
|
| 280 |
+
# Create union query
|
| 281 |
+
union_parts = [f"SELECT * FROM {table_info[0]}" for table_info in raw_tables]
|
| 282 |
+
union_query = " UNION ALL ".join(union_parts)
|
| 283 |
+
|
| 284 |
+
# Create combined table
|
| 285 |
+
self.conn.execute(f"""
|
| 286 |
+
CREATE TABLE combined_data AS
|
| 287 |
+
{union_query}
|
| 288 |
+
""")
|
| 289 |
+
|
| 290 |
+
count = self.conn.execute("SELECT COUNT(*) FROM combined_data").fetchone()[0]
|
| 291 |
+
self.logger.info(f"Created combined table with {count:,} records")
|
| 292 |
+
|
| 293 |
+
def remove_columns_with_missing_data(self, table_name: str, threshold: float = 0.8) -> List[str]:
|
| 294 |
+
"""
|
| 295 |
+
Remove columns with high missing data directly in DuckDB.
|
| 296 |
+
|
| 297 |
+
Args:
|
| 298 |
+
table_name: Name of the table to clean.
|
| 299 |
+
threshold: Threshold for missing data ratio (0.0 to 1.0).
|
| 300 |
+
|
| 301 |
+
Returns:
|
| 302 |
+
List of columns that were kept.
|
| 303 |
+
"""
|
| 304 |
+
self.logger.info(f"Removing columns with >{threshold*100}% missing data from {table_name}...")
|
| 305 |
+
|
| 306 |
+
total_rows = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
|
| 307 |
+
columns_info = self.conn.execute(f"PRAGMA table_info('{table_name}')").fetchall()
|
| 308 |
+
|
| 309 |
+
columns_to_keep = []
|
| 310 |
+
columns_removed = []
|
| 311 |
+
|
| 312 |
+
for col_info in columns_info:
|
| 313 |
+
col_name = col_info[1]
|
| 314 |
+
col_type = col_info[2] # Get column type for better handling
|
| 315 |
+
|
| 316 |
+
try:
|
| 317 |
+
# Handle different column types appropriately
|
| 318 |
+
if col_type.upper() in ['INTEGER', 'BIGINT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'NUMERIC']:
|
| 319 |
+
# For numeric columns, only check for NULL
|
| 320 |
+
non_null_count = self.conn.execute(f"""
|
| 321 |
+
SELECT COUNT(*)
|
| 322 |
+
FROM {table_name}
|
| 323 |
+
WHERE "{col_name}" IS NOT NULL
|
| 324 |
+
""").fetchone()[0]
|
| 325 |
+
else:
|
| 326 |
+
# For text columns, check for NULL and empty strings
|
| 327 |
+
non_null_count = self.conn.execute(f"""
|
| 328 |
+
SELECT COUNT(*)
|
| 329 |
+
FROM {table_name}
|
| 330 |
+
WHERE "{col_name}" IS NOT NULL
|
| 331 |
+
AND TRIM(CAST("{col_name}" AS VARCHAR)) != ''
|
| 332 |
+
""").fetchone()[0]
|
| 333 |
+
|
| 334 |
+
missing_ratio = 1 - (non_null_count / total_rows)
|
| 335 |
+
|
| 336 |
+
self.logger.debug(f"Column {col_name}: {non_null_count}/{total_rows} non-null ({missing_ratio:.2%} missing)")
|
| 337 |
+
|
| 338 |
+
if missing_ratio <= threshold:
|
| 339 |
+
columns_to_keep.append(col_name)
|
| 340 |
+
else:
|
| 341 |
+
columns_removed.append(col_name)
|
| 342 |
+
self.logger.info(f"Removing column {col_name} with {missing_ratio:.2%} missing data")
|
| 343 |
+
|
| 344 |
+
except Exception as e:
|
| 345 |
+
self.logger.warning(f"Error processing column {col_name}: {e}")
|
| 346 |
+
# When in doubt, keep the column
|
| 347 |
+
columns_to_keep.append(col_name)
|
| 348 |
+
|
| 349 |
+
if columns_removed:
|
| 350 |
+
self.logger.info(f"Removing {len(columns_removed)} columns with high missing data")
|
| 351 |
+
self._recreate_table_with_columns(table_name, columns_to_keep)
|
| 352 |
+
|
| 353 |
+
return columns_to_keep
|
| 354 |
+
|
| 355 |
+
def _recreate_table_with_columns(self, table_name: str, columns_to_keep: List[str]) -> None:
|
| 356 |
+
"""
|
| 357 |
+
Recreate table with only specified columns.
|
| 358 |
+
|
| 359 |
+
Args:
|
| 360 |
+
table_name: Original table name.
|
| 361 |
+
columns_to_keep: List of column names to retain.
|
| 362 |
+
"""
|
| 363 |
+
columns_str = ', '.join(columns_to_keep)
|
| 364 |
+
self.conn.execute(f"""
|
| 365 |
+
CREATE TABLE {table_name}_clean AS
|
| 366 |
+
SELECT {columns_str}
|
| 367 |
+
FROM {table_name}
|
| 368 |
+
""")
|
| 369 |
+
|
| 370 |
+
self.conn.execute(f"DROP TABLE {table_name}")
|
| 371 |
+
self.conn.execute(f"ALTER TABLE {table_name}_clean RENAME TO {table_name}")
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
class DimensionalModeler:
|
| 375 |
+
"""Creates dimensional model tables for analytics."""
|
| 376 |
+
|
| 377 |
+
def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
|
| 378 |
+
"""
|
| 379 |
+
Initialize dimensional modeler.
|
| 380 |
+
|
| 381 |
+
Args:
|
| 382 |
+
conn: DuckDB connection object.
|
| 383 |
+
logger: Logger instance for tracking operations.
|
| 384 |
+
"""
|
| 385 |
+
self.conn = conn
|
| 386 |
+
self.logger = logger
|
| 387 |
+
|
| 388 |
+
def create_all_dimensions(self) -> None:
|
| 389 |
+
"""Create all dimension tables."""
|
| 390 |
+
self._prepare_cleaned_data()
|
| 391 |
+
self._create_beneficiary_dimension()
|
| 392 |
+
self._create_employer_dimension()
|
| 393 |
+
self._create_job_dimension()
|
| 394 |
+
self._create_agent_dimension()
|
| 395 |
+
self._create_status_dimension()
|
| 396 |
+
self._create_date_dimension()
|
| 397 |
+
|
| 398 |
+
def _prepare_cleaned_data(self) -> None:
|
| 399 |
+
"""Prepare cleaned data table for dimension creation."""
|
| 400 |
+
self.logger.info("Preparing cleaned data...")
|
| 401 |
+
# First, check what columns actually exist in combined_data
|
| 402 |
+
columns_info = self.conn.execute("PRAGMA table_info('combined_data')").fetchall()
|
| 403 |
+
available_columns = [col[1] for col in columns_info]
|
| 404 |
+
self.logger.info(f"Available columns in combined_data: {available_columns}")
|
| 405 |
+
self.conn.execute("""
|
| 406 |
+
CREATE TABLE cleaned_data AS
|
| 407 |
+
SELECT
|
| 408 |
+
ROW_NUMBER() OVER () as original_row_id,
|
| 409 |
+
*
|
| 410 |
+
FROM combined_data
|
| 411 |
+
""")
|
| 412 |
+
|
| 413 |
+
def _create_beneficiary_dimension(self) -> None:
|
| 414 |
+
"""Create beneficiary dimension table."""
|
| 415 |
+
self.logger.info("Creating dim_beneficiary...")
|
| 416 |
+
self.conn.execute("""
|
| 417 |
+
CREATE TABLE dim_beneficiary AS
|
| 418 |
+
SELECT DISTINCT
|
| 419 |
+
ROW_NUMBER() OVER () as beneficiary_key,
|
| 420 |
+
MD5(CONCAT(
|
| 421 |
+
COALESCE(country_of_birth, ''), '|',
|
| 422 |
+
COALESCE(country_of_nationality, ''), '|',
|
| 423 |
+
COALESCE(CAST(ben_year_of_birth AS VARCHAR), ''), '|',
|
| 424 |
+
COALESCE(gender, '')
|
| 425 |
+
)) as beneficiary_id,
|
| 426 |
+
country_of_birth,
|
| 427 |
+
country_of_nationality,
|
| 428 |
+
ben_year_of_birth,
|
| 429 |
+
gender,
|
| 430 |
+
ben_sex,
|
| 431 |
+
ben_country_of_birth,
|
| 432 |
+
ben_current_class,
|
| 433 |
+
ben_education_code,
|
| 434 |
+
ed_level_definition,
|
| 435 |
+
ben_pfield_of_study
|
| 436 |
+
FROM cleaned_data
|
| 437 |
+
WHERE country_of_birth IS NOT NULL
|
| 438 |
+
OR country_of_nationality IS NOT NULL
|
| 439 |
+
OR ben_year_of_birth IS NOT NULL
|
| 440 |
+
OR gender IS NOT NULL
|
| 441 |
+
""")
|
| 442 |
+
|
| 443 |
+
def _create_employer_dimension(self) -> None:
|
| 444 |
+
"""Create employer dimension table."""
|
| 445 |
+
self.logger.info("Creating dim_employer...")
|
| 446 |
+
self.conn.execute("""
|
| 447 |
+
CREATE TABLE dim_employer AS
|
| 448 |
+
SELECT DISTINCT
|
| 449 |
+
ROW_NUMBER() OVER () as employer_key,
|
| 450 |
+
MD5(CONCAT(
|
| 451 |
+
COALESCE(employer_name, ''), '|',
|
| 452 |
+
COALESCE(fein, '')
|
| 453 |
+
)) as employer_id,
|
| 454 |
+
employer_name,
|
| 455 |
+
fein,
|
| 456 |
+
mail_addr,
|
| 457 |
+
city,
|
| 458 |
+
state,
|
| 459 |
+
zip
|
| 460 |
+
FROM cleaned_data
|
| 461 |
+
WHERE employer_name IS NOT NULL OR fein IS NOT NULL
|
| 462 |
+
""")
|
| 463 |
+
|
| 464 |
+
def _create_job_dimension(self) -> None:
|
| 465 |
+
"""Create job dimension table."""
|
| 466 |
+
self.logger.info("Creating dim_job...")
|
| 467 |
+
self.conn.execute("""
|
| 468 |
+
CREATE TABLE dim_job AS
|
| 469 |
+
SELECT DISTINCT
|
| 470 |
+
ROW_NUMBER() OVER () as job_key,
|
| 471 |
+
MD5(CONCAT(
|
| 472 |
+
COALESCE(job_title, ''), '|',
|
| 473 |
+
COALESCE(naics_code, '')
|
| 474 |
+
)) as job_id,
|
| 475 |
+
job_title,
|
| 476 |
+
dot_code,
|
| 477 |
+
naics_code,
|
| 478 |
+
wage_amt,
|
| 479 |
+
wage_unit,
|
| 480 |
+
full_time_ind,
|
| 481 |
+
ben_comp_paid,
|
| 482 |
+
worksite_city,
|
| 483 |
+
worksite_state
|
| 484 |
+
FROM cleaned_data
|
| 485 |
+
WHERE job_title IS NOT NULL OR naics_code IS NOT NULL
|
| 486 |
+
""")
|
| 487 |
+
|
| 488 |
+
def _create_agent_dimension(self) -> None:
|
| 489 |
+
"""Create agent dimension table."""
|
| 490 |
+
self.logger.info("Creating dim_agent...")
|
| 491 |
+
self.conn.execute("""
|
| 492 |
+
CREATE TABLE dim_agent AS
|
| 493 |
+
SELECT DISTINCT
|
| 494 |
+
ROW_NUMBER() OVER () as agent_key,
|
| 495 |
+
MD5(CONCAT(
|
| 496 |
+
COALESCE(agent_first_name, ''), '|',
|
| 497 |
+
COALESCE(agent_last_name, '')
|
| 498 |
+
)) as agent_id,
|
| 499 |
+
agent_first_name,
|
| 500 |
+
agent_last_name
|
| 501 |
+
FROM cleaned_data
|
| 502 |
+
WHERE agent_first_name IS NOT NULL OR agent_last_name IS NOT NULL
|
| 503 |
+
""")
|
| 504 |
+
|
| 505 |
+
def _create_status_dimension(self) -> None:
|
| 506 |
+
"""Create status dimension table."""
|
| 507 |
+
self.logger.info("Creating dim_status...")
|
| 508 |
+
self.conn.execute("""
|
| 509 |
+
CREATE TABLE dim_status AS
|
| 510 |
+
SELECT DISTINCT
|
| 511 |
+
ROW_NUMBER() OVER () as status_key,
|
| 512 |
+
status_type,
|
| 513 |
+
first_decision
|
| 514 |
+
FROM cleaned_data
|
| 515 |
+
WHERE status_type IS NOT NULL OR first_decision IS NOT NULL
|
| 516 |
+
""")
|
| 517 |
+
|
| 518 |
+
def _create_date_dimension(self) -> None:
|
| 519 |
+
"""Create date dimension table."""
|
| 520 |
+
self.logger.info("Creating dim_date...")
|
| 521 |
+
self.conn.execute("""
|
| 522 |
+
CREATE TABLE dim_date AS
|
| 523 |
+
WITH all_dates AS (
|
| 524 |
+
-- Handle MM/DD/YYYY format
|
| 525 |
+
SELECT TRY_STRPTIME(rec_date, '%m/%d/%Y') as date_value
|
| 526 |
+
FROM cleaned_data
|
| 527 |
+
WHERE rec_date IS NOT NULL
|
| 528 |
+
AND rec_date NOT LIKE '%(%'
|
| 529 |
+
AND LENGTH(rec_date) >= 8
|
| 530 |
+
AND rec_date ~ '^[0-9/-]+$'
|
| 531 |
+
AND TRY_STRPTIME(rec_date, '%m/%d/%Y') IS NOT NULL
|
| 532 |
+
|
| 533 |
+
UNION
|
| 534 |
+
|
| 535 |
+
-- Handle YYYY-MM-DD format
|
| 536 |
+
SELECT TRY_STRPTIME(rec_date, '%Y-%m-%d') as date_value
|
| 537 |
+
FROM cleaned_data
|
| 538 |
+
WHERE rec_date IS NOT NULL
|
| 539 |
+
AND rec_date NOT LIKE '%(%'
|
| 540 |
+
AND LENGTH(rec_date) >= 8
|
| 541 |
+
AND rec_date ~ '^[0-9-]+$'
|
| 542 |
+
AND TRY_STRPTIME(rec_date, '%Y-%m-%d') IS NOT NULL
|
| 543 |
+
|
| 544 |
+
UNION
|
| 545 |
+
|
| 546 |
+
-- Handle first_decision_date MM/DD/YYYY format
|
| 547 |
+
SELECT TRY_STRPTIME(first_decision_date, '%m/%d/%Y') as date_value
|
| 548 |
+
FROM cleaned_data
|
| 549 |
+
WHERE first_decision_date IS NOT NULL
|
| 550 |
+
AND first_decision_date NOT LIKE '%(%'
|
| 551 |
+
AND LENGTH(first_decision_date) >= 8
|
| 552 |
+
AND first_decision_date ~ '^[0-9/-]+$'
|
| 553 |
+
AND TRY_STRPTIME(first_decision_date, '%m/%d/%Y') IS NOT NULL
|
| 554 |
+
|
| 555 |
+
UNION
|
| 556 |
+
|
| 557 |
+
-- Handle first_decision_date YYYY-MM-DD format
|
| 558 |
+
SELECT TRY_STRPTIME(first_decision_date, '%Y-%m-%d') as date_value
|
| 559 |
+
FROM cleaned_data
|
| 560 |
+
WHERE first_decision_date IS NOT NULL
|
| 561 |
+
AND first_decision_date NOT LIKE '%(%'
|
| 562 |
+
AND LENGTH(first_decision_date) >= 8
|
| 563 |
+
AND first_decision_date ~ '^[0-9-]+$'
|
| 564 |
+
AND TRY_STRPTIME(first_decision_date, '%Y-%m-%d') IS NOT NULL
|
| 565 |
+
|
| 566 |
+
UNION
|
| 567 |
+
|
| 568 |
+
-- Handle valid_from MM/DD/YYYY format
|
| 569 |
+
SELECT TRY_STRPTIME(valid_from, '%m/%d/%Y') as date_value
|
| 570 |
+
FROM cleaned_data
|
| 571 |
+
WHERE valid_from IS NOT NULL
|
| 572 |
+
AND valid_from NOT LIKE '%(%'
|
| 573 |
+
AND LENGTH(valid_from) >= 8
|
| 574 |
+
AND valid_from ~ '^[0-9/-]+$'
|
| 575 |
+
AND TRY_STRPTIME(valid_from, '%m/%d/%Y') IS NOT NULL
|
| 576 |
+
|
| 577 |
+
UNION
|
| 578 |
+
|
| 579 |
+
-- Handle valid_from YYYY-MM-DD format
|
| 580 |
+
SELECT TRY_STRPTIME(valid_from, '%Y-%m-%d') as date_value
|
| 581 |
+
FROM cleaned_data
|
| 582 |
+
WHERE valid_from IS NOT NULL
|
| 583 |
+
AND valid_from NOT LIKE '%(%'
|
| 584 |
+
AND LENGTH(valid_from) >= 8
|
| 585 |
+
AND valid_from ~ '^[0-9-]+$'
|
| 586 |
+
AND TRY_STRPTIME(valid_from, '%Y-%m-%d') IS NOT NULL
|
| 587 |
+
|
| 588 |
+
UNION
|
| 589 |
+
|
| 590 |
+
-- Handle valid_to MM/DD/YYYY format
|
| 591 |
+
SELECT TRY_STRPTIME(valid_to, '%m/%d/%Y') as date_value
|
| 592 |
+
FROM cleaned_data
|
| 593 |
+
WHERE valid_to IS NOT NULL
|
| 594 |
+
AND valid_to NOT LIKE '%(%'
|
| 595 |
+
AND LENGTH(valid_to) >= 8
|
| 596 |
+
AND valid_to ~ '^[0-9/]+$'
|
| 597 |
+
AND valid_to LIKE '%/%/%'
|
| 598 |
+
AND TRY_STRPTIME(valid_to, '%m/%d/%Y') IS NOT NULL
|
| 599 |
+
|
| 600 |
+
UNION
|
| 601 |
+
|
| 602 |
+
-- Handle valid_to YYYY-MM-DD format
|
| 603 |
+
SELECT TRY_STRPTIME(valid_to, '%Y-%m-%d') as date_value
|
| 604 |
+
FROM cleaned_data
|
| 605 |
+
WHERE valid_to IS NOT NULL
|
| 606 |
+
AND valid_to NOT LIKE '%(%'
|
| 607 |
+
AND LENGTH(valid_to) >= 8
|
| 608 |
+
AND valid_to ~ '^[0-9-]+$'
|
| 609 |
+
AND TRY_STRPTIME(valid_to, '%Y-%m-%d') IS NOT NULL
|
| 610 |
+
)
|
| 611 |
+
SELECT DISTINCT
|
| 612 |
+
date_value as date,
|
| 613 |
+
EXTRACT(YEAR FROM date_value) as year,
|
| 614 |
+
EXTRACT(MONTH FROM date_value) as month,
|
| 615 |
+
EXTRACT(QUARTER FROM date_value) as quarter,
|
| 616 |
+
EXTRACT(DOW FROM date_value) as day_of_week,
|
| 617 |
+
MONTHNAME(date_value) as month_name,
|
| 618 |
+
'Q' || CAST(EXTRACT(QUARTER FROM date_value) AS VARCHAR) as quarter_name,
|
| 619 |
+
CASE
|
| 620 |
+
WHEN EXTRACT(MONTH FROM date_value) >= 10
|
| 621 |
+
THEN EXTRACT(YEAR FROM date_value)
|
| 622 |
+
ELSE EXTRACT(YEAR FROM date_value) - 1
|
| 623 |
+
END as fiscal_year
|
| 624 |
+
FROM all_dates
|
| 625 |
+
WHERE date_value IS NOT NULL
|
| 626 |
+
ORDER BY date_value
|
| 627 |
+
""")
|
| 628 |
+
|
| 629 |
+
    def create_fact_table(self) -> None:
        """Create the fact table with foreign keys."""
        self.logger.info("Creating fact table in DuckDB...")

        self.conn.execute("""
            CREATE TABLE fact_h1b_applications AS
            SELECT
                ROW_NUMBER() OVER () as record_id,

                COALESCE(db.beneficiary_key, -1) as beneficiary_key,
                COALESCE(de.employer_key, -1) as employer_key,
                COALESCE(dj.job_key, -1) as job_key,
                COALESCE(da.agent_key, -1) as agent_key,
                COALESCE(ds.status_key, -1) as status_key,

                -- Handle multiple date formats for rec_date
                CASE
                    WHEN cd.rec_date IS NOT NULL AND cd.rec_date NOT LIKE '%(%'
                    THEN CASE
                        WHEN TRY_STRPTIME(cd.rec_date, '%m/%d/%Y') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%m/%d/%Y')) AS INTEGER)
                        WHEN TRY_STRPTIME(cd.rec_date, '%Y-%m-%d') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.rec_date, '%Y-%m-%d')) AS INTEGER)
                        ELSE NULL
                    END
                    ELSE NULL
                END as rec_date_key,

                -- Handle multiple date formats for first_decision_date
                CASE
                    WHEN cd.first_decision_date IS NOT NULL AND cd.first_decision_date NOT LIKE '%(%'
                    THEN CASE
                        WHEN TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%m/%d/%Y')) AS INTEGER)
                        WHEN TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d') IS NOT NULL
                        THEN CAST(STRFTIME('%Y%m%d', TRY_STRPTIME(cd.first_decision_date, '%Y-%m-%d')) AS INTEGER)
                        ELSE NULL
                    END
                    ELSE NULL
                END as first_decision_date_key,

                cd.lottery_year,
                cd.ben_multi_reg_ind,
                cd.receipt_number,
                cd.source_file,
                cd.fiscal_year

            FROM cleaned_data cd

            LEFT JOIN dim_beneficiary db ON
                cd.original_row_id = db.beneficiary_key AND
                COALESCE(cd.country_of_birth, '') = COALESCE(db.country_of_birth, '') AND
                COALESCE(cd.country_of_nationality, '') = COALESCE(db.country_of_nationality, '') AND
                COALESCE(cd.ben_year_of_birth, '0') = COALESCE(db.ben_year_of_birth, '0') AND
                COALESCE(cd.gender, '') = COALESCE(db.gender, '')

            LEFT JOIN dim_employer de ON
                cd.original_row_id = de.employer_key AND
                COALESCE(cd.employer_name, '') = COALESCE(de.employer_name, '') AND
                COALESCE(cd.fein, '') = COALESCE(de.fein, '')

            LEFT JOIN dim_job dj ON
                cd.original_row_id = dj.job_key AND
                COALESCE(cd.job_title, '') = COALESCE(dj.job_title, '') AND
                COALESCE(cd.naics_code, '') = COALESCE(dj.naics_code, '')

            LEFT JOIN dim_agent da ON
                cd.original_row_id = da.agent_key AND
                COALESCE(cd.agent_first_name, '') = COALESCE(da.agent_first_name, '') AND
                COALESCE(cd.agent_last_name, '') = COALESCE(da.agent_last_name, '')

            LEFT JOIN dim_status ds ON
                cd.original_row_id = ds.status_key AND
                COALESCE(cd.status_type, '') = COALESCE(ds.status_type, '') AND
                COALESCE(cd.first_decision, '') = COALESCE(ds.first_decision, '')
        """)

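    # The two *_date_key columns are integer surrogates in YYYYMMDD form, so a
    # fact row joins back to the date dimension with a cast, e.g. (a sketch,
    # assuming the date dimension is named dim_date):
    #
    #   SELECT f.record_id, d.fiscal_year
    #   FROM fact_h1b_applications f
    #   JOIN dim_date d
    #     ON f.rec_date_key = CAST(STRFTIME('%Y%m%d', d.date) AS INTEGER)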
    def create_lookup_tables(self) -> None:
        """Create lookup tables for reference data."""
        self.logger.info("Creating lookup tables in DuckDB...")

        # Country codes lookup
        # (VALUES must be parenthesized when used in DuckDB's FROM clause)
        self.conn.execute("""
            CREATE TABLE lookup_country_codes AS
            SELECT * FROM (VALUES
                ('IND', 'India', 'Asia'),
                ('CHN', 'China', 'Asia'),
                ('KOR', 'South Korea', 'Asia'),
                ('CAN', 'Canada', 'North America'),
                ('NPL', 'Nepal', 'Asia'),
                ('USA', 'United States', 'North America')
            ) AS t(country_code, country_name, region)
        """)

        # Education levels
        self.conn.execute("""
            CREATE TABLE lookup_education_levels AS
            SELECT * FROM (VALUES
                ('A', 'No Diploma', 'Basic'),
                ('B', 'High School', 'Basic'),
                ('C', 'Some College', 'Undergraduate'),
                ('D', 'College No Degree', 'Undergraduate'),
                ('E', 'Associates', 'Undergraduate'),
                ('F', 'Bachelors', 'Undergraduate'),
                ('G', 'Masters', 'Graduate'),
                ('H', 'Professional', 'Graduate'),
                ('I', 'Doctorate', 'Graduate')
            ) AS t(education_code, education_level, education_category)
        """)

        # Application status types
        self.conn.execute("""
            CREATE TABLE lookup_status_types AS
            SELECT * FROM (VALUES
                ('ELIGIBLE', 'Application is eligible for lottery', 'Lottery'),
                ('SELECTED', 'Selected in H-1B lottery', 'Lottery'),
                ('CREATED', 'Application record created', 'Administrative')
            ) AS t(status_type, status_description, status_category)
        """)


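# Lookup tables like these enrich queries at read time, e.g. (a sketch;
# dim_beneficiary carries the raw country codes used in the fact-table joins):
#
#   SELECT lc.region, COUNT(*) AS applications
#   FROM fact_h1b_applications f
#   JOIN dim_beneficiary db ON f.beneficiary_key = db.beneficiary_key
#   LEFT JOIN lookup_country_codes lc ON db.country_of_birth = lc.country_code
#   GROUP BY lc.region
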
class DatabaseOptimizer:
    """Handles database optimization and indexing."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize database optimizer.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def create_indexes(self) -> None:
        """Create indexes for better query performance."""
        self.logger.info("Creating indexes in DuckDB...")

        indexes = [
            ("idx_fact_beneficiary", "fact_h1b_applications", "beneficiary_key"),
            ("idx_fact_employer", "fact_h1b_applications", "employer_key"),
            ("idx_fact_job", "fact_h1b_applications", "job_key"),
            ("idx_fact_lottery_year", "fact_h1b_applications", "lottery_year"),
            ("idx_fact_fiscal_year", "fact_h1b_applications", "fiscal_year"),
            ("idx_fact_rec_date", "fact_h1b_applications", "rec_date_key"),
            ("idx_dim_beneficiary_id", "dim_beneficiary", "beneficiary_id"),
            ("idx_dim_employer_id", "dim_employer", "employer_id"),
            ("idx_dim_job_id", "dim_job", "job_id"),
        ]

        for index_name, table_name, column_name in indexes:
            try:
                self.conn.execute(f"CREATE INDEX {index_name} ON {table_name}({column_name})")
            except Exception as e:
                self.logger.warning(f"Could not create index {index_name}: {e}")

        self.logger.info("Indexes created successfully!")


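# Note: DuckDB's indexes are ART indexes. They speed up highly selective point
# lookups and enforce uniqueness, while large analytical scans and joins rely
# on DuckDB's built-in min-max (zonemap) pruning, so the indexes above mainly
# pay off for key-lookup queries such as (a sketch):
#
#   conn.execute("SELECT * FROM fact_h1b_applications WHERE record_id = ?", [42])
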
class DataQualityChecker:
    """Performs data quality checks and validation."""

    def __init__(self, conn: duckdb.DuckDBPyConnection, logger: logging.Logger):
        """
        Initialize data quality checker.

        Args:
            conn: DuckDB connection object.
            logger: Logger instance for tracking operations.
        """
        self.conn = conn
        self.logger = logger

    def run_all_checks(self) -> bool:
        """
        Run all data quality checks.

        Returns:
            True if the checks complete without error, False otherwise.
        """
        self.logger.info("Running data quality checks...")

        try:
            self._check_table_counts()
            self._check_fact_table_integrity()
            return True
        except Exception as e:
            self.logger.error(f"Error in data quality checks: {e}")
            return False

    def _check_table_counts(self) -> None:
        """Check row counts for all tables."""
        tables_query = """
            SELECT table_name, estimated_size as row_count
            FROM duckdb_tables()
            WHERE schema_name = 'main'
            ORDER BY table_name
        """
        tables_info = self.conn.execute(tables_query).fetchall()

        self.logger.info("Table row counts:")
        for table_name, _ in tables_info:
            if not table_name.startswith('raw_'):
                count = self.conn.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]
                self.logger.info(f"  {table_name}: {count:,} records")

    def _check_fact_table_integrity(self) -> None:
        """Check fact table for duplicates and integrity."""
        dup_check = self.conn.execute("""
            SELECT COUNT(*) as total_records,
                   COUNT(DISTINCT record_id) as unique_records
            FROM fact_h1b_applications
        """).fetchone()

        self.logger.info(f"Fact table: {dup_check[0]:,} total records, {dup_check[1]:,} unique records")


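# The built-in checks only log their findings; a stricter variant would raise
# (flipping run_all_checks to False) when an invariant breaks. A sketch of a
# hypothetical extra check, not part of the pipeline:
#
#   def _check_no_orphan_employer_keys(self) -> None:
#       orphans = self.conn.execute("""
#           SELECT COUNT(*) FROM fact_h1b_applications f
#           LEFT JOIN dim_employer de ON f.employer_key = de.employer_key
#           WHERE f.employer_key <> -1 AND de.employer_key IS NULL
#       """).fetchone()[0]
#       if orphans:
#           raise ValueError(f"{orphans:,} fact rows reference a missing employer")
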
class DatabasePersistence:
    """Handles database persistence operations."""

    def __init__(self, logger: logging.Logger):
        """
        Initialize database persistence handler.

        Args:
            logger: Logger instance for tracking operations.
        """
        self.logger = logger

    def save_to_persistent_database(self, source_conn: duckdb.DuckDBPyConnection,
                                    target_path: str) -> None:
        """
        Save tables to a persistent database file.

        Args:
            source_conn: Source database connection.
            target_path: Path to the target persistent database file.
        """
        self.logger.info(f"Saving to persistent database: {target_path}")

        # Remove existing file if it exists
        if os.path.exists(target_path):
            os.remove(target_path)
            self.logger.info(f"Removed existing database file: {target_path}")

        # Create persistent database connection
        with duckdb.connect(target_path) as persistent_conn:
            # Get tables to copy (exclude raw and intermediate staging tables)
            tables_to_keep = source_conn.execute("""
                SELECT table_name
                FROM information_schema.tables
                WHERE table_name NOT LIKE 'raw_%'
                  AND table_name NOT IN ('combined_data', 'cleaned_data')
                  AND table_schema = 'main'
            """).fetchall()

            # Copy each table through a pandas DataFrame; DuckDB's replacement
            # scan lets the CTAS statement read the local variable `df` by name.
            for table_info in tables_to_keep:
                table_name = table_info[0]
                df = source_conn.execute(f"SELECT * FROM {table_name}").df()
                persistent_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
                self.logger.info(f"Copied table {table_name} to persistent database")

        self.logger.info(f"Persistent database saved to: {target_path}")


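# A lighter-weight alternative to the pandas round trip above is to ATTACH the
# target file and copy tables entirely inside DuckDB, e.g. (a sketch, not what
# the pipeline does):
#
#   source_conn.execute(f"ATTACH '{target_path}' AS target")
#   source_conn.execute(f"CREATE TABLE target.{table_name} AS SELECT * FROM {table_name}")
#   source_conn.execute("DETACH target")
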
# Configuration and Constants
class Config:
    """Configuration class for the H1B data pipeline."""

    CSV_FILES = [
        './data/TRK_13139_FY2021.csv',
        './data/TRK_13139_FY2022.csv',
        './data/TRK_13139_FY2023.csv',
        './data/TRK_13139_FY2024_single_reg.csv',
        './data/TRK_13139_FY2024_multi_reg.csv'
    ]

    XLSX_FILE = './data/TRK_13139_I129_H1B_Registrations_FY21_FY24_FOIA_FIN.xlsx'
    PERSISTENT_DB_PATH = './data/h1bs_analytics.duckdb'
    # Columns missing in more than this fraction of rows are dropped.
    MISSING_DATA_THRESHOLD = 0.99


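# The paths in Config are relative to the working directory; callers can
# repoint them without editing the class, e.g. (a sketch):
#
#   Config.PERSISTENT_DB_PATH = '/tmp/h1bs_analytics.duckdb'
#   main()
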
def main():
    """Main execution function for the H1B data pipeline."""
    print("Starting H1B Data Analytics Pipeline...")
    print("All imports successful!")

    # Check memory usage
    MemoryManager.check_memory_usage()

    # Validate input files
    existing_files, missing_files = FileValidator.validate_files(Config.CSV_FILES)
    if missing_files:
        print(f"Warning: {len(missing_files)} files are missing")

    # Run the pipeline
    try:
        with H1BDataPipeline() as pipeline:
            # Load data
            data_loader = DataLoader(pipeline.conn, pipeline.logger)
            data_loader.load_csv_files(existing_files)

            # Transform data
            transformer = DataTransformer(pipeline.conn, pipeline.logger)
            transformer.create_combined_table()
            kept_columns = transformer.remove_columns_with_missing_data(
                'combined_data', Config.MISSING_DATA_THRESHOLD
            )
            print(f"Kept {len(kept_columns)} columns after cleaning")

            # Create dimensional model
            modeler = DimensionalModeler(pipeline.conn, pipeline.logger)
            modeler.create_all_dimensions()
            modeler.create_fact_table()
            modeler.create_lookup_tables()

            # Optimize database
            optimizer = DatabaseOptimizer(pipeline.conn, pipeline.logger)
            optimizer.create_indexes()

            # Run quality checks
            quality_checker = DataQualityChecker(pipeline.conn, pipeline.logger)
            quality_checker.run_all_checks()

            # Save to persistent database
            persistence = DatabasePersistence(pipeline.logger)
            persistence.save_to_persistent_database(
                pipeline.conn, Config.PERSISTENT_DB_PATH
            )

            # Final memory check
            MemoryManager.check_memory_usage()
            MemoryManager.clear_memory()

            print("Pipeline completed successfully!")

    except Exception as e:
        print(f"Pipeline failed with error: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()
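# Once the pipeline has run, the persisted star schema is queryable from any
# process, e.g. (a sketch):
#
#   import duckdb
#   con = duckdb.connect('./data/h1bs_analytics.duckdb', read_only=True)
#   print(con.execute("""
#       SELECT lottery_year, COUNT(*) AS registrations
#       FROM fact_h1b_applications
#       GROUP BY lottery_year
#       ORDER BY lottery_year
#   """).df())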