Mandr1 committed on
Commit
ffeba1e
·
verified ·
1 Parent(s): 6406cf5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -0
app.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import matplotlib.pyplot as plt
3
+ import pandas as pd
4
+ import joblib
5
+ # from pyspark.sql import SparkSession # No longer needed for inference
6
+ # from pyspark.sql.functions import col, max as spark_max # No longer needed for inference
7
+ from pyspark.sql.types import StringType, IntegerType, StructType, StructField # Still needed for schema definition if Spark is used elsewhere in the app.py, but not for this specific prediction path.
8
+
9
+ # ==================================================
10
+ # BAGIAN 0: INITIAL SETUP & LOAD SAVED MODELS
11
+ # ==================================================
12
print("Loading saved scikit-learn models and preprocessor...")

# Load the saved preprocessor object from the working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only ship .pkl files
# that come from a trusted training run.
try:
    preprocessor = joblib.load('preprocessor.pkl')
except FileNotFoundError:
    print("❌ Error: 'preprocessor.pkl' not found. Please ensure it's in the same directory.")
    # Abort start-up with a non-zero status.  The bare ``exit()`` helper is
    # injected by the ``site`` module for interactive use and would have
    # reported success (status 0) to the host process.
    raise SystemExit(1)
else:
    print("✅ Preprocessor loaded successfully.")
21
+
22
# Load the trained scikit-learn models that the UI lets the user choose from.
try:
    lr_model = joblib.load('lr_model.pkl')
    dt_model = joblib.load('dt_model.pkl')
    rf_model = joblib.load('rf_model.pkl')
except FileNotFoundError:
    print("❌ Error: One or more model .pkl files not found. Please ensure they are in the same directory.")
    # Non-zero status so the host sees the failed start-up (``exit()`` with no
    # argument exits with status 0 and is only meant for interactive use).
    raise SystemExit(1)

# Display name (shown in the dropdown) -> fitted estimator.
loaded_models = {
    'Linear Regression': lr_model,
    'Decision Tree': dt_model,
    'Random Forest': rf_model,
}
print("✅ Scikit-learn models loaded successfully.")
36
+
37
# Load and clean job_salary_mean.csv with pandas; the result is the benchmark
# table that the analysis function filters by job title and location.
try:
    pd_df_raw = pd.read_csv('job_salary_mean.csv')
except FileNotFoundError:
    print("❌ Error: 'job_salary_mean.csv' not found. Please ensure it's in the same directory.")
    # Non-zero status so the host sees the failed start-up (``exit()`` with no
    # argument exits with status 0 and is only meant for interactive use).
    raise SystemExit(1)

# Shorten the Indonesian column headers to the names used by the rest of the app.
pd_df_clean = pd_df_raw.rename(columns={
    "Judul Pekerjaan": "judul",
    "Perusahaan": "perusahaan",
    "Lokasi": "lokasi",
    "Gaji_Rata2": "gaji",
})
# Lower-cased copies are what the substring searches run against.
pd_df_clean['judul_clean'] = pd_df_clean['judul'].str.lower()
pd_df_clean['lokasi_clean'] = pd_df_clean['lokasi'].str.lower()
# Drop every row that has a missing value in any column.
pd_df_clean = pd_df_clean.dropna()
print(f"✅ Pandas DataFrame for benchmarks loaded and cleaned. Total rows: {len(pd_df_clean)}")
53
+
54
+ # ==================================================
55
+ # BAGIAN 6 (FINAL): DASHBOARD DENGAN DATABASE WILAYAH RESMI
56
+ # ==================================================
57
+
58
+ # ---------------------------------------------------------
59
+ # A. PERSIAPAN MASTER DATA WILAYAH (Dari File CSV Baru)
60
+ # ---------------------------------------------------------
61
print("Sedang memproses Database Wilayah Indonesia...")

# 1. Read the regency/city (kabupaten/kota) dataset and build the lookup
#    dictionary ``kamus_wilayah``: cleaned city name -> province name.
try:
    geo_df = pd.read_csv('dataset kabupaten indonesia.csv')
except FileNotFoundError:
    print("❌ ERROR: File 'dataset kabupaten indonesia.csv' tidak ditemukan. Upload dulu!")
    # Keep the app running without the region file; with an empty dictionary
    # every location lookup simply finds no city.
    kamus_wilayah = {}
else:
    # Rename the columns for clarity: 'name' -> 'kota', 'Unnamed: 3' -> 'provinsi'.
    geo_df = geo_df[['name', 'Unnamed: 3']].rename(
        columns={'name': 'kota', 'Unnamed: 3': 'provinsi'}
    )
    # Rows without a city or province must go first: ``astype(str)`` would
    # otherwise turn NaN into the literal key/value 'nan', and a 'nan' key
    # matches any user input containing those letters (e.g. "pinang").
    geo_df = geo_df.dropna(subset=['kota', 'provinsi']).copy()

    # Clean the city name: strip the leading "KABUPATEN " / "KOTA " and
    # lower-case, e.g. "KABUPATEN ACEH BARAT" -> "aceh barat".  The pattern is
    # anchored so only the administrative prefix is removed, never a word in
    # the middle of the name.
    geo_df['kota_clean'] = (
        geo_df['kota'].astype(str)
        .str.strip()
        .str.replace(r'^(?:KABUPATEN|KOTA)\s+', '', case=False, regex=True)
        .str.lower()
        .str.strip()
    )
    geo_df['provinsi'] = geo_df['provinsi'].astype(str).str.upper().str.strip()

    # An empty key would be a substring of every input, so leave it out.
    geo_valid = geo_df[geo_df['kota_clean'] != '']

    # Lookup dictionary, e.g. {'aceh barat': 'ACEH', 'surabaya': 'JAWA TIMUR', ...}.
    # For duplicate city names the last row wins.
    kamus_wilayah = dict(zip(geo_valid['kota_clean'], geo_valid['provinsi']))

    print(f"✅ Berhasil memuat {len(kamus_wilayah)} wilayah administrasi Indonesia.")
84
+
85
+ # 2. Mapping Provinsi ke Pulau (Logic Tambahan)
86
# Island group -> keywords that identify its provinces.  Order matters: the
# first group with a keyword contained in the province name wins.
_PULAU_KEYWORDS = (
    ("PULAU JAWA", ('JAWA', 'DKI', 'JAKARTA', 'BANTEN', 'YOGYAKARTA')),
    ("PULAU SUMATERA", ('SUMATERA', 'SUMATRA', 'ACEH', 'RIAU', 'JAMBI',
                        'BENGKULU', 'LAMPUNG', 'BANGKA')),
    ("PULAU KALIMANTAN", ('KALIMANTAN',)),
    ("PULAU SULAWESI", ('SULAWESI', 'GORONTALO')),
    ("BALI & NUSA TENGGARA", ('BALI', 'NUSA TENGGARA')),
    ("PAPUA & MALUKU", ('PAPUA', 'MALUKU')),
)


def get_pulau_from_provinsi(provinsi):
    """Map a province name to the island group it belongs to.

    The match is a case-insensitive substring test against a fixed keyword
    table, so "Kepulauan Riau" and "RIAU" both resolve to "PULAU SUMATERA".

    Args:
        provinsi: Province name in any letter case.

    Returns:
        The island-group label, or "INDONESIA (LAINNYA)" when no keyword
        matches or *provinsi* is not a string (e.g. ``None``).
    """
    if not isinstance(provinsi, str):
        return "INDONESIA (LAINNYA)"

    p = provinsi.upper()
    for pulau, keywords in _PULAU_KEYWORDS:
        if any(keyword in p for keyword in keywords):
            return pulau
    return "INDONESIA (LAINNYA)"
95
+
96
+ # ---------------------------------------------------------
97
+ # B. FUNGSI CERDAS: DETEKSI LOKASI USER
98
+ # ---------------------------------------------------------
99
def _normalisasi_lokasi(teks):
    """Lower-case *teks*, turn punctuation into spaces and collapse whitespace."""
    karakter = (ch if ch.isalnum() else " " for ch in str(teks).lower())
    return " ".join("".join(karakter).split())


def deteksi_info_lokasi(input_user):
    """Detect the province and island group mentioned in a free-text location.

    The input is compared against the city names in the module-level
    ``kamus_wilayah`` dictionary.  Whole-word matches are preferred over plain
    substring matches and, within each kind, the longest city name wins, so
    "Pemalang" resolves to Pemalang rather than to the shorter "malang"
    contained in it.

    Args:
        input_user: Text typed by the user, e.g. "Kota Malang, Jawa Timur".
            ``None`` is treated as empty text.

    Returns:
        A ``(provinsi, pulau)`` tuple.  ``provinsi`` is "INDONESIA" when no
        city is recognised; ``pulau`` comes from ``get_pulau_from_provinsi``.
    """
    text = "" if input_user is None else _normalisasi_lokasi(input_user)

    provinsi_terdeteksi = "INDONESIA"  # default when nothing matches

    if text:
        padded = f" {text} "
        cocok_kata_utuh = []
        cocok_sebagian = []
        for kota_db in kamus_wilayah:
            kunci = _normalisasi_lokasi(kota_db)
            if not kunci:
                # An empty key is a substring of everything; never match it.
                continue
            if f" {kunci} " in padded:
                cocok_kata_utuh.append(kota_db)
            elif kunci in text:
                # Plain substring hit, kept as a fallback so input that
                # matched before (e.g. words glued together) still matches.
                cocok_sebagian.append(kota_db)

        kandidat = cocok_kata_utuh or cocok_sebagian
        if kandidat:
            provinsi_terdeteksi = kamus_wilayah[max(kandidat, key=len)]

    pulau_terdeteksi = get_pulau_from_provinsi(provinsi_terdeteksi)
    return provinsi_terdeteksi, pulau_terdeteksi
113
+
114
+ # ---------------------------------------------------------
115
+ # C. FUNGSI ANALISIS UTAMA
116
+ # ---------------------------------------------------------
117
def _max_gaji_benchmark(kolom, daftar_kata):
    """Return the highest ``gaji`` among benchmark rows matching any keyword.

    A row of the module-level ``pd_df_clean`` matches when its *kolom* value
    contains one of *daftar_kata* as a literal substring.  Returns ``None``
    when no row matches.
    """
    cocok = pd.Series(False, index=pd_df_clean.index)
    for kata in daftar_kata:
        # regex=False: keywords come from user input (think "C++") and from
        # labels such as "INDONESIA (LAINNYA)"; parsed as patterns they would
        # raise or silently match the wrong text.
        cocok |= pd_df_clean[kolom].str.contains(kata, regex=False, na=False)
    gaji_cocok = pd_df_clean.loc[cocok, 'gaji']
    return None if gaji_cocok.empty else gaji_cocok.max()


def _buat_grafik_komparasi(labels, values, judul_grafik):
    """Build the three-bar comparison chart and return the Matplotlib figure."""
    # Create the Figure directly instead of through pyplot: pyplot keeps a
    # global reference to every figure it creates, which leaks one figure per
    # request in a long-running server.
    from matplotlib.figure import Figure

    plt.style.use('seaborn-v0_8-whitegrid')
    fig = Figure(figsize=(10, 5.5))
    ax = fig.subplots()

    colors = ['#0ea5e9', '#94a3b8', '#f59e0b']  # sky blue, grey, orange
    bars = ax.bar(labels, values, color=colors, edgecolor='black', alpha=0.9)

    # Dashed reference line at the user's estimate (the first value).
    ax.axhline(y=values[0], color='#0ea5e9', linestyle='--', linewidth=2, label="Posisi Anda")

    # Print each bar's value, in millions of rupiah, just above the bar.
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height + (height * 0.015),
                f'Rp {height/1000000:.1f} Jt',
                ha='center', va='bottom', fontweight='bold', fontsize=11)

    ax.set_title(judul_grafik, fontsize=14, fontweight='bold', pad=15)
    ax.set_ylabel("Gaji (Rupiah)")
    ax.grid(axis='y', linestyle='--', alpha=0.5)
    return fig


def analisis_gaji_final(judul_input, lokasi_input, model_choice):
    """Predict a salary and compare it with national and regional maxima.

    Args:
        judul_input: Job title typed by the user.
        lokasi_input: Regency/city typed by the user.
        model_choice: Key of ``loaded_models`` naming the model to use.

    Returns:
        A ``(html, figure)`` tuple for the Gradio HTML and Plot outputs.  When
        the model is unknown or prediction fails, ``html`` is an error
        heading and ``figure`` is ``None``.
    """
    import html  # function-scope import keeps this block self-contained

    # Gradio may hand over None for a cleared component.
    judul_input = "" if judul_input is None else str(judul_input)
    lokasi_input = "" if lokasi_input is None else str(lokasi_input)

    # 1. ML prediction with the selected scikit-learn model.
    model_pipeline = loaded_models.get(model_choice)
    if model_pipeline is None:
        pesan = html.escape(f"unknown model {model_choice!r}")
        return f"<h1>⚠️ Error during prediction: {pesan}</h1>", None

    input_df = pd.DataFrame({
        'judul_clean': [judul_input.lower()],
        'lokasi_clean': [lokasi_input.lower()],
        'perusahaan': ['unknown_company_for_prediction'],  # placeholder for 'perusahaan'
    })

    try:
        # A salary cannot be negative, so clamp the raw prediction at zero.
        prediksi_user = max(0.0, float(model_pipeline.predict(input_df)[0]))
    except Exception as e:  # UI boundary: report the failure instead of crashing
        return f"<h1>⚠️ Error during prediction: {html.escape(str(e))}</h1>", None

    # 2. Region detection.
    provinsi_found, pulau_found = deteksi_info_lokasi(lokasi_input)

    # 3. Benchmarks taken from pd_df_clean.
    # A. Highest salary among jobs whose title contains the input (national).
    max_gaji_job = _max_gaji_benchmark('judul_clean', [judul_input.lower()])
    if max_gaji_job is None:
        max_gaji_job = prediksi_user * 1.2  # fallback if no matching jobs found

    # B. Highest salary among locations mentioning the detected island group.
    #    Combined labels ("BALI & NUSA TENGGARA") are split so either part
    #    can match a location string.
    keyword_pencarian = pulau_found.replace("PULAU ", "").lower()  # e.g. "jawa", "sumatera"
    daftar_wilayah = [k.strip() for k in keyword_pencarian.split("&") if k.strip()]
    max_gaji_region = _max_gaji_benchmark('lokasi_clean', daftar_wilayah)
    if max_gaji_region is None:
        max_gaji_region = prediksi_user * 1.5  # fallback if no matching locations found

    # 4. Matplotlib visualisation.
    labels = [
        f"Estimasi Anda\n({lokasi_input})",
        f"Max Posisi '{judul_input}'\n(Nasional)",
        f"Max Regional\n({pulau_found})",
    ]
    values = [prediksi_user, max_gaji_job, max_gaji_region]
    fig = _buat_grafik_komparasi(
        labels,
        values,
        f"Analisis Gaji: {judul_input} @ {provinsi_found} (Model: {model_choice}) ",
    )

    # 5. HTML summary.  Everything that originates from user input or data
    #    files is escaped before it is embedded in markup.
    judul_html = html.escape(judul_input)
    provinsi_html = html.escape(provinsi_found)
    pulau_html = html.escape(pulau_found)
    html_output = f"""
    <div style="font-family: sans-serif; padding: 20px; border: 1px solid #e2e8f0; border-radius: 12px; background: linear-gradient(to right, #f8fafc, #ffffff);">
        <h2 style="color: #0f172a; margin-bottom: 5px;">💰 Estimasi: Rp {int(prediksi_user):,.0f}</h2>
        <span style="background-color: #e0f2fe; color: #0369a1; padding: 4px 10px; border-radius: 20px; font-size: 0.85em; font-weight: bold;">
            📍 {provinsi_html} / {pulau_html}
        </span>
        <p style="margin-top: 15px; color: #475569; line-height: 1.5;">
            Sistem mendeteksi lokasi Anda berada di provinsi <b>{provinsi_html}</b>.
            Berdasarkan data historis, standar gaji pasar untuk <b>{judul_html}</b> di wilayah ini adalah seperti di atas.
        </p>
        <div style="margin-top: 15px; padding: 10px; background-color: #fff7ed; border-left: 4px solid #f97316; color: #9a3412; font-size: 0.9em;">
            💡 <b>Insight Regional:</b> Batas atas gaji tertinggi (semua sektor) di {pulau_html} tercatat mencapai <b>Rp {int(max_gaji_region):,.0f}</b>.
        </div>
    </div>
    """

    return html_output, fig
199
+
200
+ # ---------------------------------------------------------
201
+ # D. INTERFACE GRADIO
202
+ # ---------------------------------------------------------
203
theme = gr.themes.Soft(primary_hue="cyan", secondary_hue="slate")

# NOTE(review): the nesting below is reconstructed from a flattened listing;
# the plot is assumed to sit below the input/result row -- confirm.
with gr.Blocks(theme=theme, title="Salary AI") as demo:
    gr.Markdown("# 🇮🇩 AI Salary Predictor & Geo-Intelligence")
    gr.Markdown("Prediksi gaji menggunakan Scikit-learn Models + Database Wilayah BPS Indonesia.")

    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            t1 = gr.Textbox(label="Posisi Pekerjaan", placeholder="Contoh: Guru, Driver, Manager")
            t2 = gr.Textbox(label="Kabupaten / Kota", placeholder="Contoh: Simeulue, Surakarta, Malang")
            model_selector = gr.Dropdown(
                label="Pilih Model Prediksi",
                choices=list(loaded_models.keys()),
                value='Decision Tree',  # default selected model
            )
            btn = gr.Button("🔍 Analisis Sekarang", variant="primary")
        # Right column: HTML summary of the analysis.
        with gr.Column():
            out_html = gr.HTML(label="Hasil Analisis")

    out_plot = gr.Plot(label="Grafik Komparasi")

    # Run the analysis on click and route its (html, figure) result to the outputs.
    btn.click(analisis_gaji_final, inputs=[t1, t2, model_selector], outputs=[out_html, out_plot])

# Launch only when run as a script, so importing this module (for tests or to
# reuse ``demo``) does not start a server.
if __name__ == "__main__":
    print("Menjalankan Aplikasi Final...")
    demo.launch(share=True, debug=True)