MuhammadHaris01 committed on
Commit
7cd7daf
·
verified ·
1 Parent(s): af3b896

Upload 4 files

Browse files
Files changed (4) hide show
  1. Analyze.py +335 -0
  2. Dockerfile +19 -0
  3. app.py +144 -0
  4. requirements.txt +5 -0
Analyze.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ analyze.py — FYP: Extracting Market Trends from Real-World Job Postings
3
+ Student: Muhammad Haris, BZU Multan
4
+
5
+ Run this script ONCE to generate precomputed_data.json
6
+ Command: python analyze.py
7
+
8
+ This script:
9
+ 1. Loads lukebarousse/data_jobs from HuggingFace
10
+ 2. Cleans and processes data with pandas
11
+ 3. Computes all metrics needed by the dashboard UI
12
+ 4. Saves everything to precomputed_data.json
13
+ """
14
+
15
+ import json
16
+ import ast
17
+ import pandas as pd
18
+ import numpy as np
19
+ from datasets import load_dataset
20
+ from datetime import datetime
21
+
22
# Start-of-run banner. The dash in the title is an em dash (U+2014); it had
# been mis-decoded into three unrelated characters.
print("=" * 60)
print("FYP Analysis Script — Muhammad Haris")
print("=" * 60)
25
+
26
+ # ─────────────────────────────────────────────
27
+ # 1. LOAD DATASET
28
+ # ─────────────────────────────────────────────
29
# Fetch the "train" split of the job-postings dataset and convert it to a
# pandas DataFrame; everything below works on `df`.
print("\n[1/6] Loading dataset from HuggingFace...")
dataset = load_dataset("lukebarousse/data_jobs", split="train")
df = dataset.to_pandas()

# Log the size and schema of what was loaded.
n_rows, n_cols = df.shape
column_names = list(df.columns)
print(f" Loaded {n_rows:,} rows, {n_cols} columns")
print(f" Columns: {column_names}")
34
+
35
+ # ─────────────────────────────────────────────
36
+ # 2. DATA CLEANING
37
+ # ─────────────────────────────────────────────
38
print("\n[2/6] Cleaning data...")

# A posting without a short title cannot be grouped, so discard it.
df = df.dropna(subset=["job_title_short"])

# Unparseable posting dates become NaT instead of raising.
df["job_posted_date"] = pd.to_datetime(df["job_posted_date"], errors="coerce")

# Coerce yearly salaries to numbers, then blank out anything outside the
# 20,000-600,000 range (both bounds inclusive) so it is ignored later.
yearly_salary = pd.to_numeric(df["salary_year_avg"], errors="coerce")
df["salary_year_avg"] = yearly_salary.where(yearly_salary.between(20000, 600000))

# Trim surrounding whitespace from titles (letter case is left as-is).
df["job_title_short"] = df["job_title_short"].str.strip()
54
+
55
# Parse job_skills — usually stored as the string representation of a list
def parse_skills(val):
    """Normalize one raw ``job_skills`` cell into a list of skill names.

    Args:
        val: The raw cell. Handled forms are a string holding a Python list
            literal (e.g. ``"['python', 'sql']"``), an already materialized
            list/tuple/NumPy array, or a missing value (``None``/NaN).

    Returns:
        A list of stripped, lower-cased skill strings. Missing values,
        unparseable strings, and literals that are not lists yield ``[]``.
        Non-string items inside a list are skipped.
    """
    # Containers are checked first: calling pd.isna() on a list or array
    # returns an array, and using that array in an `if` raises ValueError.
    if isinstance(val, (list, tuple, np.ndarray)):
        items = list(val)
    elif isinstance(val, str):
        try:
            # literal_eval only evaluates literals, never arbitrary code.
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
        if not isinstance(parsed, list):
            return []
        items = parsed
    else:
        # None, NaN, pd.NA and any other scalar carry no skills.
        return []

    return [s.strip().lower() for s in items if isinstance(s, str)]
68
+
69
# Turn every raw job_skills cell into a clean list of skill names.
df["skills_list"] = df["job_skills"].apply(parse_skills)

# Calendar columns for trend analysis.
posted_dates = df["job_posted_date"]
df["year"] = posted_dates.dt.year
df["month"] = posted_dates.dt.month
# strftime() leaves missing dates as NaN. Building this column with
# to_period("M").astype(str) would turn NaT into the literal string "NaT",
# which notna()/dropna() cannot detect and which would then show up as a
# bogus month in the trend outputs.
df["year_month"] = posted_dates.dt.strftime("%Y-%m")

print(f" After cleaning: {len(df):,} rows")
print(f" Salary data available for: {df['salary_year_avg'].notna().sum():,} rows")
print(f" Rows with skills: {(df['skills_list'].str.len() > 0).sum():,}")
79
+
80
+ # ─────────────────────────────────────────────
81
+ # 3. SUMMARY STATS (for Dashboard cards)
82
+ # ─────────────────────────────────────────────
83
print("\n[3/6] Computing summary statistics...")

total_jobs = int(len(df))

# mean() skips NaN, but is itself NaN when no row has a usable salary;
# int(NaN) raises ValueError, so fall back to 0 in that case.
mean_salary = df["salary_year_avg"].mean()
avg_salary = 0 if pd.isna(mean_salary) else int(mean_salary)
active_companies = int(df["company_name"].nunique())

# Market growth: postings in the last 3 months vs. the 3 months before that,
# both measured back from the newest posting date in the data.
growth_pct = 0.0
df_dated = df.dropna(subset=["job_posted_date"])
if len(df_dated) > 0:
    dated = df_dated["job_posted_date"]
    latest_date = dated.max()
    cutoff_recent = latest_date - pd.DateOffset(months=3)
    cutoff_older = latest_date - pd.DateOffset(months=6)

    recent = int((dated >= cutoff_recent).sum())
    older = int(((dated >= cutoff_older) & (dated < cutoff_recent)).sum())
    # With an empty comparison window there is no baseline; report 0.0.
    if older > 0:
        growth_pct = round((recent - older) / older * 100, 1)

summary_stats = {
    "total_jobs": total_jobs,
    "avg_salary": avg_salary,
    "active_companies": active_companies,
    "market_growth": f"{growth_pct:+.1f}%",
}
print(f" Total jobs: {total_jobs:,}")
print(f" Avg salary: ${avg_salary:,}")
print(f" Companies: {active_companies:,}")
print(f" Market growth: {growth_pct:+.1f}%")
115
+
116
+ # ─────────────────────────────────────────────
117
+ # 4. JOB TITLE ANALYSIS
118
+ # ─────────────────────────────────────────────
119
print("\n[4/6] Analyzing job titles...")

# Posting volume per title; value_counts() lists the most common first.
title_counts = df["job_title_short"].value_counts()
top_titles = title_counts.head(10)

# Ten most common titles with their share of all postings.
top_titles_list = [
    {
        "title": str(name),
        "count": int(n_postings),
        "pct": round(int(n_postings) / total_jobs * 100, 1),
    }
    for name, n_postings in top_titles.items()
]

# Salary statistics per title. Titles without any salary data produce NaN
# statistics and are dropped; the ten highest medians are kept.
per_title_salary = df.groupby("job_title_short")["salary_year_avg"].agg(
    ["mean", "median", "min", "max", "count"]
)
salary_by_title = (
    per_title_salary.round(0)
    .dropna()
    .sort_values("median", ascending=False)
    .head(10)
)

salary_by_title_list = [
    {
        "title": str(name),
        "avg": int(stats["mean"]),
        "median": int(stats["median"]),
        "min": int(stats["min"]),
        "max": int(stats["max"]),
        "count": int(stats["count"]),
    }
    for name, stats in salary_by_title.to_dict(orient="index").items()
]

print(f" Top title: {top_titles_list[0]['title']} ({top_titles_list[0]['count']:,} postings)")
153
+
154
+ # ─────────────────────────────────────────────
155
+ # 5. SKILLS ANALYSIS
156
+ # ─────────────────────────────────────────────
157
print("\n[5/6] Analyzing skills...")

# One row per (posting, skill). A posting with an empty skills list explodes
# to NaN and is removed by the length filter.
df_skills = df.explode("skills_list")
df_skills = df_skills[df_skills["skills_list"].str.len() > 0]
df_skills = df_skills.rename(columns={"skills_list": "skill"})

# Overall top skills by demand (% of all job postings that mention the skill)
skill_counts = df_skills["skill"].value_counts()
top_skills = skill_counts.head(20)

top_skills_list = [
    {
        "skill": str(s),
        "count": int(c),
        "pct": round(int(c) / total_jobs * 100, 1),
    }
    for s, c in top_skills.items()
]

# Skills by job title: the 8 most mentioned skills for each of the 6 most
# common titles, as a share of that title's postings.
skills_by_title = {}
for title in title_counts.head(6).index:
    title_df = df_skills[df_skills["job_title_short"] == title]
    title_skill_counts = title_df["skill"].value_counts().head(8)
    title_total = int(title_counts[title])
    skills_by_title[str(title)] = [
        {
            "skill": str(s),
            "count": int(c),
            "pct": round(int(c) / title_total * 100, 1),
        }
        for s, c in title_skill_counts.items()
    ]

# Salary vs. skill demand ("optimal" skills).
# "count" is the number of postings that mention the skill AND carry salary
# data; it is the sample size behind the median.
skill_salary = (
    df_skills.groupby("skill")["salary_year_avg"]
    .agg(["median", "count"])
    .dropna()
    .reset_index()
)
# Require at least 100 salaried postings. .copy() makes the filtered frame
# independent so the column assignments below do not write into a slice.
skill_salary = skill_salary[skill_salary["count"] >= 100].copy()
# Demand is the share of ALL postings that mention the skill, so it is taken
# from skill_counts. Dividing the salaried-only "count" by total_jobs would
# understate demand, because only part of the postings carry salary data.
skill_salary["demand_pct"] = (
    skill_salary["skill"].map(skill_counts) / total_jobs * 100
).round(1)
skill_salary["median_salary"] = skill_salary["median"].round(0).astype(int)
skill_salary = skill_salary.sort_values("median", ascending=False).head(20)

optimal_skills_list = [
    {
        "skill": str(row["skill"]),
        "median_salary": int(row["median_salary"]),
        "demand_pct": float(row["demand_pct"]),
        "count": int(row["count"]),
    }
    for _, row in skill_salary.iterrows()
]

print(f" Top skill: {top_skills_list[0]['skill']} ({top_skills_list[0]['pct']}% of postings)")
print(f" Unique skills found: {len(skill_counts):,}")
217
+
218
+ # ─────────────────────────────────────────────
219
+ # 5b. SKILL TRENDS OVER TIME
220
+ # ─────────────────────────────────────────────
221
print(" Computing skill trends over time...")

top_10_skills = [s["skill"] for s in top_skills_list[:10]]

# Keep rows for the ten most demanded skills that have a real posting date.
# The date column itself is checked because year_month is a string column
# and may contain the text "NaT" for undated rows, which notna() accepts.
skill_trend_df = df_skills[
    df_skills["skill"].isin(top_10_skills)
    & df_skills["job_posted_date"].notna()
    & df_skills["year_month"].notna()
]

# Count postings per skill per month
skill_trend = (
    skill_trend_df.groupby(["year_month", "skill"])
    .size()
    .reset_index(name="count")
)

# Pivot to wide format: one row per month, one column per skill; months in
# which a skill never appears get 0.
skill_trend_pivot = skill_trend.pivot(
    index="year_month", columns="skill", values="count"
).fillna(0).reset_index()

skill_trend_pivot = skill_trend_pivot.sort_values("year_month")

# fillna() leaves the counts as floats; convert them back to int for JSON.
skill_trends_list = [
    {k: (v if k == "year_month" else int(v)) for k, v in record.items()}
    for record in skill_trend_pivot.to_dict(orient="records")
]
250
+
251
+ # ─────────────────────────────────────────────
252
+ # 5c. SALARY TRENDS OVER TIME
253
+ # ─────────────────────────────────────────────
254
# Monthly salary statistics over postings that have both a real posting date
# and a salary. The date column is checked directly because year_month is a
# string column and may contain the text "NaT" for undated rows.
salary_trend = (
    df.dropna(subset=["job_posted_date", "year_month", "salary_year_avg"])
    .groupby("year_month")["salary_year_avg"]
    .agg(["mean", "median", "min", "max"])
    .round(0)
    .reset_index()
    .sort_values("year_month")
)

salary_trends_list = [
    {
        "month": str(row["year_month"]),
        "avg": int(row["mean"]),
        "median": int(row["median"]),
        "min": int(row["min"]),
        "max": int(row["max"]),
    }
    for _, row in salary_trend.iterrows()
]
273
+
274
+ # ─────────────────────────────────────────────
275
+ # 6. LOCATION & REMOTE ANALYSIS
276
+ # ─────────────────────────────────────────────
277
print("\n[6/6] Analyzing location and remote work...")

# Remote vs. on-site split. The all-zero default is kept when the dataset
# has no work-from-home flag.
remote_breakdown = {"remote": 0, "onsite": 0, "remote_pct": 0, "onsite_pct": 0}
if "job_work_from_home" in df.columns:
    remote_counts = df["job_work_from_home"].value_counts()
    remote_true = int(remote_counts.get(True, 0))
    remote_false = int(remote_counts.get(False, 0))
    total_with_remote = remote_true + remote_false
    # Guard the percentage against an empty flag column.
    denominator = max(total_with_remote, 1)
    remote_breakdown = {
        "remote": remote_true,
        "onsite": remote_false,
        "remote_pct": round(remote_true / denominator * 100, 1),
        "onsite_pct": round(remote_false / denominator * 100, 1),
    }

# Ten countries with the most postings (empty when the column is absent).
top_countries = []
if "job_country" in df.columns:
    country_counts = df["job_country"].value_counts().head(10)
    top_countries = [
        {"country": str(country), "count": int(n_postings)}
        for country, n_postings in country_counts.items()
    ]

print(f" Remote jobs: {remote_breakdown['remote_pct']}%")
305
+
306
+ # ─────────────────────────────────────────────
307
+ # SAVE ALL OUTPUT
308
+ # ─────────────────────────────────────────────
309
# Bundle every computed metric under the keys the Flask API serves.
output = {
    "meta": {
        "generated_at": datetime.utcnow().isoformat(),
        "total_rows_processed": total_jobs,
        "dataset": "lukebarousse/data_jobs",
    },
    "summary_stats": summary_stats,
    "top_titles": top_titles_list,
    "salary_by_title": salary_by_title_list,
    "top_skills": top_skills_list,
    "skills_by_title": skills_by_title,
    "optimal_skills": optimal_skills_list,
    "skill_trends": skill_trends_list,
    "salary_trends": salary_trends_list,
    "remote_breakdown": remote_breakdown,
    "top_countries": top_countries,
}

output_path = "precomputed_data.json"
# Explicit UTF-8 so the file does not depend on the platform's default
# encoding. default=str stringifies any value json cannot serialize natively.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2, default=str)

print("\n" + "=" * 60)
print(f"SUCCESS — saved to {output_path}")
print(f"Keys: {list(output.keys())}")
print("=" * 60)
print("\nNext step: run app.py to serve this data via Flask API")
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# app.py reads PORT (falling back to 5000) and enables the Flask debugger
# unless FLASK_ENV is "production". Pin both so the server listens on the
# port exposed below and never runs the debugger inside the container.
ENV PORT=7860 \
    FLASK_ENV=production

# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy source files. The analysis script is committed as "Analyze.py" and
# COPY is case-sensitive, so copy it under the lowercase name used below.
COPY Analyze.py ./analyze.py
COPY app.py .

# Generate precomputed_data.json at build time so app.py finds it at startup.
# This step downloads the dataset, so the build needs network access.
RUN python analyze.py

EXPOSE 7860

CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py — Flask API for FYP Dashboard
3
+ Serves precomputed_data.json generated by analyze.py
4
+
5
+ Endpoints:
6
+ GET /api/summary → summary stats (cards)
7
+ GET /api/titles → top job titles + salary by title
8
+ GET /api/skills → top skills, skills by title, optimal skills
9
+ GET /api/trends → salary trends + skill trends over time
10
+ GET /api/location → remote breakdown + top countries
11
+ GET /api/all → everything in one call (used by dashboard)
12
+ """
13
+
14
+ import json
15
+ import os
16
+ from flask import Flask, jsonify
17
+ from flask_cors import CORS
18
+
19
app = Flask(__name__)

# Browser origins allowed to call this API; tighten the Vercel entry to the
# real frontend domain once it is deployed.
# Flask-CORS evaluates an origin string containing regex characters as a
# regular expression, not a glob. Written as "https://*.vercel.app" the "*"
# only repeats the preceding "/", so no real Vercel origin would match; the
# wildcard is therefore spelled out as an anchored regex.
CORS(app, origins=[
    "http://localhost:3000",                    # local dev
    r"^https://[A-Za-z0-9.-]+\.vercel\.app$",   # any Vercel deployment/preview
])
26
+
27
+ # ─────────────────────────────────────────────
28
+ # Load precomputed data once at startup
29
+ # ─────────────────────────────────────────────
30
# The data file is expected next to this module.
DATA_PATH = os.path.join(os.path.dirname(__file__), "precomputed_data.json")


def load_data():
    """Read and return the contents of ``precomputed_data.json``.

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If the file does not exist; the message tells the
            operator to run analyze.py first.
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    # Open directly instead of checking os.path.exists() first, which avoids
    # a race between the check and the open. JSON is read as UTF-8 regardless
    # of the platform's default encoding.
    try:
        with open(DATA_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(
            f"precomputed_data.json not found at {DATA_PATH}\n"
            "Run analyze.py first to generate it."
        ) from None
40
+
41
# Load the data once at import time. A missing or corrupt file must not stop
# the server from starting: DATA stays None and the routes answer 503.
try:
    DATA = load_data()
except (FileNotFoundError, json.JSONDecodeError) as e:
    print(f"[ERROR] {e}")
    DATA = None
else:
    # Only the load itself is guarded; the summary runs on success.
    print("[OK] Loaded precomputed data")
    print(f" Generated at: {DATA['meta']['generated_at']}")
    print(f" Total rows: {DATA['meta']['total_rows_processed']:,}")
49
+
50
+
51
def data_required(fn):
    """Wrap a view so it answers HTTP 503 while no data is loaded.

    Args:
        fn: The view function to protect.

    Returns:
        A function with ``fn``'s metadata that calls ``fn`` when the
        module-level ``DATA`` is set, and otherwise returns a JSON error
        body together with status code 503.
    """
    import functools

    @functools.wraps(fn)
    def guarded(*args, **kwargs):
        if DATA is not None:
            return fn(*args, **kwargs)
        not_ready = {"error": "Data not ready. Run analyze.py first."}
        return jsonify(not_ready), 503

    return guarded
62
+
63
+
64
+ # ─────────────────────────────────────────────
65
+ # Routes
66
+ # ─────────────────────────────────────────────
67
+
68
@app.route("/", methods=["GET"])
def index():
    """Report service status, data provenance and the available endpoints."""
    if DATA is None:
        return jsonify(status="error", message="Run analyze.py first"), 503

    meta = DATA["meta"]
    endpoints = [
        "/api/summary",
        "/api/titles",
        "/api/skills",
        "/api/trends",
        "/api/location",
        "/api/all",
    ]
    return jsonify(
        status="ok",
        generated_at=meta["generated_at"],
        total_rows=meta["total_rows_processed"],
        endpoints=endpoints,
    )
85
+
86
+
87
@app.route("/api/summary", methods=["GET"])
@data_required
def get_summary():
    """Return the precomputed ``summary_stats`` section as JSON."""
    summary = DATA["summary_stats"]
    return jsonify(summary)
91
+
92
+
93
@app.route("/api/titles", methods=["GET"])
@data_required
def get_titles():
    """Return the ``top_titles`` and ``salary_by_title`` sections as JSON."""
    sections = ("top_titles", "salary_by_title")
    return jsonify({key: DATA[key] for key in sections})
100
+
101
+
102
@app.route("/api/skills", methods=["GET"])
@data_required
def get_skills():
    """Return the three skills sections of the precomputed data as JSON."""
    sections = ("top_skills", "skills_by_title", "optimal_skills")
    return jsonify({key: DATA[key] for key in sections})
110
+
111
+
112
@app.route("/api/trends", methods=["GET"])
@data_required
def get_trends():
    """Return the ``salary_trends`` and ``skill_trends`` sections as JSON."""
    sections = ("salary_trends", "skill_trends")
    return jsonify({key: DATA[key] for key in sections})
119
+
120
+
121
@app.route("/api/location", methods=["GET"])
@data_required
def get_location():
    """Return the ``remote_breakdown`` and ``top_countries`` sections as JSON."""
    sections = ("remote_breakdown", "top_countries")
    return jsonify({key: DATA[key] for key in sections})
128
+
129
+
130
@app.route("/api/all", methods=["GET"])
@data_required
def get_all():
    """Return the whole precomputed document in one response.

    Intended for the dashboard, which can fetch everything once on load.
    """
    everything = DATA
    return jsonify(everything)
135
+
136
+
137
+ # ─────────────────────────────────────────────
138
+ # Run
139
+ # ─────────────────────────────────────────────
140
if __name__ == "__main__":
    # PORT comes from the environment when the host sets it; 5000 otherwise.
    port = int(os.environ.get("PORT", 5000))
    # The server binds to every interface, so debug mode must be opt-in: the
    # Werkzeug debugger allows arbitrary code execution from the browser.
    # Enable it only when FLASK_ENV is explicitly "development"; an unset
    # variable now means debug is off.
    debug = os.environ.get("FLASK_ENV") == "development"
    print(f"Starting Flask on port {port} (debug={debug})")
    app.run(host="0.0.0.0", port=port, debug=debug)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ flask
2
+ flask-cors
3
+ pandas
4
+ numpy
5
+ datasets