Spaces:

shravanijadhav264
/

JobShield-AI

Sleeping

App Files Files Community

JobShield-AI / train_model.py

shravanijadhav264

Initial clean commit

984c70c 16 days ago

raw

history blame contribute delete

1.94 kB

	import pandas as pd
	import re
	import pickle
	import os

	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline
	from sklearn.metrics import classification_report

	# ---------------- PATHS ----------------
	# Every location is derived from this script's own directory, so the
	# script does not depend on the current working directory.
	_script_path = os.path.abspath(__file__)
	BASE_DIR = os.path.dirname(_script_path)
	DATA_PATH = os.path.join(BASE_DIR, "data", "fake_job_postings.csv")
	MODEL_DIR = os.path.join(BASE_DIR, "model")

	# Create the output directory up front; an already existing one is fine.
	os.makedirs(name=MODEL_DIR, exist_ok=True)

	# ---------------- LOAD ----------------
	# Read the whole CSV into memory as a DataFrame.
	df = pd.read_csv(filepath_or_buffer=DATA_PATH)

	# ---------------- CLEAN (IMPROVED) ----------------
	def clean(text: object) -> str:
	    """Normalize a raw value into lower-case, single-spaced text.

	    The value is converted with ``str()``, lower-cased, and every run of
	    whitespace (spaces, tabs, newlines) is replaced by one space. Leading
	    and trailing whitespace is collapsed the same way but not stripped.

	    Args:
	        text: Value to normalize; non-string values are stringified first.

	    Returns:
	        The normalized string.
	    """
	    lowered = str(text).lower()
	    return re.sub(r"\s+", " ", lowered)

	# Free-text columns merged, in this order and separated by single spaces,
	# into one document per row. Missing values count as empty strings.
	TEXT_COLUMNS = [
	    "title",
	    "description",
	    "location",
	    "company_profile",
	    "requirements",
	]

	combined = df[TEXT_COLUMNS[0]].fillna("")
	for column in TEXT_COLUMNS[1:]:
	    combined = combined + " " + df[column].fillna("")

	# Store the normalized document next to the source columns.
	df["text"] = combined.apply(clean)

	X = df["text"]  # model input: one normalized document per row
	y = df["fraudulent"]  # target column

	# ---------------- SPLIT ----------------
	# Hold out 20% of the rows for evaluation; the fixed seed makes the split
	# reproducible. stratify=y keeps the label proportions of y the same in the
	# train and test parts, so the evaluation report is not skewed by an
	# unlucky draw of the rarer class (the classifier is configured with
	# class_weight="balanced", which suggests the labels are imbalanced).
	# NOTE: stratification needs at least two rows of every class in y.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=42,
	    stratify=y,
	)

	# ---------------- PIPELINE ----------------
	# Vectorizer and classifier are chained so that one fitted object goes
	# from cleaned text to a prediction.
	tfidf_step = TfidfVectorizer(
	    stop_words="english",
	    ngram_range=(1, 2),  # unigrams and bigrams
	    max_features=8000,
	)
	clf_step = LogisticRegression(
	    class_weight="balanced",
	    max_iter=2000,
	)

	pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("clf", clf_step)])

	# ---------------- TRAIN ----------------
	# Fit the vectorizer and the classifier on the training split only.
	pipeline.fit(X_train, y_train)

	# ---------------- EVALUATE ----------------
	# Predict on the held-out split and print the per-class metrics.
	y_pred = pipeline.predict(X_test)
	print("\nMODEL REPORT:\n")
	report = classification_report(y_test, y_pred)
	print(report)

	# ---------------- SAVE ONLY PIPELINE ----------------
	# The fitted pipeline (vectorizer + classifier) is written as one pickle
	# file inside MODEL_DIR.
	model_path = os.path.join(MODEL_DIR, "scam_model.pkl")
	with open(model_path, "wb") as model_file:
	    pickle.dump(pipeline, model_file)

	print("\nModel trained and saved successfully!")