Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| import pickle | |
| import os | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.metrics import classification_report | |
| # ---------------- PATH ---------------- | |
| BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
| DATA_PATH = os.path.join(BASE_DIR, "data", "fake_job_postings.csv") | |
| MODEL_DIR = os.path.join(BASE_DIR, "model") | |
| os.makedirs(MODEL_DIR, exist_ok=True) | |
| # ---------------- LOAD ---------------- | |
| df = pd.read_csv(DATA_PATH) | |
| # ---------------- CLEAN (IMPROVED) ---------------- | |
| def clean(text): | |
| text = str(text).lower() | |
| text = re.sub(r"\s+", " ", text) | |
| return text | |
| df["text"] = ( | |
| df["title"].fillna("") + " " + | |
| df["description"].fillna("") + " " + | |
| df["location"].fillna("") + " " + | |
| df["company_profile"].fillna("") + " " + | |
| df["requirements"].fillna("") | |
| ) | |
| df["text"] = df["text"].apply(clean) | |
| X = df["text"] | |
| y = df["fraudulent"] | |
| # ---------------- SPLIT ---------------- | |
| X_train, X_test, y_train, y_test = train_test_split( | |
| X, y, test_size=0.2, random_state=42 | |
| ) | |
| # ---------------- PIPELINE (IMPORTANT UPGRADE) ---------------- | |
| pipeline = Pipeline([ | |
| ("tfidf", TfidfVectorizer( | |
| max_features=8000, | |
| ngram_range=(1, 2), # BIG IMPROVEMENT | |
| stop_words="english" | |
| )), | |
| ("clf", LogisticRegression( | |
| max_iter=2000, | |
| class_weight="balanced" | |
| )) | |
| ]) | |
| # ---------------- TRAIN ---------------- | |
| pipeline.fit(X_train, y_train) | |
| # ---------------- EVALUATE ---------------- | |
| y_pred = pipeline.predict(X_test) | |
| print("\nMODEL REPORT:\n") | |
| print(classification_report(y_test, y_pred)) | |
| # ---------------- SAVE ONLY PIPELINE ---------------- | |
| with open(os.path.join(MODEL_DIR, "scam_model.pkl"), "wb") as f: | |
| pickle.dump(pipeline, f) | |
| print("\nModel trained and saved successfully!") |