import pandas as pd
import re
import pickle
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# ---------------- PATH ----------------
# Resolve every path relative to this script so the current working
# directory does not matter.
_SCRIPT_PATH = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(_SCRIPT_PATH)
DATA_PATH = os.path.join(BASE_DIR, "data", "fake_job_postings.csv")
MODEL_DIR = os.path.join(BASE_DIR, "model")

# Create the output directory up front; exist_ok=True makes this safe to re-run.
os.makedirs(MODEL_DIR, exist_ok=True)

# ---------------- LOAD ----------------
# Read the full postings dataset into memory.
df = pd.read_csv(DATA_PATH)

# ---------------- CLEAN TEXT ----------------
def clean(text):
    """Normalize raw posting text before vectorization.

    The value is converted to a string, lowercased, every run of whitespace
    is collapsed to a single space, and leading/trailing whitespace is
    removed.

    Args:
        text: Value to normalize. Non-string values are converted with
            ``str()``.

    Returns:
        The normalized string. ``None`` and float NaN yield ``""`` rather
        than the literal tokens ``"none"`` / ``"nan"``.
    """
    # NaN is the only float that is not equal to itself, so this detects a
    # missing value without needing any extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return re.sub(r"\s+", " ", str(text).lower()).strip()

# Free-text columns merged, in this order, into one document per posting.
TEXT_COLUMNS = ["title", "description", "location", "company_profile", "requirements"]

# Missing values become empty strings so the concatenation never yields NaN;
# the parts are joined with a single space.
merged = df[TEXT_COLUMNS[0]].fillna("")
for column in TEXT_COLUMNS[1:]:
    merged = merged + " " + df[column].fillna("")

df["text"] = merged.apply(clean)

# Features are the cleaned documents; the target is the `fraudulent` column.
X, y = df["text"], df["fraudulent"]

# ---------------- SPLIT ----------------
# Hold out 20% of the postings for evaluation, with a fixed seed for
# reproducibility. The split is stratified on the label so train and test keep
# the same class proportions; the classifier's class_weight="balanced" setting
# suggests the label is imbalanced, and an unstratified split could leave the
# test set with an unrepresentative share of the minority class.
# NOTE: stratify requires at least two samples of every class in `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------- PIPELINE (TF-IDF + LOGISTIC REGRESSION) ----------------
# TF-IDF over unigrams and bigrams, capped at 8000 features, with the built-in
# English stop-word list.
text_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=8000,
)

# Balanced class weights and a raised iteration cap for the solver.
fraud_classifier = LogisticRegression(class_weight="balanced", max_iter=2000)

# The step names "tfidf" and "clf" are part of the saved object's interface.
pipeline = Pipeline(steps=[
    ("tfidf", text_vectorizer),
    ("clf", fraud_classifier),
])

# ---------------- TRAIN ----------------
# Fit the vectorizer and the classifier together on the training split.
pipeline.fit(X_train, y_train)

# ---------------- EVALUATE ----------------
# Predict on the held-out split and print the classification report.
y_pred = pipeline.predict(X_test)
print("\nMODEL REPORT:\n")
report = classification_report(y_test, y_pred)
print(report)

# ---------------- SAVE ONLY PIPELINE ----------------
# The whole pipeline (vectorizer + classifier) is pickled as one object, so
# no separate vectorizer file is written.
model_path = os.path.join(MODEL_DIR, "scam_model.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump(pipeline, model_file)

print("\nModel trained and saved successfully!")