# ShubhamAC's picture
# Initial commit: ML inference and evaluation pipeline
# 0b01ce8
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# Load dataset (example CSV format).
# Expected columns: text, label
data = pd.read_csv("data/train.csv")
# astype(str) guarantees the vectorizer only receives strings. Note that
# missing values are turned into the literal text "nan", not dropped.
X = data["text"].astype(str)
y = data["label"]

# Split: hold out 20% of the rows for evaluation. Stratifying on the label
# keeps class proportions equal across both splits, and the fixed seed makes
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Vectorizer: TF-IDF over unigrams and bigrams with English stop words
# removed. Terms seen in fewer than 3 documents or in more than 90% of them
# are discarded, and the vocabulary is capped at 10,000 features.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    stop_words="english",
)
# Fit the vocabulary on the training split only so that no information from
# the held-out rows leaks into the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model: logistic regression with class weights balanced against label
# frequency; C < 1 applies somewhat stronger regularization than the default.
model = LogisticRegression(
    max_iter=1000,
    C=0.6,
    class_weight="balanced",
    n_jobs=-1,
)
model.fit(X_train_vec, y_train)

# Evaluation on the held-out split.
# NOTE(review): f1_score defaults to average="binary" with pos_label=1, so
# this assumes the label column is binary 0/1 -- confirm against the data.
y_pred = model.predict(X_test_vec)
f1 = f1_score(y_test, y_pred)
print("Test F1-score:", round(f1, 3))

# Save artifacts. The output directory is created first so that a fresh
# checkout does not fail with FileNotFoundError after training has finished.
os.makedirs("models", exist_ok=True)
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
joblib.dump(model, "models/logistic_model.pkl")
print("Model artifacts saved.")