Fraud_Detection / train_log.py
cmasukume's picture
Upload 21 files
045d34f verified
import argparse
import os
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import schedule
import time
def main(data_path):
print(f"Reading data from: {data_path}")
# Set the tracking URI to your MLflow server
mlflow.set_tracking_uri("http://127.0.0.1:5000") # Replace with your MLflow tracking server URI
# Load data
try:
data = pd.read_csv(data_path)
except Exception as e:
print(f"Error reading the data file: {e}")
return
# Preprocess and split data
X = data.drop(columns='Class')
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Evaluate model
train_accuracy = accuracy_score(y_train, model.predict(X_train))
test_accuracy = accuracy_score(y_test, model.predict(X_test))
# Log model with MLflow
with mlflow.start_run() as run:
mlflow.log_param('random_state', 42)
mlflow.log_metric('train_accuracy', train_accuracy)
mlflow.log_metric('test_accuracy', test_accuracy)
mlflow.sklearn.log_model(model, 'model')
# Register the model
mlflow.register_model(
model_uri=f"runs:/{run.info.run_id}/model",
name="LogisticRegressionModel"
)
print(f"Train Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")
def job():
parser = argparse.ArgumentParser()
parser.add_argument('--data', type=str, help="Path to the training data", required=True)
args = parser.parse_args()
# Print args.data for debugging
print(f"Data path provided: {args.data}")
# Check if the file exists
if not os.path.isfile(args.data):
print(f"Error: The file {args.data} does not exist.")
else:
main(args.data)
if __name__ == "__main__":
# Schedule the job to run every 30 days
schedule.every(30).days.do(job)
# Run the scheduling loop
while True:
schedule.run_pending()
time.sleep(1)
# run:
# mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns --host 0.0.0.0 --port 5000
# then run:
# c:/Fraud_Detection/Scripts/python.exe c:/Fraud_Detection/train_log.py --data "c:/Fraud_Detection/creditcard.csv"