# File: ai-service/training/train_performance_predictor.py
"""Train XGBoost regression models predicting post 'likes' and 'comments'.

Reads data/performance_training_data.csv under the project root and writes
two fitted sklearn pipelines to the models/ directory via joblib.
"""

import os
import sys

import joblib
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Add the project root directory to the import path.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(ROOT_DIR)


def train_performance_models():
    """
    Loads the performance data CSV and trains two separate models:
    one for predicting 'likes' and one for predicting 'comments'.

    Side effects:
        Saves fitted pipelines to models/likes_predictor_v1.joblib and
        models/comments_predictor_v1.joblib under ROOT_DIR.
        Prints progress/error messages; returns early (None) on bad input.
    """
    print("--- Starting Performance Predictor Model Training ---")
    data_path = os.path.join(ROOT_DIR, 'data', 'performance_training_data.csv')
    try:
        df = pd.read_csv(data_path)
        print(f"Loaded {len(df)} rows from {data_path}")
        if df.empty:
            print("āš ļø CSV file is empty. Aborting model training.")
            return
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"šŸ”“ ERROR: Data file not found or is empty at {data_path}")
        return

    # Features (X): the columns we predict likes/comments from.
    features = ['follower_count', 'caption_length', 'campaign_niche', 'content_format']

    # Fail fast (in the script's print-and-return style) rather than crash
    # with a KeyError if the CSV schema is wrong.
    required_columns = features + ['likes', 'comments']
    missing_columns = [c for c in required_columns if c not in df.columns]
    if missing_columns:
        print(f"šŸ”“ ERROR: CSV is missing required columns: {missing_columns}")
        return

    X = df[features]

    # Targets (y)
    y_likes = df['likes']
    y_comments = df['comments']

    # Preprocessing pipeline: one-hot encode the categorical features,
    # pass numeric features through unchanged. handle_unknown='ignore'
    # keeps prediction-time inputs with unseen categories from crashing.
    categorical_features = ['campaign_niche', 'content_format']
    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
        remainder='passthrough'
    )

    # Ensure the output directory exists before joblib.dump, which does not
    # create intermediate directories and would raise FileNotFoundError.
    models_dir = os.path.join(ROOT_DIR, 'models')
    os.makedirs(models_dir, exist_ok=True)

    # ---- Model #1: Likes Predictor ----
    print("\n--- Training Likes Predictor Model ---")
    likes_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42))
    ])
    likes_pipeline.fit(X, y_likes)

    likes_model_path = os.path.join(models_dir, 'likes_predictor_v1.joblib')
    joblib.dump(likes_pipeline, likes_model_path)
    print(f"āœ… Likes Predictor model saved to: {likes_model_path}")

    # ---- Model #2: Comments Predictor ----
    print("\n--- Training Comments Predictor Model ---")
    comments_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.05, random_state=42))
    ])
    comments_pipeline.fit(X, y_comments)

    comments_model_path = os.path.join(models_dir, 'comments_predictor_v1.joblib')
    joblib.dump(comments_pipeline, comments_model_path)
    print(f"āœ… Comments Predictor model saved to: {comments_model_path}")

    print("\nšŸŽ‰ All performance models trained and saved successfully!")


if __name__ == '__main__':
    train_performance_models()