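# NOTE: the next three lines appear to be repository web-page residue captured
# with the file (uploader caption, commit message, commit hash); not program code.
# Jitendra12421's picture
# Upload 5 files
# d3ce5a6 verified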
# train.py
import asyncio
from datetime import datetime, timedelta
import pandas as pd
from features import DataPipeline
from model import StockNewsModel
def audit_leakage(X_train, X_test, train_df, test_df):
    """Print a data-leakage audit for a train/test split and return the verdict.

    Four checks are run and reported on stdout:

    1. the latest training date is strictly earlier than the earliest test date;
    2. no article link appears in both ``train_df`` and ``test_df``;
    3. no training feature name contains "next", "future", "target" or
       "label" (compared case-insensitively, on the string form of the label);
    4. ``X_train`` and ``X_test`` have the same number of columns.

    Args:
        X_train: Training feature frame; only ``columns`` and ``shape`` are read.
        X_test: Test feature frame; only ``shape`` is read.
        train_df: Frame with ``"date"`` and ``"link"`` columns for training rows.
        test_df: Frame with ``"date"`` and ``"link"`` columns for test rows.

    Returns:
        bool: ``True`` when all four checks pass, ``False`` otherwise.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(" DATA LEAKAGE AUDIT")
    print(rule)

    # Check 1: temporal ordering of the split.
    train_max = train_df["date"].max()
    test_min = test_df["date"].min()
    # bool() normalises the pandas/numpy scalar so the function always
    # returns a plain Python bool.
    ok1 = bool(train_max < test_min)
    print(f" [{'OK' if ok1 else 'FAIL'}] Train max date ({train_max}) < Test min date ({test_min})")

    # Check 2: the same article must not appear on both sides of the split.
    overlap = set(train_df["link"]) & set(test_df["link"])
    ok2 = not overlap
    print(f" [{'OK' if ok2 else 'FAIL'}] No shared articles (overlap={len(overlap)})")

    # Check 3: feature names hinting at look-ahead information.
    # Labels are stringified and lower-cased so that non-string labels
    # (e.g. integer column positions) do not raise and "Next"/"TARGET"
    # spellings are still caught.
    suspicious = ("next", "future", "target", "label")
    future_cols = [
        c for c in X_train.columns
        if any(word in str(c).lower() for word in suspicious)
    ]
    ok3 = not future_cols
    print(f" [{'OK' if ok3 else 'FAIL'}] No future-looking feature names (found: {future_cols})")

    # Check 4: both matrices must have the same width.
    ok4 = X_train.shape[1] == X_test.shape[1]
    print(f" [{'OK' if ok4 else 'FAIL'}] Same feature count: train={X_train.shape[1]} test={X_test.shape[1]}")

    all_ok = all((ok1, ok2, ok3, ok4))
    print(f"\n {'OK ALL CHECKS PASSED' if all_ok else 'FAIL LEAKAGE DETECTED'}")
    print(rule)
    return all_ok
async def main():
    """Build the dataset, run the leakage audit, then train and save the model.

    Steps, in order:
      1. print a banner with the ticker and window sizes;
      2. await ``DataPipeline.build_dataset()`` for the train/test matrices;
      3. call ``audit_leakage`` on the feature matrices and two metadata frames;
      4. train/evaluate a ``StockNewsModel`` and save it to
         ``<ticker without '^'>_model.pkl``.
    """
    ticker, train_days, test_days = "^NSEI", 180, 14

    rule = "#" * 60
    print(f"\n{rule}")
    print(" LightGBM NEWS IMPACT MODEL - TRAINING")
    print(f" Ticker: {ticker}")
    print(f" Train: {train_days} days | Test: {test_days} days")
    print(rule)

    pipeline = DataPipeline(ticker, train_days=train_days, test_days=test_days)
    dataset = await pipeline.build_dataset()
    # The last element (price data) is not used in this function.
    X_train, y_dir, y_hh, y_ret, X_test, test_df, _price_df = dataset

    # NOTE(review): the training-side audit frame is synthetic. Every row is
    # given the day before the test cutoff and a generated "train_<i>" link,
    # so the date and link checks compare placeholders rather than real
    # training metadata -- confirm whether the pipeline can expose the latter.
    cutoff = (datetime.now() - timedelta(days=test_days)).date()
    placeholder_day = cutoff - timedelta(days=1)
    n_train = len(X_train)
    train_df_audit = pd.DataFrame(
        {
            "date": [placeholder_day] * n_train,
            "link": [f"train_{i}" for i in range(n_train)],
        }
    )
    test_df_audit = test_df[["date", "link"]].copy()

    # NOTE(review): the audit verdict is only printed; training proceeds
    # regardless of the returned value.
    audit_leakage(X_train, X_test, train_df_audit, test_df_audit)

    model = StockNewsModel()
    model.train_and_evaluate(X_train, y_dir, y_hh, y_ret, X_test, test_df)

    model.save(ticker.replace("^", "") + "_model.pkl")
# Run the async training workflow only when executed as a script, so that
# importing this module does not start a training run.
if __name__ == "__main__":
    asyncio.run(main())