File size: 3,883 Bytes

3d212b0

import os

import joblib
import numpy as np
import pandas as pd
import requests
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# OpenWeatherMap API key. It is read from the OPENWEATHER_API_KEY environment
# variable when that is set (and non-empty) so the credential does not have to
# live in source control; the literal is kept only as a backward-compatible
# fallback for existing setups.
# NOTE(review): the fallback key is committed in plain text - consider rotating it.
API_KEY = os.environ.get('OPENWEATHER_API_KEY') or 'a4f54718b17aa482e0b0a9f2e6220fc0'
# In-process memo of weather lookups, keyed by the string "city,state".
WEATHER_CACHE = {}

# Lookup table from calendar month number (1-12) to season name.
# Built from per-season month groups; key order matches the grouping below.
SEASON_MAP = {
    month: season
    for season, months in (
        ('Winter', (1, 2, 12)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Fall', (9, 10, 11)),
    )
    for month in months
}

def fetch_weather(city, state, api_key=API_KEY):
    """Return current weather for a US city using the OpenWeatherMap API.

    The location is first geocoded to latitude/longitude, then the current
    conditions at that point are requested with ``units=metric``. Results are
    memoised in ``WEATHER_CACHE`` under the key ``"city,state"``.

    This helper is best-effort: network failures, HTTP errors and malformed
    payloads are reported with ``print`` and answered with a fixed placeholder
    instead of raising. Placeholders produced by such errors are not cached,
    so a later call can retry; a geocoding response with no match is cached,
    because repeating that query would give the same answer.

    Args:
        city: City name, used in the geocoding query and the cache key.
        state: State name, used in the geocoding query and the cache key.
        api_key: OpenWeatherMap API key; defaults to the module-level
            ``API_KEY``.

    Returns:
        A dict with the keys ``'temperature'``, ``'humidity'`` and
        ``'condition'``. The same dict object is returned on later cache hits.
    """
    key = f"{city},{state}"
    if key in WEATHER_CACHE:
        return WEATHER_CACHE[key]

    # Placeholder returned whenever real weather data cannot be obtained.
    fallback = {'temperature': 20, 'humidity': 50, 'condition': 'Clear'}
    # Seconds to wait per request; without a timeout requests can block forever.
    timeout = 10
    try:
        # HTTPS keeps the API key out of cleartext traffic, and ``params``
        # lets requests URL-encode city/state names (spaces, '&', '#', ...).
        geo_resp = requests.get(
            "https://api.openweathermap.org/geo/1.0/direct",
            params={'q': f"{city},{state},US", 'limit': 1, 'appid': api_key},
            timeout=timeout,
        )
        geo_resp.raise_for_status()
        geo_data = geo_resp.json()
        if not geo_data:
            # No geocoding match: remember it so the same location is not
            # queried again for every row that mentions it.
            WEATHER_CACHE[key] = fallback
            return fallback
        lat, lon = geo_data[0]['lat'], geo_data[0]['lon']
        weather_resp = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={'lat': lat, 'lon': lon, 'appid': api_key, 'units': 'metric'},
            timeout=timeout,
        )
        weather_resp.raise_for_status()
        data = weather_resp.json()
        weather = {
            'temperature': data['main']['temp'],
            'humidity': data['main']['humidity'],
            'condition': data['weather'][0]['main']
        }
    except (requests.RequestException, KeyError, IndexError,
            TypeError, ValueError) as e:
        # Transport/HTTP problems or an unexpected payload shape: stay
        # best-effort and do not cache, so a later call may succeed.
        print(f"Weather fetch error for {city}, {state}: {e}")
        return fallback
    WEATHER_CACHE[key] = weather
    return weather

def extract_season(month):
    """Return the season name for a month number, or 'Unknown' if unmapped.

    Args:
        month: Month number looked up in ``SEASON_MAP``.

    Returns:
        The season string from ``SEASON_MAP``, or ``'Unknown'`` when the
        month is not a key of that table.
    """
    try:
        return SEASON_MAP[month]
    except KeyError:
        return 'Unknown'

def load_and_prepare_data(csv_path):
    """Load the orders CSV and add the engineered training columns.

    The CSV must provide at least the columns ``'Order Date'``, ``'Sales'``,
    ``'City'`` and ``'State'``. The following columns are added:

    * ``order_month``, ``order_day_of_week``, ``season`` - derived from
      ``'Order Date'`` (parsed with ``dayfirst=True``).
    * ``discount`` - a simulated training target: sales scaled to 0-20
      relative to the largest sale, plus Gaussian noise (mean 0, std 2) drawn
      from a generator seeded with 42, so the values are reproducible.
    * ``temperature``, ``humidity``, ``condition`` - from ``fetch_weather``.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        The loaded ``DataFrame`` with the extra columns described above.
    """
    df = pd.read_csv(csv_path)

    # Parse dates
    df['Order Date'] = pd.to_datetime(df['Order Date'], dayfirst=True)
    df['order_month'] = df['Order Date'].dt.month
    df['order_day_of_week'] = df['Order Date'].dt.dayofweek
    df['season'] = df['order_month'].apply(extract_season)

    # Simulate discount for training.
    # A private RandomState(42) produces the same draws as seeding the global
    # NumPy generator with 42, without clobbering global random state.
    rng = np.random.RandomState(42)
    peak_sales = df['Sales'].max()
    # A peak of 0 would make the ratio 0/0 = NaN; use a flat 0 base instead.
    relative_sales = df['Sales'] / peak_sales if peak_sales else 0.0
    df['discount'] = relative_sales * 20 + rng.normal(0, 2, len(df))

    # Fetch weather features once per distinct (City, State) pair, then fan
    # the results back out to the rows. This also works for an empty frame,
    # where a row-wise DataFrame.apply does not return one dict per row.
    locations = list(zip(df['City'], df['State']))
    weather_by_location = {
        location: fetch_weather(*location)
        for location in dict.fromkeys(locations)
    }
    weather_rows = [weather_by_location[location] for location in locations]
    df['temperature'] = [w['temperature'] for w in weather_rows]
    df['humidity'] = [w['humidity'] for w in weather_rows]
    df['condition'] = [w['condition'] for w in weather_rows]
    return df

def train_discount_model(df):
    """Fit and return a pipeline that predicts the ``discount`` column.

    The pipeline one-hot encodes the categorical columns (categories unseen
    during fitting are ignored at transform time), passes the numeric columns
    through unchanged, and feeds the result to a 100-tree
    ``RandomForestRegressor`` with ``random_state=42``.

    Args:
        df: DataFrame containing every feature column listed below plus the
            ``'discount'`` target column.

    Returns:
        The fitted ``Pipeline`` with steps ``'pre'`` and ``'reg'``.
    """
    # Columns handed to the pipeline, in the order the model is fitted on.
    feature_cols = [
        'Category', 'Sub-Category', 'Product ID', 'Sales',
        'City', 'State', 'Segment', 'Ship Mode',
        'order_month', 'order_day_of_week', 'season',
        'temperature', 'humidity', 'condition'
    ]
    # Split of the features by how the preprocessor treats them.
    categorical_cols = [
        'Category', 'Sub-Category', 'Product ID', 'City', 'State',
        'Segment', 'Ship Mode', 'season', 'condition'
    ]
    numeric_cols = ['Sales', 'order_month', 'order_day_of_week', 'temperature', 'humidity']

    encode_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    passthrough_step = ('num', 'passthrough', numeric_cols)
    column_prep = ColumnTransformer([encode_step, passthrough_step])

    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    pipeline = Pipeline([('pre', column_prep), ('reg', forest)])

    inputs = df[feature_cols]
    target = df['discount']
    pipeline.fit(inputs, target)
    return pipeline

if __name__ == "__main__":
    # Script entry point: build the training frame from train.csv, fit the
    # discount model, and persist it with joblib.
    print("Loading and preparing data...")
    training_frame = load_and_prepare_data('train.csv')
    print("Training model...")
    fitted_model = train_discount_model(training_frame)
    joblib.dump(fitted_model, 'discount_model.joblib')
    print("Model trained and saved as discount_model.joblib.")