|
|
import os

import joblib
import numpy as np
import pandas as pd
import requests
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
|
|
|
|
|
|
# OpenWeatherMap API key used as the default credential by fetch_weather().
# Supply it through the OPENWEATHER_API_KEY environment variable; the literal
# is kept only as a fallback so existing runs keep working unchanged.
# SECURITY NOTE(review): a key committed to source control should be rotated
# and this fallback removed once deployments set the environment variable.
API_KEY = os.environ.get('OPENWEATHER_API_KEY', 'a4f54718b17aa482e0b0a9f2e6220fc0')

# Per-process memo of weather lookups, keyed by the "<city>,<state>" string.
WEATHER_CACHE = {}
|
|
|
|
|
|
|
|
|
# Calendar month number (1-12) -> season name. Built season by season so each
# group of months reads as one unit; insertion order is Winter, Spring,
# Summer, Fall.
SEASON_MAP = {
    **dict.fromkeys((1, 2, 12), 'Winter'),
    **dict.fromkeys((3, 4, 5), 'Spring'),
    **dict.fromkeys((6, 7, 8), 'Summer'),
    **dict.fromkeys((9, 10, 11), 'Fall'),
}
|
|
|
|
|
|
# Seconds to wait on each HTTP call; without a timeout requests can block
# indefinitely on a stalled connection.
_WEATHER_TIMEOUT_SECONDS = 10


def _default_weather():
    """Return a fresh copy of the neutral fallback weather reading."""
    return {'temperature': 20, 'humidity': 50, 'condition': 'Clear'}


def fetch_weather(city, state, api_key=API_KEY):
    """Return the current weather for a US city, memoised in ``WEATHER_CACHE``.

    The city is first geocoded through OpenWeatherMap's direct geocoding
    endpoint, then the current-weather endpoint is queried for the resulting
    coordinates with ``units=metric``.

    Args:
        city: City name, combined with ``state`` and the ``US`` country code
            for the geocoding query.
        state: State name or code.
        api_key: OpenWeatherMap API key; defaults to the module-level
            ``API_KEY``.

    Returns:
        A dict with the keys ``'temperature'``, ``'humidity'`` and
        ``'condition'``. When the place cannot be geocoded, or any request or
        parsing step fails, the fallback
        ``{'temperature': 20, 'humidity': 50, 'condition': 'Clear'}`` is
        returned instead; this function does not raise for such failures.
    """
    key = f"{city},{state}"
    if key in WEATHER_CACHE:
        return WEATHER_CACHE[key]

    try:
        # HTTPS keeps the API key out of clear text; ``params`` lets requests
        # URL-encode city/state values containing spaces or reserved
        # characters.
        geo_resp = requests.get(
            "https://api.openweathermap.org/geo/1.0/direct",
            params={'q': f"{city},{state},US", 'limit': 1, 'appid': api_key},
            timeout=_WEATHER_TIMEOUT_SECONDS,
        )
        geo_resp.raise_for_status()
        geo_data = geo_resp.json()

        if not geo_data:
            # The geocoder answered but knows no such place. Cache the
            # fallback so the same unknown city is not re-queried on every
            # subsequent call.
            weather = _default_weather()
            WEATHER_CACHE[key] = weather
            return weather

        lat, lon = geo_data[0]['lat'], geo_data[0]['lon']

        weather_resp = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={'lat': lat, 'lon': lon, 'appid': api_key, 'units': 'metric'},
            timeout=_WEATHER_TIMEOUT_SECONDS,
        )
        weather_resp.raise_for_status()
        data = weather_resp.json()

        weather = {
            'temperature': data['main']['temp'],
            'humidity': data['main']['humidity'],
            'condition': data['weather'][0]['main'],
        }
        WEATHER_CACHE[key] = weather
        return weather
    except Exception as e:
        # Deliberately broad: this is a best-effort enrichment step and must
        # never abort the caller. Failures are not cached, so a transient
        # error can succeed on a later call.
        print(f"Weather fetch error for {city}, {state}: {e}")
        return _default_weather()
|
|
|
|
|
|
def extract_season(month):
    """Map a calendar month number to its season name.

    Looks ``month`` up in ``SEASON_MAP`` and returns ``'Unknown'`` for any
    value that is not a key of that mapping.
    """
    try:
        return SEASON_MAP[month]
    except KeyError:
        return 'Unknown'
|
|
|
|
|
|
def load_and_prepare_data(csv_path):
    """Load the orders CSV and add the engineered training columns.

    The CSV must provide the ``'Order Date'``, ``'Sales'``, ``'City'`` and
    ``'State'`` columns read below.

    Args:
        csv_path: Path (or any source accepted by ``pandas.read_csv``) of the
            orders file.

    Returns:
        The loaded DataFrame with these columns added or replaced:
        ``'Order Date'`` (parsed as datetimes, day-first), ``'order_month'``,
        ``'order_day_of_week'``, ``'season'``, ``'discount'`` (synthetic
        target), ``'temperature'``, ``'humidity'`` and ``'condition'``.

    Note:
        The weather columns come from ``fetch_weather``, which queries the
        current-weather endpoint, so they reflect conditions at run time
        rather than on the order date.
    """
    df = pd.read_csv(csv_path)

    # Calendar features derived from the order date.
    df['Order Date'] = pd.to_datetime(df['Order Date'], dayfirst=True)
    df['order_month'] = df['Order Date'].dt.month
    df['order_day_of_week'] = df['Order Date'].dt.dayofweek
    df['season'] = df['order_month'].apply(extract_season)

    # Synthetic target: sales scaled to 0-20 plus Gaussian noise. A private
    # RandomState seeded with 42 draws from the same stream that
    # np.random.seed(42) would, without reseeding the process-global RNG as a
    # hidden side effect.
    rng = np.random.RandomState(42)
    noise = rng.normal(0, 2, len(df))
    df['discount'] = (df['Sales'] / df['Sales'].max()) * 20 + noise

    # One lookup per row, in row order. Zipping the two columns avoids
    # building a Series per row and, unlike DataFrame.apply(axis=1), yields an
    # empty list for a frame with no rows.
    weather_features = [
        fetch_weather(city, state)
        for city, state in zip(df['City'], df['State'])
    ]
    df['temperature'] = [w['temperature'] for w in weather_features]
    df['humidity'] = [w['humidity'] for w in weather_features]
    df['condition'] = [w['condition'] for w in weather_features]

    return df
|
|
|
|
|
|
def train_discount_model(df):
    """Fit and return the discount-prediction pipeline.

    Args:
        df: DataFrame containing every feature column listed below plus the
            ``'discount'`` target column.

    Returns:
        A fitted ``Pipeline`` with two steps: ``'pre'`` one-hot encodes the
        categorical columns (categories unseen at fit time are ignored at
        predict time) and passes the numeric columns through unchanged;
        ``'reg'`` is a 100-tree ``RandomForestRegressor`` with
        ``random_state=42``.
    """
    # Columns handed to the pipeline, in the order they are selected.
    feature_cols = [
        'Category', 'Sub-Category', 'Product ID', 'Sales',
        'City', 'State', 'Segment', 'Ship Mode',
        'order_month', 'order_day_of_week', 'season',
        'temperature', 'humidity', 'condition',
    ]
    # Columns that get one-hot encoded.
    cat_cols = [
        'Category', 'Sub-Category', 'Product ID', 'City', 'State',
        'Segment', 'Ship Mode', 'season', 'condition',
    ]
    # Columns passed to the regressor as-is.
    num_cols = ['Sales', 'order_month', 'order_day_of_week', 'temperature', 'humidity']

    encoder = OneHotEncoder(handle_unknown='ignore')
    column_prep = ColumnTransformer(
        transformers=[
            ('cat', encoder, cat_cols),
            ('num', 'passthrough', num_cols),
        ]
    )
    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    pipeline = Pipeline(steps=[('pre', column_prep), ('reg', forest)])

    # Pipeline.fit returns the fitted pipeline itself.
    return pipeline.fit(df[feature_cols], df['discount'])
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the training frame from train.csv, fit the
    # model, and persist it next to the script with joblib.
    print("Loading and preparing data...")
    training_frame = load_and_prepare_data('train.csv')

    print("Training model...")
    fitted_model = train_discount_model(training_frame)

    joblib.dump(fitted_model, 'discount_model.joblib')
    print("Model trained and saved as discount_model.joblib.")