Spaces:

mimo1972
/

Fight_Price_Prediction

Sleeping

App Files Files Community

mimo1972 committed on Jan 3

Commit

a0e0a43

verified ·

1 Parent(s): 136f7e9

Upload 4 files

Browse files

Files changed (4) hide show

AirFlights_HistBoost_model.pkl +2 -2
flightprice.py +230 -163
x_test.parquet +2 -2
y_test.parquet +2 -2

AirFlights_HistBoost_model.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3307968d2d458adfa9331b73d24b22e44b7a232847cf5cf5ff01245c8ec61524
-size 541810

 version https://git-lfs.github.com/spec/v1
+oid sha256:04058bf9b544483c06567ddd213884f85d552d6ef02c69e94f1bd8b6a820c580
+size 834018

flightprice.py CHANGED Viewed

@@ -1,163 +1,230 @@
-import streamlit as st
-import pandas as pd
-import numpy as np
-import joblib
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
-# Set Page Config
-st.set_page_config(page_title="Flight Price Predictor", layout="wide")
-# --- 1. Helper Functions ---
-@st.cache_data
-def load_data():
-    """Loads the test data to get unique values for dropdowns and for evaluation."""
-    x_test = pd.read_parquet('x_test.parquet')
-    y_test = pd.read_parquet('y_test.parquet')
-    return x_test, y_test
-@st.cache_resource
-def load_model():
-    """Loads the trained HistGradientBoosting model."""
-    return joblib.load('AirFlights_HistBoost_model.pkl')
-# Load Data and Model
-try:
-    x_test, y_test = load_data()
-    model = load_model()
-    # Ensure target is 1D array
-    if isinstance(y_test, pd.DataFrame):
-        y_test_series = y_test.iloc[:, 0]
-    else:
-        y_test_series = y_test
-except Exception as e:
-    st.error(f"Error loading files: {e}")
-    st.stop()
-# --- 2. Sidebar Navigation ---
-st.sidebar.title("Navigation")
-page = st.sidebar.radio("Go to", ["✈️ Predict Price", "qh Model Evaluation"])
-# --- PAGE 1: PREDICT PRICE ---
-if page == "✈️ Predict Price":
-    st.title("✈️ Flight Price Prediction")
-    st.markdown("Enter the flight details below to get an estimated price.")
-    # Create a form for user input
-    with st.form("prediction_form"):
-        col1, col2, col3 = st.columns(3)
-        # We extract unique values from x_test to populate dropdowns automatically
-        # This ensures the inputs match exactly what the model learned
-        with col1:
-            airline = st.selectbox("Airline", sorted(x_test['Airline'].unique()))
-            source = st.selectbox("Source", sorted(x_test['Source'].unique()))
-            destination = st.selectbox("Destination", sorted(x_test['Destination'].unique()))
-        with col2:
-            # Categorical Time Features
-            month = st.selectbox("Month", x_test['Month'].unique())
-            day = st.selectbox("Day", x_test['Day'].unique()) # e.g. Weekday or Day of Month
-            dept_quarter = st.selectbox("Departure Time of Day", x_test['Dept_Day_Quarter'].unique())
-        with col3:
-            # Numerical Features
-            stops = st.number_input("Total Stops", min_value=0, max_value=4, step=1, value=0)
-            duration = st.number_input("Duration (minutes)", min_value=30, max_value=3000, step=15, value=120)
-        submitted = st.form_submit_button("Predict Price")
-    if submitted:
-        # 1. Prepare Input Data
-        input_data = pd.DataFrame({
-            'Airline': [airline],
-            'Source': [source],
-            'Destination': [destination],
-            'Total_Stops': [stops],
-            'Duration_minutes': [duration],
-            'Day': [day],
-            'Month': [month],
-            'Dept_Day_Quarter': [dept_quarter]
-        })
-        # Ensure columns are in the exact same order as x_test
-        input_data = input_data[x_test.columns]
-        # 2. Predict (Model returns Log Price)
-        log_prediction = model.predict(input_data)[0]
-        # 3. Inverse Transform (Log -> Real Price)
-        real_price = np.expm1(log_prediction)
-        # 4. Display Result
-        st.success(f"Estimated Ticket Price: ₹ {real_price:,.2f}")
-        # Debug info (optional)
-        with st.expander("See processed input"):
-            st.write(input_data)
-# --- PAGE 2: MODEL EVALUATION ---
-elif page == "qh Model Evaluation":
-    st.title("qh Model Performance Report")
-    st.write("Evaluating the model on `x_test.parquet` and `y_test.parquet`.")
-    if st.button("Run Evaluation"):
-        with st.spinner("Calculating predictions..."):
-            # 1. Predict on Test Set
-            y_pred_log = model.predict(x_test)
-            # 2. Convert to Real Prices
-            y_pred_real = np.expm1(y_pred_log)
-            y_test_real = np.expm1(y_test_series)
-            # 3. Metrics
-            r2 = r2_score(y_test_series, y_pred_log) # R2 on Log scale (Model Metric)
-            r2_real = r2_score(y_test_real, y_pred_real) # R2 on Real scale (Business Metric)
-            mae = mean_absolute_error(y_test_real, y_pred_real)
-            rmse = np.sqrt(mean_squared_error(y_test_real, y_pred_real))
-        # --- Display Metrics ---
-        col1, col2, col3, col4 = st.columns(4)
-        col1.metric("R2 Score (Log)", f"{r2:.4f}")
-        col2.metric("R2 Score (Real)", f"{r2_real:.4f}")
-        col3.metric("MAE (Error)", f"₹ {mae:.0f}")
-        col4.metric("RMSE (Error)", f"₹ {rmse:.0f}")
-        st.markdown("---")
-        # --- Graphs ---
-        tab1, tab2 = st.tabs(["Actual vs Predicted", "Residuals Distribution"])
-        with tab1:
-            st.subheader("Actual Prices vs Predicted Prices")
-            fig, ax = plt.subplots(figsize=(10, 6))
-            sns.scatterplot(x=y_test_real, y=y_pred_real, alpha=0.5, color="blue", ax=ax)
-            # Perfect prediction line
-            min_val = min(y_test_real.min(), y_pred_real.min())
-            max_val = max(y_test_real.max(), y_pred_real.max())
-            ax.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2, label="Perfect Prediction")
-            ax.set_xlabel("Actual Price")
-            ax.set_ylabel("Predicted Price")
-            ax.legend()
-            st.pyplot(fig)
-        with tab2:
-            st.subheader("Residuals (Error) Distribution")
-            residuals = y_test_real - y_pred_real
-            fig, ax = plt.subplots(figsize=(10, 6))
-            sns.histplot(residuals, kde=True, color="purple", ax=ax)
-            ax.set_xlabel("Error (Actual - Predicted)")
-            ax.set_title("Are the errors centered around 0?")
-            st.pyplot(fig)
-        # --- Data Table ---
-        st.markdown("---")
-        st.subheader("Detailed Test Data & Predictions")
-        results_df = x_test.copy()
-        results_df['Actual_Price'] = y_test_real
-        results_df['Predicted_Price'] = y_pred_real
-        results_df['Difference'] = results_df['Actual_Price'] - results_df['Predicted_Price']
-        st.dataframe(results_df.head(100))

+import streamlit as st
+import pandas as pd
+import numpy as np
+import joblib
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
+# Set Page Config
+st.set_page_config(page_title="Flight Price System", layout="wide", page_icon="✈️")
+# --- 1. Helper Functions ---
+@st.cache_data
+def load_data():
+    try:
+        x_train = pd.read_parquet('x_train.parquet')
+        x_test = pd.read_parquet('x_test.parquet')
+        y_test = pd.read_parquet('y_test.parquet')
+        # CLEANING: Strip whitespace
+        for col in ['Airline', 'Source', 'Destination', 'Route']:
+            if col in x_train.columns:
+                x_train[col] = x_train[col].astype(str).str.strip()
+        return x_train, x_test, y_test
+    except Exception as e:
+        st.error(f"Error loading data files: {e}")
+        return None, None, None
+@st.cache_resource
+def load_model():
+    try:
+        return joblib.load('AirFlights_HistBoost_model.pkl')
+    except Exception as e:
+        st.error(f"Error loading model: {e}")
+        return None
+def get_day_name(day_num, month_name):
+    try:
+        date_str = f"{int(day_num)}-{month_name}-2019"
+        return pd.to_datetime(date_str, format="%d-%B-%Y").day_name()
+    except:
+        return "Monday"
+def get_day_quarter(hour):
+    if 5 <= hour < 12:
+        return 'Morning'
+    elif 12 <= hour < 17:
+        return 'Afternoon'
+    elif 17 <= hour < 21:
+        return 'Evening'
+    else:
+        return 'Night'
+# --- 2. EXHAUSTIVE AIRPORT CODE MAPPING ---
+CITY_TO_CODE = {
+    'Banglore': 'BLR',
+    'Bangalore': 'BLR',
+    'Delhi': 'DEL',
+    'New Delhi': 'DEL',
+    'Kolkata': 'CCU',
+    'Calcutta': 'CCU',
+    'Hyderabad': 'HYD',
+    'Chennai': 'MAA',
+    'Madras': 'MAA',
+    'Mumbai': 'BOM',
+    'Bombay': 'BOM',
+    'Cochin': 'COK',
+    'Kochi': 'COK',
+    'Pune': 'PNQ',
+    'Goa': 'GOI',
+    'Jaipur': 'JAI',
+    'Lucknow': 'LKO',
+    'Patna': 'PAT',
+    'Varanasi': 'VNS',
+    'Bhubaneswar': 'BBI',
+    'Nagpur': 'NAG',
+    'Trivandrum': 'TRV'
+}
+def get_code(city):
+    clean_city = city.strip()
+    return CITY_TO_CODE.get(clean_city, clean_city[:3].upper())
+def is_route_valid(route_str, source_code, dest_code):
+    if pd.isna(route_str) or route_str == 'nan':
+        return False
+    route_upper = route_str.upper()
+    parts = route_upper.replace("→", " ").replace("->", " ").split()
+    if not parts:
+        return False
+    first_stop = parts[0]
+    return (first_stop == source_code) and (dest_code in parts)
+# Load Data
+x_train, x_test, y_test = load_data()
+model = load_model()
+if x_train is None or model is None:
+    st.stop()
+if isinstance(y_test, pd.DataFrame):
+    y_test_series = y_test.iloc[:, 0]
+else:
+    y_test_series = y_test
+# Build Lookup
+route_lookup = {}
+if 'Route' in x_train.columns and 'Total_Stops' in x_train.columns:
+    temp = x_train[['Route', 'Total_Stops']].drop_duplicates(subset=['Route'])
+    route_lookup = temp.set_index('Route')['Total_Stops'].to_dict()
+# --- 3. App Layout ---
+st.sidebar.title("Navigation")
+page = st.sidebar.radio("Go to", ["💰 Price Prediction", "📊 Model Evaluation"])
+if page == "💰 Price Prediction":
+    st.title("✈️ Flight Price Prediction")
+    st.markdown("### Enter Flight Details")
+    # REMOVED st.form HERE so inputs update instantly!
+    c1, c2 = st.columns(2)
+    with c1:
+        st.subheader("Flight Info")
+        # Source (Updates instantly now)
+        source = st.selectbox("Source", sorted(x_train['Source'].unique()))
+        src_code = get_code(source)
+        st.success(f"🛫 Source Code: **{src_code}**")
+        # Destination (Updates instantly now)
+        destination = st.selectbox("Destination", sorted(x_train['Destination'].unique()))
+        dest_code = get_code(destination)
+        st.error(f"🛬 Destination Code: **{dest_code}**")
+        airline = st.selectbox("Airline", sorted(x_train['Airline'].unique()))
+    with c2:
+        st.subheader("Date & Time")
+        if 'Month' in x_train.columns:
+            months = sorted(x_train['Month'].unique())
+        else:
+            months = ['March', 'April', 'May', 'June', 'September', 'December']
+        month = st.selectbox("Month", months)
+        day_number = st.number_input("Day Number", 1, 31, 1)
+        dept_hour = st.number_input("Departure Hour", 0, 23, 10)
+    st.markdown("---")
+    st.subheader("Route Selection")
+    selected_route = None
+    stops_val = 0
+    if 'Route' in x_train.columns:
+        # 1. Get all routes
+        all_routes = sorted(x_train['Route'].unique().astype(str))
+        # 2. FILTER: Show only routes starting with the exact Source Code
+        # Now this runs immediately when you change 'Source' above
+        filtered_routes = []
+        for r in all_routes:
+            parts = r.upper().replace("→", " ").replace("->", " ").split()
+            if parts and parts[0] == src_code:
+                filtered_routes.append(r)
+        if filtered_routes:
+            selected_route_raw = st.selectbox("Select Route", filtered_routes)
+            # 3. VALIDATE
+            if is_route_valid(selected_route_raw, src_code, dest_code):
+                selected_route = selected_route_raw
+                stops_val = route_lookup.get(selected_route, 0)
+                st.metric("Total Stops", stops_val)
+                st.success("✅ Valid Route")
+            else:
+                st.warning(f"⚠️ **{selected_route_raw}** starts at **{src_code}** but does not reach **{dest_code}**. Please check your destination.")
+                selected_route = None
+        else:
+            st.error(f"No routes found starting with code **{src_code}**.")
+    else:
+        st.error("Route column missing.")
+    # We only use a button for the final prediction calculation
+    if st.button("Predict Price", type="primary"):
+        if selected_route:
+            # Prepare Input
+            day_name = get_day_name(day_number, month)
+            quarter = get_day_quarter(dept_hour)
+            input_df = pd.DataFrame({
+                'Airline': [airline], 'Source': [source], 'Destination': [destination],
+                'Month': [month], 'Route': [selected_route],
+                'Day_number': [day_number], 'Dept_hour': [dept_hour],
+                'Day': [day_name], 'Dept_Day_Quarter': [quarter],
+                'Total_Stops': [stops_val]
+            })
+            # Align Cols
+            final_input = pd.DataFrame(columns=x_train.columns)
+            for col in x_train.columns:
+                final_input.loc[0, col] = input_df.iloc[0].get(col, 0)
+            # Types
+            for col in final_input.columns:
+                if x_train[col].dtype == 'object':
+                    final_input[col] = final_input[col].astype(str)
+                else:
+                    final_input[col] = pd.to_numeric(final_input[col])
+            try:
+                pred = model.predict(final_input)[0]
+                st.success(f"### Estimated Price: ₹ {np.expm1(pred):,.2f}")
+            except Exception as e:
+                st.error(f"Error: {e}")
+        else:
+            st.error("Please select a valid route.")
+elif page == "📊 Model Evaluation":
+    st.title("Model Evaluation")
+    if st.button("Evaluate"):
+        with st.spinner("Running..."):
+            y_pred = model.predict(x_test)
+            r2 = r2_score(y_test_series, y_pred)
+            st.metric("R2 Score", f"{r2:.4f}")
+            fig, ax = plt.subplots()
+            sns.scatterplot(x=np.expm1(y_test_series), y=np.expm1(y_pred), ax=ax)
+            ax.plot([0, 80000], [0, 80000], 'r--')
+            st.pyplot(fig)

x_test.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:14b2378d0cc0c08968a3ca37404afcb40de80a12b698a41e4cb128c0037aa2e6
-size 25918

 version https://git-lfs.github.com/spec/v1
+oid sha256:008327ee502389d2aff543afc4f4a5749f7147856f9f15fd04e4d8ef744bf509
+size 7260

y_test.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9a8ed09322ef92c2d46b031a9f18c0202b5bef664652be589e4c5da5414c8c0f
-size 22385

 version https://git-lfs.github.com/spec/v1
+oid sha256:ce163df1a6d494b9d502d2caed1e8b0cd424e31ff84371bfc21853f09f2343d3
+size 3427