Upload 5 files
- README.md +34 -0
- remove_recent_data.py +60 -0
- requirements.txt +15 -0
- streamlit_app.py +249 -0
- update_all_models.py +225 -0
README.md
ADDED
@@ -0,0 +1,34 @@
# AgriPredict (Refactor)

This repository contains a refactored, modularized version of the Streamlit-based AgriPredict dashboard.

Structure:

- `src/agri_predict` - package with modules:
  - `config.py` - env & MongoDB connection helpers
  - `constants.py` - shared constants (state/market mapping)
  - `features.py` - feature engineering functions
  - `data.py` - data access, preprocessing and scraping helpers
  - `models.py` - model training, grid search and forecasting
  - `plotting.py` - plotting and download helpers
  - `utils.py` - authentication and utility functions
- `streamlit_app.py` - Streamlit entrypoint
- `requirements.txt` - Python dependencies

Run locally:

1. Create a virtualenv and install dependencies:

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

2. Set `MONGO_URI` in a `.env` file at project root.

3. Start the app:

```bash
streamlit run streamlit_app.py
```
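Note that the `src/agri_predict` package itself (including `config.py`) is not part of this upload, so the connection helpers are not shown. Purely as an illustration, a minimal `get_collections()` consistent with how the scripts below call it might look like the sketch that follows; the database and collection names are assumptions, while the returned keys (`collection`, `impExp`, `users_collection`, `best_params_collection*`) mirror the callers in this upload.

```python
# Hypothetical sketch of src/agri_predict/config.py (not included in this upload).
# Database and collection names are assumptions; only the returned keys are
# inferred from the scripts below.
import os

import certifi
from dotenv import load_dotenv
from pymongo import MongoClient


def get_collections():
    """Connect to MongoDB using MONGO_URI from .env and return named collections."""
    load_dotenv()  # reads MONGO_URI from the .env file at project root
    uri = os.getenv("MONGO_URI")
    if not uri:
        raise ValueError("MONGO_URI is not set; add it to a .env file at project root")

    client = MongoClient(uri, tlsCAFile=certifi.where())
    db = client["agri_predict"]  # assumed database name
    return {
        "collection": db["mandi_prices"],            # daily market records
        "impExp": db["import_export"],               # import/export data
        "users_collection": db["users"],             # login credentials
        "best_params_collection": db["best_params_14d"],
        "best_params_collection_1m": db["best_params_1m"],
        "best_params_collection_3m": db["best_params_3m"],
    }
```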
remove_recent_data.py
ADDED
@@ -0,0 +1,60 @@
"""Script to remove data after October 25, 2025 from MongoDB for testing the scraper."""

from datetime import datetime
from src.agri_predict.config import get_collections


def remove_data_after_date(cutoff_date_str="2025-10-25"):
    """Remove all data after the specified date.

    Args:
        cutoff_date_str: Date string in format YYYY-MM-DD
    """
    cutoff_date = datetime.strptime(cutoff_date_str, "%Y-%m-%d")

    cols = get_collections()
    collection = cols['collection']

    # Count documents before deletion
    before_count = collection.count_documents({})
    after_cutoff_count = collection.count_documents({
        "Reported Date": {"$gt": cutoff_date}
    })

    print("📊 Database Status:")
    print(f"   Total documents: {before_count}")
    print(f"   Documents after {cutoff_date_str}: {after_cutoff_count}")

    if after_cutoff_count == 0:
        print(f"✅ No documents found after {cutoff_date_str}")
        return

    # Delete documents
    result = collection.delete_many({
        "Reported Date": {"$gt": cutoff_date}
    })

    print("\n🗑️ Deletion Results:")
    print(f"   Deleted {result.deleted_count} documents")

    # Verify deletion
    remaining_count = collection.count_documents({})
    latest_doc = collection.find_one(sort=[("Reported Date", -1)])

    print("\n✅ After Deletion:")
    print(f"   Total documents: {remaining_count}")
    if latest_doc:
        latest_date = latest_doc.get("Reported Date")
        print(f"   Latest date in database: {latest_date.strftime('%Y-%m-%d') if latest_date else 'Unknown'}")
    else:
        print("   Database is empty")

if __name__ == "__main__":
    cutoff = "2025-10-10"

    print("="*60)
    print(f"🧹 Cleaning MongoDB Data After {cutoff}")
    print("="*60 + "\n")

    remove_data_after_date(cutoff)

    print("\n" + "="*60)
    print("✅ Cleanup Complete - Ready to test scraper!")
    print("="*60)
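For ad-hoc testing, the helper can also be imported and called with an explicit cutoff instead of the one hardcoded in `__main__`; a minimal example (run from the project root so the import resolves):

```python
# Run the same cleanup with a cutoff supplied at the call site.
from remove_recent_data import remove_data_after_date

remove_data_after_date("2025-10-25")  # deletes documents with "Reported Date" > 2025-10-25
```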
requirements.txt
ADDED
@@ -0,0 +1,15 @@
streamlit
pandas
numpy
scikit-learn
xgboost
pymongo
python-dotenv
plotly
certifi
werkzeug
statsmodels
openpyxl
xlsxwriter
tqdm
requests
streamlit_app.py
ADDED
@@ -0,0 +1,249 @@
"""Streamlit entrypoint for AgriPredict (refactored).

Run with: `streamlit run streamlit_app.py` from project root.
"""
import streamlit as st
from datetime import datetime, timedelta
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

from src.agri_predict import (
    fetch_and_process_data,
    fetch_and_store_data,
    preprocess_data,
    train_and_forecast,
    forecast,
    collection_to_dataframe,
    get_dataframe_from_collection,
)
from src.agri_predict.constants import state_market_dict
from src.agri_predict.utils import authenticate_user
from src.agri_predict.config import get_collections


st.set_page_config(layout="wide")


@st.cache_resource
def get_cached_collections():
    """Cache MongoDB collections to avoid reconnecting on every page load."""
    return get_collections()


st.markdown("""
<style>
.main { max-width: 1200px; margin: 0 auto; }
h1 { color: #4CAF50; font-family: 'Arial Black', sans-serif; }
.stButton>button { background-color: #4CAF50; color: white; }
</style>
""", unsafe_allow_html=True)

if 'authenticated' not in st.session_state:
    st.session_state.authenticated = False

if st.session_state.authenticated:
    # Get cached collections only after authentication
    try:
        cols = get_cached_collections()
    except Exception as exc:
        st.error(f"Configuration error: {exc}")
        st.stop()

    collection = cols['collection']
    impExp = cols['impExp']

    st.title("🌾 AgriPredict Dashboard")
    if st.button("Get Live Data Feed"):
        fetch_and_store_data()

    view_mode = st.radio("View Mode", ["Statistics", "Plots", "Predictions", "Exim"], horizontal=True, label_visibility="collapsed")

    if view_mode == "Plots":
        st.sidebar.header("Filters")
        selected_period = st.sidebar.selectbox("Select Time Period", ["2 Weeks", "1 Month", "3 Months", "1 Year", "2 Years", "5 Years"], index=1)
        period_mapping = {"2 Weeks": 14, "1 Month": 30, "3 Months": 90, "1 Year": 365, "2 Years": 730, "5 Years": 1825}
        st.session_state.selected_period = period_mapping[selected_period]

        state_options = list(state_market_dict.keys()) + ['India']
        selected_state = st.sidebar.selectbox("Select State", state_options)

        market_wise = False
        if selected_state != 'India':
            market_wise = st.sidebar.checkbox("Market Wise Analysis")
            if market_wise:
                markets = state_market_dict.get(selected_state, [])
                selected_market = st.sidebar.selectbox("Select Market", markets)
                query_filter = {"State Name": selected_state, "Market Name": selected_market}
            else:
                query_filter = {"State Name": selected_state}
        else:
            query_filter = {}

        data_type = st.sidebar.radio("Select Data Type", ["Price", "Volume", "Both"])
        query_filter["Reported Date"] = {"$gte": datetime.now() - timedelta(days=st.session_state.selected_period)}

        if st.sidebar.button("✨ Let's go!"):
            try:
                cursor = collection.find(query_filter)
                data = list(cursor)
                if data:
                    df = pd.DataFrame(data)
                    df['Reported Date'] = pd.to_datetime(df['Reported Date'])
                    df_grouped = df.groupby('Reported Date', as_index=False).agg({'Arrivals (Tonnes)': 'sum', 'Modal Price (Rs./Quintal)': 'mean'})
                    date_range = pd.date_range(start=df_grouped['Reported Date'].min(), end=df_grouped['Reported Date'].max())
                    df_grouped = df_grouped.set_index('Reported Date').reindex(date_range).rename_axis('Reported Date').reset_index()
                    df_grouped['Arrivals (Tonnes)'] = df_grouped['Arrivals (Tonnes)'].ffill().bfill()
                    df_grouped['Modal Price (Rs./Quintal)'] = df_grouped['Modal Price (Rs./Quintal)'].ffill().bfill()
                    st.subheader(f"📈 Trends for {selected_state} ({'Market: ' + selected_market if market_wise else 'State'})")
                    if data_type == "Both":
                        scaler = MinMaxScaler()
                        df_grouped[['Scaled Price', 'Scaled Arrivals']] = scaler.fit_transform(df_grouped[['Modal Price (Rs./Quintal)', 'Arrivals (Tonnes)']])
                        import plotly.graph_objects as go
                        fig = go.Figure()
                        fig.add_trace(go.Scatter(x=df_grouped['Reported Date'], y=df_grouped['Scaled Price'], mode='lines', name='Scaled Price', line=dict(width=1, color='green')))
                        fig.add_trace(go.Scatter(x=df_grouped['Reported Date'], y=df_grouped['Scaled Arrivals'], mode='lines', name='Scaled Arrivals', line=dict(width=1, color='blue')))
                        fig.update_layout(title="Price and Arrivals Trend", xaxis_title='Date', yaxis_title='Scaled Values', template='plotly_white')
                        st.plotly_chart(fig, width='stretch')
                    elif data_type == "Price":
                        import plotly.graph_objects as go
                        fig = go.Figure()
                        fig.add_trace(go.Scatter(x=df_grouped['Reported Date'], y=df_grouped['Modal Price (Rs./Quintal)'], mode='lines', name='Modal Price', line=dict(width=1, color='green')))
                        fig.update_layout(title="Modal Price Trend", xaxis_title='Date', yaxis_title='Price (Rs./Quintal)', template='plotly_white')
                        st.plotly_chart(fig, width='stretch')
                    else:
                        import plotly.graph_objects as go
                        fig = go.Figure()
                        fig.add_trace(go.Scatter(x=df_grouped['Reported Date'], y=df_grouped['Arrivals (Tonnes)'], mode='lines', name='Arrivals', line=dict(width=1, color='blue')))
                        fig.update_layout(title="Arrivals Trend", xaxis_title='Date', yaxis_title='Volume (in Tonnes)', template='plotly_white')
                        st.plotly_chart(fig, width='stretch')
                else:
                    st.warning("⚠️ No data found for the selected filters.")
            except Exception as e:
                st.error(f"❌ Error fetching data: {e}")

    elif view_mode == "Predictions":
        st.subheader("📊 Model Analysis")
        sub_option = st.radio("Select one of the following", ["India", "States", "Market"], horizontal=True)
        sub_timeline = st.radio("Select one of the following horizons", ["14 days", "1 month", "3 month"], horizontal=True)
        if sub_option == "States":
            states = ["Karnataka", "Madhya Pradesh", "Gujarat", "Uttar Pradesh", "Telangana"]
            selected_state = st.selectbox("Select State for Model Training", states)
            filter_key = f"state_{selected_state}"
            if st.button("Train and Forecast"):
                query_filter = {"State Name": selected_state}
                df = fetch_and_process_data(query_filter)
                if sub_timeline == "14 days":
                    train_and_forecast(df, filter_key, 14)
                elif sub_timeline == "1 month":
                    train_and_forecast(df, filter_key, 30)
                else:
                    train_and_forecast(df, filter_key, 90)
            if st.button("Forecast"):
                query_filter = {"State Name": selected_state}
                df = fetch_and_process_data(query_filter)
                if sub_timeline == "14 days":
                    forecast(df, filter_key, 14)
                elif sub_timeline == "1 month":
                    forecast(df, filter_key, 30)
                else:
                    forecast(df, filter_key, 90)
        elif sub_option == "Market":
            market_options = ["Rajkot", "Gondal", "Kalburgi", "Amreli"]
            selected_market = st.selectbox("Select Market for Model Training", market_options)
            filter_key = f"market_{selected_market}"
            if st.button("Train and Forecast"):
                query_filter = {"Market Name": selected_market}
                df = fetch_and_process_data(query_filter)
                if sub_timeline == "14 days":
                    train_and_forecast(df, filter_key, 14)
                elif sub_timeline == "1 month":
                    train_and_forecast(df, filter_key, 30)
                else:
                    train_and_forecast(df, filter_key, 90)
            if st.button("Forecast"):
                query_filter = {"Market Name": selected_market}
                df = fetch_and_process_data(query_filter)
                if sub_timeline == "14 days":
                    forecast(df, filter_key, 14)
                elif sub_timeline == "1 month":
                    forecast(df, filter_key, 30)
                else:
                    forecast(df, filter_key, 90)
        elif sub_option == "India":
            df = collection_to_dataframe(impExp)
            if st.button("Train and Forecast"):
                query_filter = {}
                df = fetch_and_process_data(query_filter)
                if sub_timeline == "14 days":
                    train_and_forecast(df, "India", 14)
                elif sub_timeline == "1 month":
                    train_and_forecast(df, "India", 30)
                else:
                    train_and_forecast(df, "India", 90)
            if st.button("Forecast"):
                query_filter = {}
                df = fetch_and_process_data(query_filter)
                if sub_timeline == "14 days":
                    forecast(df, "India", 14)
                elif sub_timeline == "1 month":
                    forecast(df, "India", 30)
                else:
                    forecast(df, "India", 90)

    elif view_mode == "Statistics":
        document = collection.find_one()
        df = get_dataframe_from_collection(collection)
        from src.agri_predict.plotting import display_statistics
        display_statistics(df)
    elif view_mode == "Exim":
        df = collection_to_dataframe(impExp)
        plot_option = st.radio("Select the data to visualize:", ["Import Price", "Import Quantity", "Export Price", "Export Quantity"], horizontal=True)
        time_period = st.selectbox("Select time period:", ["1 Month", "6 Months", "1 Year", "2 Years"])
        df["Reported Date"] = pd.to_datetime(df["Reported Date"], format="%Y-%m-%d")
        if time_period == "1 Month":
            start_date = pd.Timestamp.now() - pd.DateOffset(months=1)
        elif time_period == "6 Months":
            start_date = pd.Timestamp.now() - pd.DateOffset(months=6)
        elif time_period == "1 Year":
            start_date = pd.Timestamp.now() - pd.DateOffset(years=1)
        else:
            start_date = pd.Timestamp.now() - pd.DateOffset(years=2)
        filtered_df = df[df["Reported Date"] >= start_date]
        if plot_option == "Import Price":
            grouped_df = filtered_df.groupby("Reported Date", as_index=False)["VALUE_IMPORT"].mean().rename(columns={"VALUE_IMPORT": "Average Import Price"})
            y_axis_label = "Average Import Price (Rs.)"
        elif plot_option == "Import Quantity":
            grouped_df = filtered_df.groupby("Reported Date", as_index=False)["QUANTITY_IMPORT"].sum().rename(columns={"QUANTITY_IMPORT": "Total Import Quantity"})
            y_axis_label = "Total Import Quantity (Tonnes)"
        elif plot_option == "Export Price":
            grouped_df = filtered_df.groupby("Reported Date", as_index=False)["VALUE_EXPORT"].mean().rename(columns={"VALUE_EXPORT": "Average Export Price"})
            y_axis_label = "Average Export Price (Rs.)"
        else:
            grouped_df = filtered_df.groupby("Reported Date", as_index=False)["QUANTITY_EXPORT"].sum().rename(columns={"QUANTITY_EXPORT": "Total Export Quantity"})
            y_axis_label = "Total Export Quantity (Tonnes)"
        import plotly.express as px
        fig = px.line(grouped_df, x="Reported Date", y=grouped_df.columns[1], title=f"{plot_option} Over Time", labels={"Reported Date": "Date", grouped_df.columns[1]: y_axis_label})
        st.plotly_chart(fig)

else:
    with st.form("login_form"):
        st.subheader("Please log in")
        username = st.text_input("Username")
        password = st.text_input("Password", type="password")
        login_button = st.form_submit_button("Login")
        if login_button:
            # Get cached collections for authentication
            try:
                cols = get_cached_collections()
                users_collection = cols['users_collection']
            except Exception as exc:
                st.error(f"Database connection error: {exc}")
                st.stop()

            if authenticate_user(username, password, users_collection):
                st.session_state.authenticated = True
                st.session_state['username'] = username
                st.write("Login successful!")
                st.rerun()
            else:
                st.error("Invalid username or password")
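For reference, the queries and group-bys above assume documents shaped roughly as follows. The field names are taken from the code; the values are invented, and `QUANTITY_EXPORT` is assumed as the counterpart of the import fields.

```python
from datetime import datetime

# Illustrative document shapes inferred from the field names used above; values are made up.
sample_price_record = {
    "Reported Date": datetime(2025, 10, 1),   # stored as a BSON datetime (queried with $gte/$gt)
    "State Name": "Gujarat",
    "Market Name": "Rajkot",
    "Arrivals (Tonnes)": 812.5,
    "Modal Price (Rs./Quintal)": 5430.0,
}

sample_exim_record = {
    "Reported Date": "2025-10-01",             # parsed with pd.to_datetime(format="%Y-%m-%d")
    "VALUE_IMPORT": 61200.0,
    "QUANTITY_IMPORT": 145.0,
    "VALUE_EXPORT": 58800.0,
    "QUANTITY_EXPORT": 230.0,                  # assumed counterpart of QUANTITY_IMPORT
}
```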
update_all_models.py
ADDED
@@ -0,0 +1,225 @@
"""
Script to train and update all models for India, States, and Markets.
Run this script to update all forecasting models without using the UI.
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from tqdm import tqdm

from src.agri_predict import fetch_and_process_data
from src.agri_predict.constants import state_market_dict
from src.agri_predict.features import (
    create_forecasting_features,
    create_forecasting_features_1m,
    create_forecasting_features_3m,
)
from src.agri_predict.config import get_collections

# Define forecast horizons
FORECAST_HORIZONS = [14, 30, 90]  # 14 days, 1 month, 3 months


def train_model_batch(df, filter_key, days):
    """Train model without UI components for batch processing."""
    cols = get_collections()

    # Select feature creation function based on horizon
    if days == 14:
        df_features = create_forecasting_features(df)
        split_date = '2024-01-01'
        collection_key = 'best_params_collection'
    elif days == 30:
        df_features = create_forecasting_features_1m(df)
        split_date = '2023-01-01'
        collection_key = 'best_params_collection_1m'
    else:  # 90 days
        df_features = create_forecasting_features_3m(df)
        split_date = '2023-01-01'
        collection_key = 'best_params_collection_3m'

    # Split data
    train_df = df_features[df_features['Reported Date'] < split_date]
    test_df = df_features[df_features['Reported Date'] >= split_date]

    X_train = train_df.drop(columns=['Modal Price (Rs./Quintal)', 'Reported Date'])
    y_train = train_df['Modal Price (Rs./Quintal)']
    X_test = test_df.drop(columns=['Modal Price (Rs./Quintal)', 'Reported Date'])
    y_test = test_df['Modal Price (Rs./Quintal)']

    # Hyperparameter tuning with progress bar
    param_grid = {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100, 150],
        'booster': ['gbtree', 'dart']
    }

    model = XGBRegressor()
    best_score = float('-inf')
    best_params = None

    total_combinations = len(param_grid['learning_rate']) * len(param_grid['max_depth']) * \
                         len(param_grid['n_estimators']) * len(param_grid['booster'])

    with tqdm(total=total_combinations, desc="Tuning hyperparameters") as pbar:
        for learning_rate in param_grid['learning_rate']:
            for max_depth in param_grid['max_depth']:
                for n_estimators in param_grid['n_estimators']:
                    for booster in param_grid['booster']:
                        model.set_params(
                            learning_rate=learning_rate,
                            max_depth=max_depth,
                            n_estimators=n_estimators,
                            booster=booster
                        )
                        model.fit(X_train, y_train)
                        score = model.score(X_test, y_test)
                        if score > best_score:
                            best_score = score
                            best_params = {
                                'learning_rate': learning_rate,
                                'max_depth': max_depth,
                                'n_estimators': n_estimators,
                                'booster': booster
                            }
                        pbar.update(1)

    # Train final model with best params
    best_model = XGBRegressor(**best_params)
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)

    # Save to MongoDB (cast metrics to plain floats so BSON can encode them)
    cols[collection_key].replace_one(
        {'filter_key': filter_key},
        {
            **best_params,
            'filter_key': filter_key,
            'last_updated': pd.Timestamp.now().isoformat(),
            'rmse': float(rmse),
            'mae': float(mae),
            'score': float(best_score)
        },
        upsert=True
    )

    return best_params, rmse, mae

def update_india_models():
    """Update models for all of India."""
    print("\n" + "="*60)
    print("UPDATING INDIA MODELS")
    print("="*60)

    query_filter = {}
    df = fetch_and_process_data(query_filter)

    if df is not None:
        for days in FORECAST_HORIZONS:
            horizon_name = "14 days" if days == 14 else "1 month" if days == 30 else "3 months"
            print(f"\n[India] Training {horizon_name} forecast model...")
            try:
                best_params, rmse, mae = train_model_batch(df, "India", days)
                print(f"✅ [India] {horizon_name} model updated successfully")
                print(f"   RMSE: {rmse:.2f}, MAE: {mae:.2f}")
            except Exception as e:
                print(f"❌ [India] Error updating {horizon_name} model: {e}")
    else:
        print("❌ [India] No data available")


def update_state_models():
    """Update models for all states."""
    print("\n" + "="*60)
    print("UPDATING STATE MODELS")
    print("="*60)

    states = ["Karnataka", "Madhya Pradesh", "Gujarat", "Uttar Pradesh", "Telangana"]

    for state in states:
        print(f"\n--- Processing State: {state} ---")
        query_filter = {"State Name": state}
        df = fetch_and_process_data(query_filter)

        if df is not None:
            filter_key = f"state_{state}"
            for days in FORECAST_HORIZONS:
                horizon_name = "14 days" if days == 14 else "1 month" if days == 30 else "3 months"
                print(f"[{state}] Training {horizon_name} forecast model...")
                try:
                    best_params, rmse, mae = train_model_batch(df, filter_key, days)
                    print(f"✅ [{state}] {horizon_name} model updated successfully")
                    print(f"   RMSE: {rmse:.2f}, MAE: {mae:.2f}")
                except Exception as e:
                    print(f"❌ [{state}] Error updating {horizon_name} model: {e}")
        else:
            print(f"❌ [{state}] No data available")


def update_market_models():
    """Update models for specific markets."""
    print("\n" + "="*60)
    print("UPDATING MARKET MODELS")
    print("="*60)

    markets = ["Rajkot", "Gondal", "Kalburgi", "Amreli"]

    for market in markets:
        print(f"\n--- Processing Market: {market} ---")
        query_filter = {"Market Name": market}
        df = fetch_and_process_data(query_filter)

        if df is not None:
            filter_key = f"market_{market}"
            for days in FORECAST_HORIZONS:
                horizon_name = "14 days" if days == 14 else "1 month" if days == 30 else "3 months"
                print(f"[{market}] Training {horizon_name} forecast model...")
                try:
                    best_params, rmse, mae = train_model_batch(df, filter_key, days)
                    print(f"✅ [{market}] {horizon_name} model updated successfully")
                    print(f"   RMSE: {rmse:.2f}, MAE: {mae:.2f}")
                except Exception as e:
                    print(f"❌ [{market}] Error updating {horizon_name} model: {e}")
        else:
            print(f"❌ [{market}] No data available")


def main():
    """Main function to update all models."""
    print("\n" + "🌾" * 30)
    print("AGRIPREDICT - BATCH MODEL UPDATE")
    print("🌾" * 30)
    print("\nThis script will train and update all forecasting models.")
    print("This may take several minutes to complete.\n")

    try:
        # Update India models
        update_india_models()

        # Update State models
        update_state_models()

        # Update Market models
        update_market_models()

        print("\n" + "="*60)
        print("✅ ALL MODELS UPDATED SUCCESSFULLY")
        print("="*60)

    except KeyboardInterrupt:
        print("\n\n⚠️ Process interrupted by user")
    except Exception as e:
        print(f"\n\n❌ Fatal error: {e}")


if __name__ == "__main__":
    main()
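Each horizon writes its tuned parameters (plus RMSE, MAE and a timestamp) into its own collection, keyed by `filter_key`. The `forecast` and `train_and_forecast` helpers in `src/agri_predict/models.py` are not part of this upload; purely as an illustration, reading a stored parameter set back and rebuilding the model could look like the sketch below (the function name and usage are assumptions, only the stored field names come from `train_model_batch`).

```python
# Hypothetical consumer of the parameters saved by train_model_batch; the real
# logic lives in src/agri_predict/models.py, which is not included here.
from xgboost import XGBRegressor

from src.agri_predict.config import get_collections


def load_tuned_model(filter_key, collection_key="best_params_collection"):
    """Rebuild an XGBRegressor from the tuned parameters stored in MongoDB."""
    cols = get_collections()
    doc = cols[collection_key].find_one({"filter_key": filter_key})
    if doc is None:
        raise ValueError(f"No tuned parameters stored for {filter_key!r}")

    params = {k: doc[k] for k in ("learning_rate", "max_depth", "n_estimators", "booster")}
    return XGBRegressor(**params)  # must still be fit on current data before predicting
```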