Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
|
|
| 8 |
from sklearn.neighbors import NearestNeighbors
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
import seaborn as sns
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Set page configuration
|
| 13 |
st.set_page_config(
|
|
@@ -21,33 +23,73 @@ GITHUB_CSV_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngin
|
|
| 21 |
GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
|
| 22 |
GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
# Define the preprocessing function
|
| 25 |
def preprocess_text(text):
|
| 26 |
return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
|
| 27 |
|
| 28 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
@st.cache_resource
|
| 30 |
-
def
|
| 31 |
try:
|
| 32 |
-
#
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
# Load
|
| 37 |
-
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
return tfidf, knn_model
|
| 41 |
except Exception as e:
|
| 42 |
st.error(f"Error loading models: {e}")
|
| 43 |
return None, None
|
| 44 |
|
| 45 |
-
# Load the dataset from
|
| 46 |
@st.cache_data
|
| 47 |
-
def
|
| 48 |
try:
|
| 49 |
-
#
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# Clean and prepare the data
|
| 53 |
df_cleaned = df_cleaned.drop_nulls(subset=['name', 'summary', 'genres'])
|
|
@@ -69,8 +111,8 @@ def load_data_from_github():
|
|
| 69 |
|
| 70 |
# Load models and data at startup - this happens only once due to caching
|
| 71 |
with st.spinner("Loading models and data (this will only happen once)..."):
|
| 72 |
-
tfidf, knn_model =
|
| 73 |
-
df_cleaned =
|
| 74 |
|
| 75 |
if tfidf is not None and knn_model is not None and df_cleaned is not None:
|
| 76 |
models_loaded = True
|
|
@@ -82,7 +124,7 @@ st.title("📚 Book Recommendation System")
|
|
| 82 |
st.markdown("Enter a book summary and genres to get personalized book recommendations!")
|
| 83 |
|
| 84 |
if not models_loaded:
|
| 85 |
-
st.error("Failed to load models or data. Please check the
|
| 86 |
else:
|
| 87 |
st.success("Models and data loaded successfully!")
|
| 88 |
|
|
@@ -184,7 +226,7 @@ st.sidebar.info(
|
|
| 184 |
The recommendations are based on textual similarity between your input and
|
| 185 |
our database of books from Goodreads.
|
| 186 |
|
| 187 |
-
Models and data are
|
| 188 |
"""
|
| 189 |
)
|
| 190 |
|
|
|
|
| 8 |
from sklearn.neighbors import NearestNeighbors
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
import seaborn as sns
|
| 11 |
+
import os
|
| 12 |
+
import time
|
| 13 |
|
| 14 |
# Set page configuration
|
| 15 |
st.set_page_config(
|
|
|
|
# Remote artifact locations in the bookRecEngine GitHub repo.
# NOTE(review): the KNN pickle is served through the Git LFS media host,
# the vectorizer through raw.githubusercontent — presumably because only
# the KNN model is LFS-tracked; confirm against the repo.
GITHUB_KNN_URL = "https://media.githubusercontent.com/media/Manithj/bookRecEngine/refs/heads/main/knn_model.pkl"
GITHUB_TFIDF_URL = "https://raw.githubusercontent.com/Manithj/bookRecEngine/main/tfidf_vectorizer.pkl"

# Local file paths for saved models and dataset; files are downloaded
# here once and reused on later runs.
MODEL_DIR = "models"
DATA_DIR = "data"
KNN_PATH = os.path.join(MODEL_DIR, "knn_model.pkl")
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
CSV_PATH = os.path.join(DATA_DIR, "goodreadsV2.csv")

# Create directories if they don't exist (idempotent thanks to exist_ok).
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)
|
| 36 |
+
|
| 37 |
# Text normalisation applied to summaries/genres before vectorisation.
def preprocess_text(text):
    """Lowercase *text* and strip every character that is neither
    alphanumeric nor whitespace (punctuation, symbols, etc.)."""
    lowered = text.lower()
    # Whitespace is preserved so token boundaries survive for TF-IDF.
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
|
| 40 |
|
| 41 |
+
# Download and save files if they don't exist locally
def download_and_save_file(url, save_path, is_binary=True):
    """Fetch *url* into *save_path* unless it is already cached locally.

    Args:
        url: Remote location of the file.
        save_path: Local destination path.
        is_binary: Write raw bytes when True, decoded text when False.

    Returns:
        True when the file exists locally (already cached or freshly
        downloaded), False when the HTTP request did not return 200.
    """
    if os.path.exists(save_path):
        return True
    with st.spinner(f"Downloading {os.path.basename(save_path)}..."):
        # Timeout prevents the app from hanging forever on a stalled host.
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            st.error(f"Failed to download from {url}, status code: {response.status_code}")
            return False
        if is_binary:
            with open(save_path, "wb") as f:
                f.write(response.content)
        else:
            # BUG FIX: original wrote raw bytes (response.content) in text
            # mode ("w"), which raises TypeError; write decoded text instead.
            with open(save_path, "w", encoding=response.encoding or "utf-8") as f:
                f.write(response.text)
        # The with-block has flushed and closed the file here, so the
        # original time.sleep(1) "wait for write" was unnecessary.
        st.success(f"Downloaded {os.path.basename(save_path)}")
    return True
|
| 57 |
+
|
| 58 |
+
# Load models from local storage, downloading them first if needed.
@st.cache_resource
def load_models():
    """Return the (tfidf, knn_model) pair, fetching the pickles on first use.

    On any failure the error is surfaced in the Streamlit UI and
    (None, None) is returned so the caller can degrade gracefully.
    """
    try:
        # Ensure both pickles exist locally before attempting to load them.
        have_tfidf = download_and_save_file(GITHUB_TFIDF_URL, TFIDF_PATH)
        have_knn = download_and_save_file(GITHUB_KNN_URL, KNN_PATH)
        if not (have_tfidf and have_knn):
            return None, None

        # SECURITY NOTE(review): pickle.load executes arbitrary code from
        # the downloaded files — acceptable only because the source repo
        # is trusted; do not point these URLs at untrusted hosts.
        loaded = []
        for path in (TFIDF_PATH, KNN_PATH):
            with open(path, 'rb') as fh:
                loaded.append(pickle.load(fh))
        tfidf, knn_model = loaded
        return tfidf, knn_model
    except Exception as e:
        st.error(f"Error loading models: {e}")
        return None, None
|
| 80 |
|
| 81 |
+
# Load the dataset from local storage or download if needed
|
| 82 |
@st.cache_data
|
| 83 |
+
def load_data():
|
| 84 |
try:
|
| 85 |
+
# Download dataset if it doesn't exist locally
|
| 86 |
+
csv_downloaded = download_and_save_file(GITHUB_CSV_URL, CSV_PATH, is_binary=True)
|
| 87 |
+
|
| 88 |
+
if not csv_downloaded:
|
| 89 |
+
return None
|
| 90 |
+
|
| 91 |
+
# Load CSV from local storage
|
| 92 |
+
df_cleaned = pl.read_csv(CSV_PATH)
|
| 93 |
|
| 94 |
# Clean and prepare the data
|
| 95 |
df_cleaned = df_cleaned.drop_nulls(subset=['name', 'summary', 'genres'])
|
|
|
|
| 111 |
|
| 112 |
# Load models and data at startup - this happens only once due to caching
# (@st.cache_resource / @st.cache_data memoise across reruns of the script).
with st.spinner("Loading models and data (this will only happen once)..."):
    tfidf, knn_model = load_models()
    df_cleaned = load_data()

# Either loader returns None on failure, so all three must be non-None.
# NOTE(review): assumes models_loaded was initialised to False earlier
# in the file (outside this view) — confirm.
if tfidf is not None and knn_model is not None and df_cleaned is not None:
    models_loaded = True
|
|
|
|
| 124 |
st.markdown("Enter a book summary and genres to get personalized book recommendations!")
|
| 125 |
|
| 126 |
# Surface the load outcome to the user before rendering the rest of the UI.
if models_loaded:
    st.success("Models and data loaded successfully!")
else:
    st.error("Failed to load models or data. Please check the file paths and URLs.")
|
| 130 |
|
|
|
|
| 226 |
The recommendations are based on textual similarity between your input and
|
| 227 |
our database of books from Goodreads.
|
| 228 |
|
| 229 |
+
Models and data are stored locally on the server after initial download.
|
| 230 |
"""
|
| 231 |
)
|
| 232 |
|