Spaces:

steviel
/

ICML2025

Runtime error

App Files Community

stefanoviel committed on Jul 8, 2025

Commit

1a67af9

1 Parent(s): beda8ad

caching

Browse files

Files changed (1) hide show

src/streamlit_app.py +30 -37

src/streamlit_app.py CHANGED Viewed

@@ -20,12 +20,13 @@ CSV_FILE = 'papers_with_abstracts_parallel.csv'
 # --- Caching Functions ---
 def load_embedding_model():
     """Loads the Sentence Transformer model and caches it."""
     return SentenceTransformer(EMBEDDING_MODEL)
 def load_spell_checker():
     """Loads the SpellChecker object and caches it."""
     return SpellChecker()
@@ -34,54 +35,56 @@ def load_spell_checker():
 def create_and_save_embeddings(model, data_df):
     """
     Generates and saves document embeddings and the dataframe.
-    This function is called only once if the files don't exist.
     """
     st.info("First time setup: Generating and saving embeddings. This may take a moment...")
-    # Combine title and abstract for richer embeddings
-    data_df['text_to_embed'] = data_df['title'] + ". " + data_df['abstract'].fillna('')
-    # Generate embeddings
-    corpus_embeddings = model.encode(data_df['text_to_embed'].tolist(), convert_to_tensor=True, show_progress_bar=True)
-    # Save embeddings and dataframe to /tmp directory
     try:
         torch.save(corpus_embeddings, EMBEDDINGS_FILE)
         data_df.to_pickle(DATA_FILE)
-        st.success("Embeddings and data saved successfully!")
     except Exception as e:
-        st.warning(f"Could not save embeddings to disk: {e}. Will regenerate on each session.")
     return corpus_embeddings, data_df
 def load_data_and_embeddings():
     """
-    Loads the saved embeddings and dataframe from disk.
-    If files don't exist, it calls the creation function.
     """
     model = load_embedding_model()
-    # Check if files exist and are readable
-    if os.path.exists(EMBEDDINGS_FILE) and os.path.exists(DATA_FILE):
         try:
-            corpus_embeddings = torch.load(EMBEDDINGS_FILE)
             data_df = pd.read_pickle(DATA_FILE)
             return model, corpus_embeddings, data_df
         except Exception as e:
-            st.warning(f"Could not load saved embeddings: {e}. Regenerating...")
-    # Load the raw data from CSV
     try:
         data_df = pd.read_csv(CSV_FILE)
         corpus_embeddings, data_df = create_and_save_embeddings(model, data_df)
     except FileNotFoundError:
-        st.error(f"CSV file '{CSV_FILE}' not found. Please ensure it's in your repository.")
         st.stop()
     except Exception as e:
-        st.error(f"Error loading data: {e}")
         st.stop()
     return model, corpus_embeddings, data_df
 def correct_query_spelling(query, spell_checker):
     """
     Corrects potential spelling mistakes in the user's query.
@@ -150,12 +153,13 @@ The search is performed by comparing the semantic meaning of your query with the
 Spelling mistakes in your query will be automatically corrected.
 """)
-# Load all necessary data
 try:
     model, corpus_embeddings, data_df = load_data_and_embeddings()
     spell_checker = load_spell_checker()
-    # --- User Inputs: Search Bar and Slider ---
     col1, col2 = st.columns([4, 1])
     with col1:
         search_query = st.text_input(
@@ -166,37 +170,26 @@ try:
         top_k_results = st.number_input(
             "Number of results",
             min_value=1,
-            max_value=100, # Set a reasonable max
             value=10,
             help="Select the number of top results to display."
         )
     if search_query:
-        # --- Perform Typo Correction ---
         corrected_query = correct_query_spelling(search_query, spell_checker)
-        # If a correction was made, notify the user
         if corrected_query.lower() != search_query.lower():
             st.info(f"Did you mean: **{corrected_query}**? \n\n*Showing results for the corrected query.*")
-        final_query = corrected_query
-        # --- Perform Search ---
-        search_results = semantic_search(final_query, model, corpus_embeddings, data_df, top_k=top_k_results)
-        st.subheader(f"Found {len(search_results)} results for '{final_query}'")
-        # --- Display Results ---
         if search_results:
             for result in search_results:
                 with st.container(border=True):
-                    # Title as a clickable link
                     st.markdown(f"### [{result['title']}]({result['link']})")
-                    # Authors
                     st.caption(f"**Authors:** {result['authors']}")
-                    # Expander for the abstract
                     if pd.notna(result['abstract']):
                         with st.expander("View Abstract"):
                             st.write(result['abstract'])
@@ -204,5 +197,5 @@ try:
             st.warning("No results found. Try a different query.")
 except Exception as e:
-    st.error(f"An error occurred: {e}")
     st.info("Please ensure all required libraries are installed and the CSV file is present in your repository.")

 # --- Caching Functions ---
+# --- Caching Functions (Unchanged but crucial) ---
+@st.cache_resource
 def load_embedding_model():
     """Loads the Sentence Transformer model and caches it."""
     return SentenceTransformer(EMBEDDING_MODEL)
+@st.cache_resource
 def load_spell_checker():
     """Loads the SpellChecker object and caches it."""
     return SpellChecker()
 def create_and_save_embeddings(model, data_df):
     """
     Generates and saves document embeddings and the dataframe.
+    This function is called only once if the files don't exist in the persistent directory.
     """
     st.info("First time setup: Generating and saving embeddings. This may take a moment...")
+    data_df['text_to_embed'] = data_df['title'].fillna('') + ". " + data_df['abstract'].fillna('')
+    corpus_embeddings = model.encode(
+        data_df['text_to_embed'].tolist(),
+        convert_to_tensor=True,
+        show_progress_bar=True
+    )
     try:
         torch.save(corpus_embeddings, EMBEDDINGS_FILE)
         data_df.to_pickle(DATA_FILE)
+        st.success("Embeddings and data saved successfully for future sessions!")
     except Exception as e:
+        st.warning(f"Could not save embeddings to persistent storage: {e}. Will regenerate on next session.")
     return corpus_embeddings, data_df
+@st.cache_data
 def load_data_and_embeddings():
     """
+    Loads data and embeddings. It first tries to load from the persistent directory.
+    If files don't exist, it creates them. The results are cached for the current session.
     """
     model = load_embedding_model()
+    if DATA_FILE.exists() and EMBEDDINGS_FILE.exists():
         try:
             data_df = pd.read_pickle(DATA_FILE)
+            corpus_embeddings = torch.load(EMBEDDINGS_FILE)
             return model, corpus_embeddings, data_df
         except Exception as e:
+            st.warning(f"Could not load saved files: {e}. Regenerating...")
+    # Fallback to creating embeddings if they don't exist
     try:
         data_df = pd.read_csv(CSV_FILE)
         corpus_embeddings, data_df = create_and_save_embeddings(model, data_df)
     except FileNotFoundError:
+        st.error(f"The required data file '{CSV_FILE}' was not found. Please make sure it's in your repository root.")
         st.stop()
     except Exception as e:
+        st.error(f"An unexpected error occurred while loading data: {e}")
         st.stop()
     return model, corpus_embeddings, data_df
+# --- Query helpers: `correct_query_spelling` and `semantic_search` (unchanged by this commit) ---
 def correct_query_spelling(query, spell_checker):
     """
     Corrects potential spelling mistakes in the user's query.
 Spelling mistakes in your query will be automatically corrected.
 """)
+# --- App Logic ---
 try:
+    # Load all necessary data using the corrected function
     model, corpus_embeddings, data_df = load_data_and_embeddings()
     spell_checker = load_spell_checker()
+    # --- User Inputs ---
     col1, col2 = st.columns([4, 1])
     with col1:
         search_query = st.text_input(
         top_k_results = st.number_input(
             "Number of results",
             min_value=1,
+            max_value=100,
             value=10,
             help="Select the number of top results to display."
         )
     if search_query:
         corrected_query = correct_query_spelling(search_query, spell_checker)
         if corrected_query.lower() != search_query.lower():
             st.info(f"Did you mean: **{corrected_query}**? \n\n*Showing results for the corrected query.*")
+        search_results = semantic_search(corrected_query, model, corpus_embeddings, data_df, top_k=top_k_results)
+        st.subheader(f"Found {len(search_results)} results for '{corrected_query}'")
         if search_results:
             for result in search_results:
                 with st.container(border=True):
                     st.markdown(f"### [{result['title']}]({result['link']})")
                     st.caption(f"**Authors:** {result['authors']}")
                     if pd.notna(result['abstract']):
                         with st.expander("View Abstract"):
                             st.write(result['abstract'])
             st.warning("No results found. Try a different query.")
 except Exception as e:
+    st.error(f"An error occurred during app execution: {e}")
     st.info("Please ensure all required libraries are installed and the CSV file is present in your repository.")