Spaces:

Zwounds
/

LibraryRAG

Sleeping

App Files Files Community

Zwounds committed on Apr 1, 2025

Commit

d93b2e5

verified ·

1 Parent(s): 93c51f9

Upload app.py

Browse files

Files changed (1) hide show

app.py +107 -88

app.py CHANGED Viewed

@@ -12,6 +12,8 @@ from tqdm import tqdm
 from datasets import load_dataset
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 # --- Page Config (MUST BE FIRST Streamlit call) ---
 st.set_page_config(layout="wide")
@@ -88,14 +90,12 @@ def load_dataset_from_hf():
         df = pd.read_parquet(parquet_path)
         logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
-        # Verify required columns
         required_cols = ['id', 'document', 'embedding', 'metadata']
         if not all(col in df.columns for col in required_cols):
             st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
             logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
-            return None # Return None on error
-        # Ensure embeddings are lists of floats
         logging.info("Ensuring embeddings are in list format...")
         if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
              df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
@@ -111,7 +111,7 @@ def load_dataset_from_hf():
         if df.empty:
             st.error("No valid data loaded from the dataset after processing embeddings.")
             logging.error("DataFrame empty after embedding processing.")
-            return None # Return None on error
         return df
@@ -122,7 +122,7 @@ def load_dataset_from_hf():
         st.error(f"Failed to load data from dataset: {e}")
         logging.exception(f"An unexpected error occurred during data load: {e}")
-    return None # Return None on any error
 # --- Initialize Clients and Models ---
 generation_client = initialize_hf_client()
@@ -130,90 +130,112 @@ embedding_model = load_local_embedding_model()
 # ---
 # --- Setup ChromaDB Collection (using Session State) ---
-if 'chroma_collection' not in st.session_state:
-    st.session_state.chroma_collection = None
-    if embedding_model and generation_client: # Only proceed if models/clients loaded
-        with st.spinner("Loading and preparing vector database..."):
-            df = load_dataset_from_hf()
-            if df is not None and not df.empty:
-                try:
-                    logging.info("Initializing Ephemeral ChromaDB client...")
-                    chroma_client = chromadb.EphemeralClient() # Use Ephemeral Client
-                    # Delete collection if it somehow exists (unlikely for ephemeral)
-                    try:
-                        chroma_client.delete_collection(name=COLLECTION_NAME)
-                        logging.info(f"Deleted existing collection (if any): {COLLECTION_NAME}")
-                    except: pass
-                    logging.info(f"Creating collection: {COLLECTION_NAME}")
-                    collection_instance = chroma_client.create_collection(
-                        name=COLLECTION_NAME,
-                        metadata={"hnsw:space": "cosine"}
-                    )
-                    logging.info(f"Adding {len(df)} documents to ChromaDB in batches of {ADD_BATCH_SIZE}...")
-                    start_time = time.time()
-                    error_count = 0
-                    num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
-                    for i in range(num_batches):
-                        start_idx = i * ADD_BATCH_SIZE
-                        end_idx = start_idx + ADD_BATCH_SIZE
-                        batch_df = df.iloc[start_idx:end_idx]
-                        try:
-                            # Prepare and clean metadata for the batch
-                            metadatas_list_raw = batch_df['metadata'].tolist()
-                            cleaned_metadatas = []
-                            for item in metadatas_list_raw:
-                                cleaned_dict = {}
-                                if isinstance(item, dict):
-                                    current_meta = item
                                 else:
-                                    try: current_meta = json.loads(item) if isinstance(item, str) else {}
-                                    except: current_meta = {}
-                                if isinstance(current_meta, dict):
-                                    for key, value in current_meta.items():
-                                        if value is None: cleaned_dict[key] = ""
-                                        elif isinstance(value, (str, int, float, bool)): cleaned_dict[key] = value
-                                        else:
-                                            try: cleaned_dict[key] = str(value)
-                                            except: pass # Skip unconvertible types
-                                cleaned_metadatas.append(cleaned_dict)
-                            # Add the batch
-                            collection_instance.add(
-                                ids=batch_df['id'].tolist(),
-                                embeddings=batch_df['embedding'].tolist(),
-                                documents=batch_df['document'].tolist(),
-                                metadatas=cleaned_metadatas
-                            )
-                        except Exception as e:
-                            logging.error(f"Error adding batch {i+1}/{num_batches} to Chroma: {e}")
-                            error_count += 1
-                    end_time = time.time()
-                    logging.info(f"Finished loading data into ChromaDB. Took {end_time - start_time:.2f} seconds.")
-                    if error_count > 0: logging.warning(f"Encountered errors in {error_count} batches during add.")
-                    final_count = collection_instance.count()
-                    logging.info(f"Final document count in Chroma collection: {final_count}")
-                    if final_count > 0:
-                        st.session_state.chroma_collection = collection_instance
-                        st.success("Vector database loaded successfully!")
-                    else:
-                        st.error("Failed to load documents into the vector database.")
-                except Exception as setup_e:
-                    st.error(f"Failed to setup ChromaDB: {setup_e}")
-                    logging.exception(f"Failed to setup ChromaDB: {setup_e}")
             else:
-                 st.error("Failed to load data from the dataset. Cannot initialize database.")
-# Assign collection from session state for use in the app
-collection = st.session_state.get('chroma_collection', None)
 # ---
 # --- Helper Functions ---
@@ -235,7 +257,6 @@ def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MO
 def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
     """Uses LLM (HF Inference API) to generate alternative phrasings."""
-    # ... (rest of function remains the same) ...
     prompt = f"""Given the user query: "{query}"
 Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
 Focus on synonyms, different levels of specificity, and related concepts.
@@ -268,10 +289,8 @@ Output:"""
         logging.error(f"Failed to generate query variations: {e}")
         return []
 def generate_prompt(query, context_chunks):
     """Generates a prompt for the LLM."""
-    # ... (function remains the same) ...
     context_str = "\n\n".join(context_chunks)
     liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
     prompt = f"""Based on the following context from the library guides, answer the user's question.

 from datasets import load_dataset
 import pandas as pd
 from sentence_transformers import SentenceTransformer
+# Import config if needed for EphemeralClient settings, though default might be fine
+import chromadb.config
 # --- Page Config (MUST BE FIRST Streamlit call) ---
 st.set_page_config(layout="wide")
         df = pd.read_parquet(parquet_path)
         logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
         required_cols = ['id', 'document', 'embedding', 'metadata']
         if not all(col in df.columns for col in required_cols):
             st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
             logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
+            return None
         logging.info("Ensuring embeddings are in list format...")
         if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
              df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
         if df.empty:
             st.error("No valid data loaded from the dataset after processing embeddings.")
             logging.error("DataFrame empty after embedding processing.")
+            return None
         return df
         st.error(f"Failed to load data from dataset: {e}")
         logging.exception(f"An unexpected error occurred during data load: {e}")
+    return None
 # --- Initialize Clients and Models ---
 generation_client = initialize_hf_client()
 # ---
 # --- Setup ChromaDB Collection (using Session State) ---
+# This function now attempts to load or create the collection and stores it in session state
+def setup_chroma_collection():
+    if 'chroma_collection' in st.session_state and st.session_state.chroma_collection is not None:
+        logging.info("Using existing Chroma collection from session state.")
+        return st.session_state.chroma_collection
+    # Proceed with setup only if essential components are loaded
+    if not embedding_model or not generation_client:
+        st.error("Cannot setup ChromaDB: Required models/clients failed to initialize.")
+        return None
+    with st.spinner("Loading and preparing vector database..."):
+        df = load_dataset_from_hf()
+        if df is None or df.empty:
+            st.error("Failed to load embedding data. Cannot initialize vector database.")
+            return None
+        try:
+            logging.info("Initializing Ephemeral ChromaDB client...")
+            # Use EphemeralClient explicitly
+            chroma_client = chromadb.EphemeralClient(
+                settings=chromadb.config.Settings(
+                    anonymized_telemetry=False, # Optional: Disable telemetry
+                    allow_reset=True # Optional: Allows resetting
+                )
+            )
+            # Check if collection exists and delete if it does (robustness)
+            try:
+                existing_collections = [col.name for col in chroma_client.list_collections()]
+                if COLLECTION_NAME in existing_collections:
+                    chroma_client.delete_collection(name=COLLECTION_NAME)
+                    logging.info(f"Deleted existing collection: {COLLECTION_NAME}")
+            except Exception as delete_e:
+                 logging.warning(f"Could not check/delete existing collection (might be okay): {delete_e}")
+            logging.info(f"Creating collection: {COLLECTION_NAME}")
+            collection_instance = chroma_client.create_collection(
+                name=COLLECTION_NAME,
+                metadata={"hnsw:space": "cosine"} # No embedding function needed here
+            )
+            logging.info(f"Adding {len(df)} documents to ChromaDB in batches of {ADD_BATCH_SIZE}...")
+            start_time = time.time()
+            error_count = 0
+            num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
+            for i in range(num_batches):
+                start_idx = i * ADD_BATCH_SIZE
+                end_idx = start_idx + ADD_BATCH_SIZE
+                batch_df = df.iloc[start_idx:end_idx]
+                try:
+                    # Prepare and clean metadata for the batch
+                    metadatas_list_raw = batch_df['metadata'].tolist()
+                    cleaned_metadatas = []
+                    for item in metadatas_list_raw:
+                        cleaned_dict = {}
+                        current_meta = item if isinstance(item, dict) else {}
+                        if not isinstance(item, dict):
+                            try: current_meta = json.loads(item) if isinstance(item, str) else {}
+                            except: current_meta = {}
+                        if isinstance(current_meta, dict):
+                            for key, value in current_meta.items():
+                                if value is None: cleaned_dict[key] = ""
+                                elif isinstance(value, (str, int, float, bool)): cleaned_dict[key] = value
                                 else:
+                                    try: cleaned_dict[key] = str(value)
+                                    except: pass
+                        cleaned_metadatas.append(cleaned_dict)
+                    # Add the batch
+                    collection_instance.add(
+                        ids=batch_df['id'].tolist(),
+                        embeddings=batch_df['embedding'].tolist(),
+                        documents=batch_df['document'].tolist(),
+                        metadatas=cleaned_metadatas
+                    )
+                except Exception as e:
+                    logging.error(f"Error adding batch {i+1}/{num_batches} to Chroma: {e}")
+                    error_count += 1
+            end_time = time.time()
+            logging.info(f"Finished loading data into ChromaDB. Took {end_time - start_time:.2f} seconds.")
+            if error_count > 0: logging.warning(f"Encountered errors in {error_count} batches during add.")
+            final_count = collection_instance.count()
+            logging.info(f"Final document count in Chroma collection: {final_count}")
+            if final_count > 0:
+                st.session_state.chroma_collection = collection_instance
+                st.success("Vector database loaded successfully!")
+                return collection_instance
             else:
+                st.error("Failed to load documents into the vector database.")
+                return None
+        except Exception as setup_e:
+            st.error(f"Failed to setup ChromaDB: {setup_e}")
+            logging.exception(f"Failed to setup ChromaDB: {setup_e}")
+            return None
+# --- Initialize collection ---
+# Call the setup function which populates session state if needed
+collection = setup_chroma_collection()
 # ---
 # --- Helper Functions ---
 def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
     """Uses LLM (HF Inference API) to generate alternative phrasings."""
     prompt = f"""Given the user query: "{query}"
 Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
 Focus on synonyms, different levels of specificity, and related concepts.
         logging.error(f"Failed to generate query variations: {e}")
         return []
 def generate_prompt(query, context_chunks):
     """Generates a prompt for the LLM."""
     context_str = "\n\n".join(context_chunks)
     liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
     prompt = f"""Based on the following context from the library guides, answer the user's question.