Spaces:

Zwounds
/

LibraryRAG

Sleeping

App Files Files Community

Zwounds committed on Apr 1, 2025

Commit

93c51f9

verified ·

1 Parent(s): c51456e

Upload app.py

Browse files

Files changed (1) hide show

app.py +121 -139

app.py CHANGED Viewed

@@ -5,43 +5,38 @@ import sys
 import json
 import os
 from dotenv import load_dotenv
-from huggingface_hub import InferenceClient, hf_hub_download # Added for dataset download
 import numpy as np
 import time
 from tqdm import tqdm
-# Need datasets, pandas, sentence-transformers
-from datasets import load_dataset, DatasetDict, Dataset
 import pandas as pd
 from sentence_transformers import SentenceTransformer
-# Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
-# import chromadb.utils.embedding_functions as embedding_functions
 # --- Page Config (MUST BE FIRST Streamlit call) ---
 st.set_page_config(layout="wide")
 # ---
 # --- Configuration ---
-# DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
 COLLECTION_NAME = "libguides_content"
 LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
 HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
 HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
 PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
-# INPUT_FILE = 'extracted_content.jsonl' # No longer needed for app runtime
-# EMBEDDING_BATCH_SIZE = 100 # Batch size for adding docs to ChromaDB (now done during load)
 ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
 TOP_K = 10
 INITIAL_N_RESULTS = 50
-API_RETRY_DELAY = 2
 MAX_NEW_TOKENS = 512
 # ---
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stderr)
-# --- Load API Key and Initialize HF Generation Client ---
 @st.cache_resource
 def initialize_hf_client():
     generation_client_instance = None
     try:
         load_dotenv()
@@ -60,12 +55,9 @@ def initialize_hf_client():
         st.stop()
     return None
-generation_client = initialize_hf_client()
-# ---
-# --- Load Local Embedding Model (for Queries) ---
 @st.cache_resource
 def load_local_embedding_model():
     logging.info(f"Loading local embedding model for queries: {LOCAL_EMBEDDING_MODEL}")
     try:
          import torch
@@ -84,37 +76,26 @@ def load_local_embedding_model():
         st.stop()
     return None
-embedding_model = load_local_embedding_model()
-# ---
-# --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
 @st.cache_resource
-def load_data_and_setup_chroma():
-    # Ensure dependent resources are loaded first
-    if not generation_client or not embedding_model:
-         st.error("Required clients/models not initialized. Cannot proceed.")
-         st.stop()
     try:
-        logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
-        try:
-            parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
-            logging.info(f"Downloaded dataset file to: {parquet_path}")
-        except Exception as download_e:
-             logging.error(f"Failed to download dataset file '{PARQUET_FILENAME}' from '{HF_DATASET_ID}': {download_e}")
-             st.error(f"Failed to download dataset '{HF_DATASET_ID}'. Check dataset ID, filename, and token permissions.")
-             st.stop()
         logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
         df = pd.read_parquet(parquet_path)
         logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
         required_cols = ['id', 'document', 'embedding', 'metadata']
         if not all(col in df.columns for col in required_cols):
             st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
             logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
-            st.stop()
         logging.info("Ensuring embeddings are in list format...")
         if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
              df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
@@ -130,131 +111,131 @@ def load_data_and_setup_chroma():
         if df.empty:
             st.error("No valid data loaded from the dataset after processing embeddings.")
             logging.error("DataFrame empty after embedding processing.")
-            st.stop()
-        logging.info("Initializing in-memory ChromaDB client...")
-        # Explicitly configure for in-memory using DuckDB+Parquet
-        settings = chromadb.config.Settings(
-            chroma_api_impl="local",
-            chroma_db_impl="duckdb+parquet",
-            persist_directory=None # Ensure no persistence is attempted
-        )
-        chroma_client = chromadb.Client(settings=settings)
-        try:
-            chroma_client.delete_collection(name=COLLECTION_NAME)
-            logging.info(f"Deleted existing in-memory collection (if any): {COLLECTION_NAME}")
-        except: pass
-        logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
-        collection = chroma_client.create_collection(
-            name=COLLECTION_NAME,
-            metadata={"hnsw:space": "cosine"}
-        )
-        logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
-        start_time = time.time()
-        error_count = 0
-        num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
-        progress_bar = st.progress(0, text="Loading embeddings into memory...")
-        for i in range(num_batches):
-            start_idx = i * ADD_BATCH_SIZE
-            end_idx = start_idx + ADD_BATCH_SIZE
-            batch_df = df.iloc[start_idx:end_idx]
-            try:
-                # Prepare metadata for the batch
-                metadatas_list_raw = batch_df['metadata'].tolist()
-                cleaned_metadatas = []
-                for item in metadatas_list_raw:
-                    cleaned_dict = {}
-                    # Handle potential non-dict items loaded from parquet/dataset
-                    if isinstance(item, dict):
-                        current_meta = item
-                    else:
-                        try: # Attempt to parse if it's a JSON string
-                            current_meta = json.loads(item) if isinstance(item, str) else {}
-                        except:
-                            current_meta = {} # Default to empty dict if not dict or valid JSON
-                    # Clean None values within the dictionary
-                    if isinstance(current_meta, dict):
-                        for key, value in current_meta.items():
-                            if value is None:
-                                cleaned_dict[key] = "" # Replace None with empty string
-                            elif isinstance(value, (str, int, float, bool)):
-                                cleaned_dict[key] = value # Keep allowed types
-                            else:
-                                try: # Attempt to convert others to string
-                                    cleaned_dict[key] = str(value)
-                                    logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
-                                except:
-                                    logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
-                    cleaned_metadatas.append(cleaned_dict)
-                # Add the batch with cleaned metadata
-                collection.add(
-                    ids=batch_df['id'].tolist(),
-                    embeddings=batch_df['embedding'].tolist(),
-                    documents=batch_df['document'].tolist(),
-                    metadatas=cleaned_metadatas # Use the cleaned list
-                )
-            except Exception as e:
-                logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
-                error_count += 1
-            progress_bar.progress((i + 1) / num_batches, text=f"Loading embeddings... Batch {i+1}/{num_batches}")
-        progress_bar.empty()
-        end_time = time.time()
-        logging.info(f"Finished loading data into in-memory ChromaDB. Took {end_time - start_time:.2f} seconds.")
-        if error_count > 0:
-            logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
-        # Verify count after adding
-        final_count = collection.count()
-        logging.info(f"Final document count in Chroma collection: {final_count}")
-        if final_count == 0 and len(df) > 0:
-             st.warning("ChromaDB collection is empty after attempting to add documents. Check logs for errors.")
-             # Don't necessarily stop, but warn the user.
-        st.success("Embeddings loaded successfully!")
-        return collection
     except ImportError as e:
         st.error(f"ImportError: {e}. Required libraries might be missing (datasets, pandas, pyarrow). Check requirements.txt.")
-        logging.error(f"ImportError during dataset loading/Chroma setup: {e}")
-        st.stop()
     except Exception as e:
-        st.error(f"Failed to load data and initialize ChromaDB: {e}")
-        logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
-        st.stop()
-    return None
-# --- Load data and collection ---
-collection = load_data_and_setup_chroma()
 # ---
 # --- Helper Functions ---
 def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MODEL):
     """Sends the prompt to the HF Inference API using the initialized client."""
     if not client_instance:
-        client_instance = generation_client
-    if not client_instance:
-         logging.error("HF Inference client not initialized in query_hf_inference.")
          return "Error: HF Inference client failed to initialize."
     try:
         response_text = client_instance.text_generation(prompt, max_new_tokens=MAX_NEW_TOKENS)
         if not response_text:
-             logging.warning(f"Received empty response from HF Inference API ({model_name}) for prompt: {prompt[:100]}...")
              return "Error: Received empty response from generation model."
         return response_text.strip()
     except Exception as e:
-        logging.exception(f"An unexpected error occurred while querying HF Inference API ({model_name}): {e}")
         return f"Error: An unexpected error occurred while generating the answer using {model_name}."
 def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
     """Uses LLM (HF Inference API) to generate alternative phrasings."""
     prompt = f"""Given the user query: "{query}"
 Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
 Focus on synonyms, different levels of specificity, and related concepts.
@@ -287,8 +268,10 @@ Output:"""
         logging.error(f"Failed to generate query variations: {e}")
         return []
 def generate_prompt(query, context_chunks):
     """Generates a prompt for the LLM."""
     context_str = "\n\n".join(context_chunks)
     liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
     prompt = f"""Based on the following context from the library guides, answer the user's question.
@@ -306,14 +289,13 @@ Answer:"""
     return prompt
 # --- Streamlit App UI ---
-st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
-# User input (only proceed if collection loaded)
 if collection:
     query = st.text_area("Enter your question:", height=100)
 else:
-    # Error handled during load_data_and_setup_chroma
-    st.error("Application initialization failed. Cannot proceed.")
     st.stop()
 # --- Routing Prompt Definition ---

 import json
 import os
 from dotenv import load_dotenv
+from huggingface_hub import InferenceClient, hf_hub_download
 import numpy as np
 import time
 from tqdm import tqdm
+from datasets import load_dataset
 import pandas as pd
 from sentence_transformers import SentenceTransformer
 # --- Page Config (MUST BE FIRST Streamlit call) ---
 st.set_page_config(layout="wide")
 # ---
 # --- Configuration ---
 COLLECTION_NAME = "libguides_content"
 LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
 HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
 HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
 PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
 ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
 TOP_K = 10
 INITIAL_N_RESULTS = 50
 MAX_NEW_TOKENS = 512
 # ---
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stderr)
+# --- Cached Resource Loading ---
 @st.cache_resource
 def initialize_hf_client():
+    """Initializes and returns the HF Inference Client for generation."""
     generation_client_instance = None
     try:
         load_dotenv()
         st.stop()
     return None
 @st.cache_resource
 def load_local_embedding_model():
+    """Loads and returns the local Sentence Transformer model for query embedding."""
     logging.info(f"Loading local embedding model for queries: {LOCAL_EMBEDDING_MODEL}")
     try:
          import torch
         st.stop()
     return None
 @st.cache_resource
+def load_dataset_from_hf():
+    """Downloads the dataset parquet file and loads it into a Pandas DataFrame."""
     try:
+        logging.info(f"Downloading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
+        parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
+        logging.info(f"Downloaded dataset file to: {parquet_path}")
         logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
         df = pd.read_parquet(parquet_path)
         logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
+        # Verify required columns
         required_cols = ['id', 'document', 'embedding', 'metadata']
         if not all(col in df.columns for col in required_cols):
             st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
             logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
+            return None # Return None on error
+        # Ensure embeddings are lists of floats
         logging.info("Ensuring embeddings are in list format...")
         if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
              df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
         if df.empty:
             st.error("No valid data loaded from the dataset after processing embeddings.")
             logging.error("DataFrame empty after embedding processing.")
+            return None # Return None on error
+        return df
     except ImportError as e:
         st.error(f"ImportError: {e}. Required libraries might be missing (datasets, pandas, pyarrow). Check requirements.txt.")
+        logging.error(f"ImportError during dataset loading: {e}")
     except Exception as e:
+        st.error(f"Failed to load data from dataset: {e}")
+        logging.exception(f"An unexpected error occurred during data load: {e}")
+    return None # Return None on any error
+# --- Initialize Clients and Models ---
+generation_client = initialize_hf_client()
+embedding_model = load_local_embedding_model()
+# ---
+# --- Setup ChromaDB Collection (using Session State) ---
+if 'chroma_collection' not in st.session_state:
+    st.session_state.chroma_collection = None
+    if embedding_model and generation_client: # Only proceed if models/clients loaded
+        with st.spinner("Loading and preparing vector database..."):
+            df = load_dataset_from_hf()
+            if df is not None and not df.empty:
+                try:
+                    logging.info("Initializing Ephemeral ChromaDB client...")
+                    chroma_client = chromadb.EphemeralClient() # Use Ephemeral Client
+                    # Delete collection if it somehow exists (unlikely for ephemeral)
+                    try:
+                        chroma_client.delete_collection(name=COLLECTION_NAME)
+                        logging.info(f"Deleted existing collection (if any): {COLLECTION_NAME}")
+                    except: pass
+                    logging.info(f"Creating collection: {COLLECTION_NAME}")
+                    collection_instance = chroma_client.create_collection(
+                        name=COLLECTION_NAME,
+                        metadata={"hnsw:space": "cosine"}
+                    )
+                    logging.info(f"Adding {len(df)} documents to ChromaDB in batches of {ADD_BATCH_SIZE}...")
+                    start_time = time.time()
+                    error_count = 0
+                    num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
+                    for i in range(num_batches):
+                        start_idx = i * ADD_BATCH_SIZE
+                        end_idx = start_idx + ADD_BATCH_SIZE
+                        batch_df = df.iloc[start_idx:end_idx]
+                        try:
+                            # Prepare and clean metadata for the batch
+                            metadatas_list_raw = batch_df['metadata'].tolist()
+                            cleaned_metadatas = []
+                            for item in metadatas_list_raw:
+                                cleaned_dict = {}
+                                if isinstance(item, dict):
+                                    current_meta = item
+                                else:
+                                    try: current_meta = json.loads(item) if isinstance(item, str) else {}
+                                    except: current_meta = {}
+                                if isinstance(current_meta, dict):
+                                    for key, value in current_meta.items():
+                                        if value is None: cleaned_dict[key] = ""
+                                        elif isinstance(value, (str, int, float, bool)): cleaned_dict[key] = value
+                                        else:
+                                            try: cleaned_dict[key] = str(value)
+                                            except: pass # Skip unconvertible types
+                                cleaned_metadatas.append(cleaned_dict)
+                            # Add the batch
+                            collection_instance.add(
+                                ids=batch_df['id'].tolist(),
+                                embeddings=batch_df['embedding'].tolist(),
+                                documents=batch_df['document'].tolist(),
+                                metadatas=cleaned_metadatas
+                            )
+                        except Exception as e:
+                            logging.error(f"Error adding batch {i+1}/{num_batches} to Chroma: {e}")
+                            error_count += 1
+                    end_time = time.time()
+                    logging.info(f"Finished loading data into ChromaDB. Took {end_time - start_time:.2f} seconds.")
+                    if error_count > 0: logging.warning(f"Encountered errors in {error_count} batches during add.")
+                    final_count = collection_instance.count()
+                    logging.info(f"Final document count in Chroma collection: {final_count}")
+                    if final_count > 0:
+                        st.session_state.chroma_collection = collection_instance
+                        st.success("Vector database loaded successfully!")
+                    else:
+                        st.error("Failed to load documents into the vector database.")
+                except Exception as setup_e:
+                    st.error(f"Failed to setup ChromaDB: {setup_e}")
+                    logging.exception(f"Failed to setup ChromaDB: {setup_e}")
+            else:
+                 st.error("Failed to load data from the dataset. Cannot initialize database.")
+# Assign collection from session state for use in the app
+collection = st.session_state.get('chroma_collection', None)
 # ---
 # --- Helper Functions ---
 def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MODEL):
     """Sends the prompt to the HF Inference API using the initialized client."""
+    if not client_instance: client_instance = generation_client
     if not client_instance:
+         logging.error("HF Inference client not initialized.")
          return "Error: HF Inference client failed to initialize."
     try:
         response_text = client_instance.text_generation(prompt, max_new_tokens=MAX_NEW_TOKENS)
         if not response_text:
+             logging.warning(f"Received empty response from HF Inference API ({model_name}).")
              return "Error: Received empty response from generation model."
         return response_text.strip()
     except Exception as e:
+        logging.exception(f"Error querying HF Inference API ({model_name}): {e}")
         return f"Error: An unexpected error occurred while generating the answer using {model_name}."
 def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
     """Uses LLM (HF Inference API) to generate alternative phrasings."""
+    # Build the prompt that asks the LLM for alternative phrasings of the user's query.
     prompt = f"""Given the user query: "{query}"
 Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
 Focus on synonyms, different levels of specificity, and related concepts.
         logging.error(f"Failed to generate query variations: {e}")
         return []
 def generate_prompt(query, context_chunks):
     """Generates a prompt for the LLM."""
+    # Join the retrieved context chunks into one string to embed in the prompt.
     context_str = "\n\n".join(context_chunks)
     liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
     prompt = f"""Based on the following context from the library guides, answer the user's question.
     return prompt
 # --- Streamlit App UI ---
+st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)")
+# User input (only proceed if collection is ready)
 if collection:
     query = st.text_area("Enter your question:", height=100)
 else:
+    st.error("Application initialization failed: Vector database not loaded.")
     st.stop()
 # --- Routing Prompt Definition ---