Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -5,43 +5,38 @@ import sys
|
|
| 5 |
import json
|
| 6 |
import os
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
-
from huggingface_hub import InferenceClient, hf_hub_download
|
| 9 |
import numpy as np
|
| 10 |
import time
|
| 11 |
from tqdm import tqdm
|
| 12 |
-
|
| 13 |
-
from datasets import load_dataset, DatasetDict, Dataset
|
| 14 |
import pandas as pd
|
| 15 |
from sentence_transformers import SentenceTransformer
|
| 16 |
-
# Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
|
| 17 |
-
# import chromadb.utils.embedding_functions as embedding_functions
|
| 18 |
|
| 19 |
# --- Page Config (MUST BE FIRST Streamlit call) ---
|
| 20 |
st.set_page_config(layout="wide")
|
| 21 |
# ---
|
| 22 |
|
| 23 |
# --- Configuration ---
|
| 24 |
-
# DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
|
| 25 |
COLLECTION_NAME = "libguides_content"
|
| 26 |
LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
|
| 27 |
HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
|
| 28 |
HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
|
| 29 |
PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
|
| 30 |
-
# INPUT_FILE = 'extracted_content.jsonl' # No longer needed for app runtime
|
| 31 |
-
# EMBEDDING_BATCH_SIZE = 100 # Batch size for adding docs to ChromaDB (now done during load)
|
| 32 |
ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
|
| 33 |
TOP_K = 10
|
| 34 |
INITIAL_N_RESULTS = 50
|
| 35 |
-
API_RETRY_DELAY = 2
|
| 36 |
MAX_NEW_TOKENS = 512
|
| 37 |
# ---
|
| 38 |
|
| 39 |
# Setup logging
|
| 40 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stderr)
|
| 41 |
|
| 42 |
-
# ---
|
|
|
|
| 43 |
@st.cache_resource
|
| 44 |
def initialize_hf_client():
|
|
|
|
| 45 |
generation_client_instance = None
|
| 46 |
try:
|
| 47 |
load_dotenv()
|
|
@@ -60,12 +55,9 @@ def initialize_hf_client():
|
|
| 60 |
st.stop()
|
| 61 |
return None
|
| 62 |
|
| 63 |
-
generation_client = initialize_hf_client()
|
| 64 |
-
# ---
|
| 65 |
-
|
| 66 |
-
# --- Load Local Embedding Model (for Queries) ---
|
| 67 |
@st.cache_resource
|
| 68 |
def load_local_embedding_model():
|
|
|
|
| 69 |
logging.info(f"Loading local embedding model for queries: {LOCAL_EMBEDDING_MODEL}")
|
| 70 |
try:
|
| 71 |
import torch
|
|
@@ -84,37 +76,26 @@ def load_local_embedding_model():
|
|
| 84 |
st.stop()
|
| 85 |
return None
|
| 86 |
|
| 87 |
-
embedding_model = load_local_embedding_model()
|
| 88 |
-
# ---
|
| 89 |
-
|
| 90 |
-
# --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
|
| 91 |
@st.cache_resource
|
| 92 |
-
def
|
| 93 |
-
|
| 94 |
-
if not generation_client or not embedding_model:
|
| 95 |
-
st.error("Required clients/models not initialized. Cannot proceed.")
|
| 96 |
-
st.stop()
|
| 97 |
-
|
| 98 |
try:
|
| 99 |
-
logging.info(f"
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
logging.info(f"Downloaded dataset file to: {parquet_path}")
|
| 103 |
-
except Exception as download_e:
|
| 104 |
-
logging.error(f"Failed to download dataset file '{PARQUET_FILENAME}' from '{HF_DATASET_ID}': {download_e}")
|
| 105 |
-
st.error(f"Failed to download dataset '{HF_DATASET_ID}'. Check dataset ID, filename, and token permissions.")
|
| 106 |
-
st.stop()
|
| 107 |
|
| 108 |
logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
|
| 109 |
df = pd.read_parquet(parquet_path)
|
| 110 |
logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
|
| 111 |
|
|
|
|
| 112 |
required_cols = ['id', 'document', 'embedding', 'metadata']
|
| 113 |
if not all(col in df.columns for col in required_cols):
|
| 114 |
st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
|
| 115 |
logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
|
| 116 |
-
|
| 117 |
|
|
|
|
| 118 |
logging.info("Ensuring embeddings are in list format...")
|
| 119 |
if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
|
| 120 |
df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
|
|
@@ -130,131 +111,131 @@ def load_data_and_setup_chroma():
|
|
| 130 |
if df.empty:
|
| 131 |
st.error("No valid data loaded from the dataset after processing embeddings.")
|
| 132 |
logging.error("DataFrame empty after embedding processing.")
|
| 133 |
-
|
| 134 |
|
| 135 |
-
|
| 136 |
-
# Explicitly configure for in-memory using DuckDB+Parquet
|
| 137 |
-
settings = chromadb.config.Settings(
|
| 138 |
-
chroma_api_impl="local",
|
| 139 |
-
chroma_db_impl="duckdb+parquet",
|
| 140 |
-
persist_directory=None # Ensure no persistence is attempted
|
| 141 |
-
)
|
| 142 |
-
chroma_client = chromadb.Client(settings=settings)
|
| 143 |
-
|
| 144 |
-
try:
|
| 145 |
-
chroma_client.delete_collection(name=COLLECTION_NAME)
|
| 146 |
-
logging.info(f"Deleted existing in-memory collection (if any): {COLLECTION_NAME}")
|
| 147 |
-
except: pass
|
| 148 |
-
|
| 149 |
-
logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
|
| 150 |
-
collection = chroma_client.create_collection(
|
| 151 |
-
name=COLLECTION_NAME,
|
| 152 |
-
metadata={"hnsw:space": "cosine"}
|
| 153 |
-
)
|
| 154 |
-
|
| 155 |
-
logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
|
| 156 |
-
start_time = time.time()
|
| 157 |
-
error_count = 0
|
| 158 |
-
num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE
|
| 159 |
-
progress_bar = st.progress(0, text="Loading embeddings into memory...")
|
| 160 |
-
|
| 161 |
-
for i in range(num_batches):
|
| 162 |
-
start_idx = i * ADD_BATCH_SIZE
|
| 163 |
-
end_idx = start_idx + ADD_BATCH_SIZE
|
| 164 |
-
batch_df = df.iloc[start_idx:end_idx]
|
| 165 |
-
|
| 166 |
-
try:
|
| 167 |
-
# Prepare metadata for the batch
|
| 168 |
-
metadatas_list_raw = batch_df['metadata'].tolist()
|
| 169 |
-
cleaned_metadatas = []
|
| 170 |
-
for item in metadatas_list_raw:
|
| 171 |
-
cleaned_dict = {}
|
| 172 |
-
# Handle potential non-dict items loaded from parquet/dataset
|
| 173 |
-
if isinstance(item, dict):
|
| 174 |
-
current_meta = item
|
| 175 |
-
else:
|
| 176 |
-
try: # Attempt to parse if it's a JSON string
|
| 177 |
-
current_meta = json.loads(item) if isinstance(item, str) else {}
|
| 178 |
-
except:
|
| 179 |
-
current_meta = {} # Default to empty dict if not dict or valid JSON
|
| 180 |
-
|
| 181 |
-
# Clean None values within the dictionary
|
| 182 |
-
if isinstance(current_meta, dict):
|
| 183 |
-
for key, value in current_meta.items():
|
| 184 |
-
if value is None:
|
| 185 |
-
cleaned_dict[key] = "" # Replace None with empty string
|
| 186 |
-
elif isinstance(value, (str, int, float, bool)):
|
| 187 |
-
cleaned_dict[key] = value # Keep allowed types
|
| 188 |
-
else:
|
| 189 |
-
try: # Attempt to convert others to string
|
| 190 |
-
cleaned_dict[key] = str(value)
|
| 191 |
-
logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
|
| 192 |
-
except:
|
| 193 |
-
logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
|
| 194 |
-
cleaned_metadatas.append(cleaned_dict)
|
| 195 |
-
|
| 196 |
-
# Add the batch with cleaned metadata
|
| 197 |
-
collection.add(
|
| 198 |
-
ids=batch_df['id'].tolist(),
|
| 199 |
-
embeddings=batch_df['embedding'].tolist(),
|
| 200 |
-
documents=batch_df['document'].tolist(),
|
| 201 |
-
metadatas=cleaned_metadatas # Use the cleaned list
|
| 202 |
-
)
|
| 203 |
-
except Exception as e:
|
| 204 |
-
logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
|
| 205 |
-
error_count += 1
|
| 206 |
-
progress_bar.progress((i + 1) / num_batches, text=f"Loading embeddings... Batch {i+1}/{num_batches}")
|
| 207 |
-
|
| 208 |
-
progress_bar.empty()
|
| 209 |
-
end_time = time.time()
|
| 210 |
-
logging.info(f"Finished loading data into in-memory ChromaDB. Took {end_time - start_time:.2f} seconds.")
|
| 211 |
-
if error_count > 0:
|
| 212 |
-
logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
|
| 213 |
-
|
| 214 |
-
# Verify count after adding
|
| 215 |
-
final_count = collection.count()
|
| 216 |
-
logging.info(f"Final document count in Chroma collection: {final_count}")
|
| 217 |
-
if final_count == 0 and len(df) > 0:
|
| 218 |
-
st.warning("ChromaDB collection is empty after attempting to add documents. Check logs for errors.")
|
| 219 |
-
# Don't necessarily stop, but warn the user.
|
| 220 |
-
|
| 221 |
-
st.success("Embeddings loaded successfully!")
|
| 222 |
-
return collection
|
| 223 |
|
| 224 |
except ImportError as e:
|
| 225 |
st.error(f"ImportError: {e}. Required libraries might be missing (datasets, pandas, pyarrow). Check requirements.txt.")
|
| 226 |
-
logging.error(f"ImportError during dataset loading
|
| 227 |
-
st.stop()
|
| 228 |
except Exception as e:
|
| 229 |
-
st.error(f"Failed to load data
|
| 230 |
-
logging.exception(f"An unexpected error occurred during data load
|
| 231 |
-
|
| 232 |
-
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
# ---
|
| 237 |
|
| 238 |
# --- Helper Functions ---
|
| 239 |
def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MODEL):
|
| 240 |
"""Sends the prompt to the HF Inference API using the initialized client."""
|
|
|
|
| 241 |
if not client_instance:
|
| 242 |
-
|
| 243 |
-
if not client_instance:
|
| 244 |
-
logging.error("HF Inference client not initialized in query_hf_inference.")
|
| 245 |
return "Error: HF Inference client failed to initialize."
|
| 246 |
try:
|
| 247 |
response_text = client_instance.text_generation(prompt, max_new_tokens=MAX_NEW_TOKENS)
|
| 248 |
if not response_text:
|
| 249 |
-
logging.warning(f"Received empty response from HF Inference API ({model_name})
|
| 250 |
return "Error: Received empty response from generation model."
|
| 251 |
return response_text.strip()
|
| 252 |
except Exception as e:
|
| 253 |
-
logging.exception(f"
|
| 254 |
return f"Error: An unexpected error occurred while generating the answer using {model_name}."
|
| 255 |
|
| 256 |
def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
|
| 257 |
"""Uses LLM (HF Inference API) to generate alternative phrasings."""
|
|
|
|
| 258 |
prompt = f"""Given the user query: "{query}"
|
| 259 |
Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
|
| 260 |
Focus on synonyms, different levels of specificity, and related concepts.
|
|
@@ -287,8 +268,10 @@ Output:"""
|
|
| 287 |
logging.error(f"Failed to generate query variations: {e}")
|
| 288 |
return []
|
| 289 |
|
|
|
|
| 290 |
def generate_prompt(query, context_chunks):
|
| 291 |
"""Generates a prompt for the LLM."""
|
|
|
|
| 292 |
context_str = "\n\n".join(context_chunks)
|
| 293 |
liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
|
| 294 |
prompt = f"""Based on the following context from the library guides, answer the user's question.
|
|
@@ -306,14 +289,13 @@ Answer:"""
|
|
| 306 |
return prompt
|
| 307 |
|
| 308 |
# --- Streamlit App UI ---
|
| 309 |
-
st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)")
|
| 310 |
|
| 311 |
-
# User input (only proceed if collection
|
| 312 |
if collection:
|
| 313 |
query = st.text_area("Enter your question:", height=100)
|
| 314 |
else:
|
| 315 |
-
|
| 316 |
-
st.error("Application initialization failed. Cannot proceed.")
|
| 317 |
st.stop()
|
| 318 |
|
| 319 |
# --- Routing Prompt Definition ---
|
|
|
|
| 5 |
import json
|
| 6 |
import os
|
| 7 |
from dotenv import load_dotenv
|
| 8 |
+
from huggingface_hub import InferenceClient, hf_hub_download
|
| 9 |
import numpy as np
|
| 10 |
import time
|
| 11 |
from tqdm import tqdm
|
| 12 |
+
from datasets import load_dataset
|
|
|
|
| 13 |
import pandas as pd
|
| 14 |
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# --- Page Config (MUST BE FIRST Streamlit call) ---
|
| 17 |
st.set_page_config(layout="wide")
|
| 18 |
# ---
|
| 19 |
|
| 20 |
# --- Configuration ---
|
|
|
|
| 21 |
COLLECTION_NAME = "libguides_content"
|
| 22 |
LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
|
| 23 |
HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
|
| 24 |
HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
|
| 25 |
PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
|
|
|
|
|
|
|
| 26 |
ADD_BATCH_SIZE = 500 # Batch size for adding to in-memory Chroma
|
| 27 |
TOP_K = 10
|
| 28 |
INITIAL_N_RESULTS = 50
|
|
|
|
| 29 |
MAX_NEW_TOKENS = 512
|
| 30 |
# ---
|
| 31 |
|
| 32 |
# Setup logging
|
| 33 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', stream=sys.stderr)
|
| 34 |
|
| 35 |
+
# --- Cached Resource Loading ---
|
| 36 |
+
|
| 37 |
@st.cache_resource
|
| 38 |
def initialize_hf_client():
|
| 39 |
+
"""Initializes and returns the HF Inference Client for generation."""
|
| 40 |
generation_client_instance = None
|
| 41 |
try:
|
| 42 |
load_dotenv()
|
|
|
|
| 55 |
st.stop()
|
| 56 |
return None
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
@st.cache_resource
|
| 59 |
def load_local_embedding_model():
|
| 60 |
+
"""Loads and returns the local Sentence Transformer model for query embedding."""
|
| 61 |
logging.info(f"Loading local embedding model for queries: {LOCAL_EMBEDDING_MODEL}")
|
| 62 |
try:
|
| 63 |
import torch
|
|
|
|
| 76 |
st.stop()
|
| 77 |
return None
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
@st.cache_resource
|
| 80 |
+
def load_dataset_from_hf():
|
| 81 |
+
"""Downloads the dataset parquet file and loads it into a Pandas DataFrame."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
try:
|
| 83 |
+
logging.info(f"Downloading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
|
| 84 |
+
parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
|
| 85 |
+
logging.info(f"Downloaded dataset file to: {parquet_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
|
| 88 |
df = pd.read_parquet(parquet_path)
|
| 89 |
logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
|
| 90 |
|
| 91 |
+
# Verify required columns
|
| 92 |
required_cols = ['id', 'document', 'embedding', 'metadata']
|
| 93 |
if not all(col in df.columns for col in required_cols):
|
| 94 |
st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
|
| 95 |
logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
|
| 96 |
+
return None # Return None on error
|
| 97 |
|
| 98 |
+
# Ensure embeddings are lists of floats
|
| 99 |
logging.info("Ensuring embeddings are in list format...")
|
| 100 |
if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
|
| 101 |
df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
|
|
|
|
| 111 |
if df.empty:
|
| 112 |
st.error("No valid data loaded from the dataset after processing embeddings.")
|
| 113 |
logging.error("DataFrame empty after embedding processing.")
|
| 114 |
+
return None # Return None on error
|
| 115 |
|
| 116 |
+
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
except ImportError as e:
|
| 119 |
st.error(f"ImportError: {e}. Required libraries might be missing (datasets, pandas, pyarrow). Check requirements.txt.")
|
| 120 |
+
logging.error(f"ImportError during dataset loading: {e}")
|
|
|
|
| 121 |
except Exception as e:
|
| 122 |
+
st.error(f"Failed to load data from dataset: {e}")
|
| 123 |
+
logging.exception(f"An unexpected error occurred during data load: {e}")
|
| 124 |
+
|
| 125 |
+
return None # Return None on any error
|
| 126 |
+
|
| 127 |
+
# --- Initialize Clients and Models ---
|
| 128 |
+
def _clean_metadata(raw_meta):
    """Return one metadata record reduced to values the collection add call is given.

    A dict is used as-is and a string is parsed as JSON; any other input, an
    unparsable string, or JSON that does not decode to a dict yields ``{}``.
    Within the dict, ``None`` becomes ``""``, ``str``/``int``/``float``/``bool``
    values are kept unchanged, and every other value is converted with ``str()``.
    """
    if isinstance(raw_meta, dict):
        meta = raw_meta
    elif isinstance(raw_meta, str):
        try:
            meta = json.loads(raw_meta)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            meta = {}
    else:
        meta = {}

    cleaned = {}
    if not isinstance(meta, dict):
        return cleaned
    for key, value in meta.items():
        if value is None:
            cleaned[key] = ""
        elif isinstance(value, (str, int, float, bool)):
            cleaned[key] = value
        else:
            try:
                cleaned[key] = str(value)
            except Exception:
                # Best effort: drop a value whose string conversion itself fails.
                logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
    return cleaned


def _add_dataframe_in_batches(collection_instance, df):
    """Add every row of ``df`` to ``collection_instance`` in ADD_BATCH_SIZE chunks.

    Reads the 'id', 'embedding', 'document' and 'metadata' columns. A batch that
    raises is logged and skipped so the remaining batches are still attempted.

    Returns:
        The number of batches that failed.
    """
    num_batches = (len(df) + ADD_BATCH_SIZE - 1) // ADD_BATCH_SIZE  # ceiling division
    error_count = 0
    for i in range(num_batches):
        start_idx = i * ADD_BATCH_SIZE
        batch_df = df.iloc[start_idx:start_idx + ADD_BATCH_SIZE]
        try:
            collection_instance.add(
                ids=batch_df['id'].tolist(),
                embeddings=batch_df['embedding'].tolist(),
                documents=batch_df['document'].tolist(),
                metadatas=[_clean_metadata(item) for item in batch_df['metadata'].tolist()],
            )
        except Exception as e:
            logging.error(f"Error adding batch {i+1}/{num_batches} to Chroma: {e}")
            error_count += 1
    return error_count


generation_client = initialize_hf_client()
embedding_model = load_local_embedding_model()
# ---

# --- Setup ChromaDB Collection (using Session State) ---
# The key is created on the first run of a session, so later reruns skip this
# whole branch -- including when the first attempt failed and left it as None.
if 'chroma_collection' not in st.session_state:
    st.session_state.chroma_collection = None
    if embedding_model and generation_client:  # Only proceed if models/clients loaded
        with st.spinner("Loading and preparing vector database..."):
            df = load_dataset_from_hf()
            if df is not None and not df.empty:
                try:
                    logging.info("Initializing Ephemeral ChromaDB client...")
                    chroma_client = chromadb.EphemeralClient()  # Use Ephemeral Client

                    # Best-effort removal of a leftover collection. `except Exception`
                    # (not a bare `except`) so KeyboardInterrupt/SystemExit and other
                    # BaseException-derived signals are not swallowed here.
                    # NOTE(review): if ephemeral clients share state within one
                    # process, this could drop a collection another session is
                    # using -- confirm against the installed chromadb version.
                    try:
                        chroma_client.delete_collection(name=COLLECTION_NAME)
                        logging.info(f"Deleted existing collection (if any): {COLLECTION_NAME}")
                    except Exception as delete_e:
                        logging.debug(f"No existing collection deleted ({COLLECTION_NAME}): {delete_e}")

                    logging.info(f"Creating collection: {COLLECTION_NAME}")
                    collection_instance = chroma_client.create_collection(
                        name=COLLECTION_NAME,
                        metadata={"hnsw:space": "cosine"}
                    )

                    logging.info(f"Adding {len(df)} documents to ChromaDB in batches of {ADD_BATCH_SIZE}...")
                    start_time = time.time()
                    error_count = _add_dataframe_in_batches(collection_instance, df)
                    end_time = time.time()
                    logging.info(f"Finished loading data into ChromaDB. Took {end_time - start_time:.2f} seconds.")
                    if error_count > 0:
                        logging.warning(f"Encountered errors in {error_count} batches during add.")

                    final_count = collection_instance.count()
                    logging.info(f"Final document count in Chroma collection: {final_count}")
                    if final_count > 0:
                        st.session_state.chroma_collection = collection_instance
                        st.success("Vector database loaded successfully!")
                        if error_count > 0:
                            # Surface partial loads instead of reporting plain success.
                            st.warning(
                                f"{error_count} batch(es) failed to load; search results may be incomplete. "
                                "Check logs for details."
                            )
                    else:
                        st.error("Failed to load documents into the vector database.")

                except Exception as setup_e:
                    st.error(f"Failed to setup ChromaDB: {setup_e}")
                    logging.exception(f"Failed to setup ChromaDB: {setup_e}")
            else:
                st.error("Failed to load data from the dataset. Cannot initialize database.")

# Assign collection from session state for use in the app
collection = st.session_state.get('chroma_collection', None)
|
| 217 |
# ---
|
| 218 |
|
| 219 |
# --- Helper Functions ---
|
| 220 |
def query_hf_inference(prompt, client_instance=None, model_name=HF_GENERATION_MODEL):
    """Send ``prompt`` to the HF Inference API and return the generated text.

    Args:
        prompt: Prompt text passed to the client's ``text_generation`` call.
        client_instance: Client to use. When omitted or falsy, the module-level
            ``generation_client`` is used instead.
        model_name: Appears only in log and error messages; it is not passed to
            the generation call.

    Returns:
        The generated text with surrounding whitespace stripped, or a string
        starting with ``"Error:"`` when no client is available, the response is
        empty or whitespace-only, or the call raises.
    """
    client = client_instance or generation_client
    if not client:
        logging.error("HF Inference client not initialized.")
        return "Error: HF Inference client failed to initialize."
    try:
        response_text = client.text_generation(prompt, max_new_tokens=MAX_NEW_TOKENS)
        # Strip before the emptiness check so a whitespace-only generation is
        # reported as an empty response rather than returned as "".
        answer = response_text.strip() if response_text else ""
        if not answer:
            logging.warning(f"Received empty response from HF Inference API ({model_name}).")
            return "Error: Received empty response from generation model."
        return answer
    except Exception as e:
        logging.exception(f"Error querying HF Inference API ({model_name}): {e}")
        return f"Error: An unexpected error occurred while generating the answer using {model_name}."
|
| 235 |
|
| 236 |
def generate_query_variations(query, llm_func, model_name=HF_GENERATION_MODEL, num_variations=3):
|
| 237 |
"""Uses LLM (HF Inference API) to generate alternative phrasings."""
|
| 238 |
+
# ... (rest of function remains the same) ...
|
| 239 |
prompt = f"""Given the user query: "{query}"
|
| 240 |
Generate {num_variations} alternative phrasings or related queries someone might use to find the same information.
|
| 241 |
Focus on synonyms, different levels of specificity, and related concepts.
|
|
|
|
| 268 |
logging.error(f"Failed to generate query variations: {e}")
|
| 269 |
return []
|
| 270 |
|
| 271 |
+
|
| 272 |
def generate_prompt(query, context_chunks):
|
| 273 |
"""Generates a prompt for the LLM."""
|
| 274 |
+
# ... (function remains the same) ...
|
| 275 |
context_str = "\n\n".join(context_chunks)
|
| 276 |
liaison_directory_url = "https://libguides.gc.cuny.edu/directory/subject"
|
| 277 |
prompt = f"""Based on the following context from the library guides, answer the user's question.
|
|
|
|
| 289 |
return prompt
|
| 290 |
|
| 291 |
# --- Streamlit App UI ---
|
| 292 |
+
st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)")

# Guard: without a loaded collection, report the failure and stop this run.
if not collection:
    st.error("Application initialization failed: Vector database not loaded.")
    st.stop()

# User input (only reached when the collection is ready)
query = st.text_area("Enter your question:", height=100)
|
| 300 |
|
| 301 |
# --- Routing Prompt Definition ---
|