Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,10 @@ from sentence_transformers import SentenceTransformer
|
|
| 16 |
# Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
|
| 17 |
# import chromadb.utils.embedding_functions as embedding_functions
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# --- Configuration ---
|
| 20 |
# DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
|
| 21 |
COLLECTION_NAME = "libguides_content"
|
|
@@ -86,39 +90,45 @@ embedding_model = load_local_embedding_model()
|
|
| 86 |
# --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
|
| 87 |
@st.cache_resource
|
| 88 |
def load_data_and_setup_chroma():
|
|
|
|
| 89 |
if not generation_client or not embedding_model:
|
| 90 |
st.error("Required clients/models not initialized. Cannot proceed.")
|
|
|
|
| 91 |
st.stop()
|
| 92 |
|
| 93 |
try:
|
| 94 |
logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
|
| 95 |
-
#
|
| 96 |
-
# Handle potential errors during download/load
|
| 97 |
try:
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
| 102 |
st.stop()
|
| 103 |
|
| 104 |
-
logging.info("
|
| 105 |
-
df =
|
| 106 |
logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
|
| 107 |
|
| 108 |
# Verify required columns
|
| 109 |
required_cols = ['id', 'document', 'embedding', 'metadata']
|
| 110 |
if not all(col in df.columns for col in required_cols):
|
| 111 |
-
st.error(f"Dataset is missing required columns. Found: {df.columns}. Required: {required_cols}")
|
| 112 |
-
logging.error(f"Dataset missing required columns. Found: {df.columns}")
|
| 113 |
st.stop()
|
| 114 |
|
| 115 |
-
# Ensure embeddings are lists of floats
|
| 116 |
-
# This might not be strictly necessary if ChromaDB handles numpy arrays, but safer to convert
|
| 117 |
logging.info("Ensuring embeddings are in list format...")
|
| 118 |
-
|
| 119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
initial_rows = len(df)
|
| 121 |
-
df.dropna(subset=['embedding'], inplace=True)
|
| 122 |
if len(df) < initial_rows:
|
| 123 |
logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
|
| 124 |
|
|
@@ -130,16 +140,16 @@ def load_data_and_setup_chroma():
|
|
| 130 |
logging.info("Initializing in-memory ChromaDB client...")
|
| 131 |
chroma_client = chromadb.Client() # In-memory client
|
| 132 |
|
| 133 |
-
# Delete collection if it somehow exists in memory (unlikely but safe)
|
| 134 |
try:
|
| 135 |
chroma_client.delete_collection(name=COLLECTION_NAME)
|
|
|
|
| 136 |
except: pass
|
| 137 |
|
| 138 |
logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
|
| 139 |
-
# Create collection WITHOUT embedding function
|
| 140 |
collection = chroma_client.create_collection(
|
| 141 |
name=COLLECTION_NAME,
|
| 142 |
-
metadata={"hnsw:space": "cosine"}
|
| 143 |
)
|
| 144 |
|
| 145 |
logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
|
|
@@ -154,11 +164,26 @@ def load_data_and_setup_chroma():
|
|
| 154 |
batch_df = df.iloc[start_idx:end_idx]
|
| 155 |
|
| 156 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
collection.add(
|
| 158 |
ids=batch_df['id'].tolist(),
|
| 159 |
embeddings=batch_df['embedding'].tolist(),
|
| 160 |
documents=batch_df['document'].tolist(),
|
| 161 |
-
metadatas=
|
| 162 |
)
|
| 163 |
except Exception as e:
|
| 164 |
logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
|
|
@@ -182,7 +207,7 @@ def load_data_and_setup_chroma():
|
|
| 182 |
st.error(f"Failed to load data and initialize ChromaDB: {e}")
|
| 183 |
logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
|
| 184 |
st.stop()
|
| 185 |
-
return None
|
| 186 |
|
| 187 |
# --- Load data and collection ---
|
| 188 |
collection = load_data_and_setup_chroma()
|
|
@@ -259,7 +284,6 @@ Answer:"""
|
|
| 259 |
return prompt
|
| 260 |
|
| 261 |
# --- Streamlit App UI ---
|
| 262 |
-
st.set_page_config(layout="wide")
|
| 263 |
st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
|
| 264 |
|
| 265 |
# User input (only proceed if collection loaded)
|
|
|
|
| 16 |
# Keep ChromaDB embedding function import only if needed elsewhere, otherwise remove
|
| 17 |
# import chromadb.utils.embedding_functions as embedding_functions
|
| 18 |
|
| 19 |
+
# --- Page Config (MUST BE FIRST Streamlit call) ---
|
| 20 |
+
st.set_page_config(layout="wide")
|
| 21 |
+
# ---
|
| 22 |
+
|
| 23 |
# --- Configuration ---
|
| 24 |
# DB_PATH = "./chroma_db" # No longer using persistent path for app runtime
|
| 25 |
COLLECTION_NAME = "libguides_content"
|
|
|
|
| 90 |
# --- Load Data from HF Dataset and Populate In-Memory ChromaDB ---
|
| 91 |
@st.cache_resource
|
| 92 |
def load_data_and_setup_chroma():
|
| 93 |
+
# Ensure dependent resources are loaded first
|
| 94 |
if not generation_client or not embedding_model:
|
| 95 |
st.error("Required clients/models not initialized. Cannot proceed.")
|
| 96 |
+
# Potentially redundant with individual init checks, but safe
|
| 97 |
st.stop()
|
| 98 |
|
| 99 |
try:
|
| 100 |
logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
|
| 101 |
+
# Download the specific parquet file from the dataset repo
|
|
|
|
| 102 |
try:
|
| 103 |
+
parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
|
| 104 |
+
logging.info(f"Downloaded dataset file to: {parquet_path}")
|
| 105 |
+
except Exception as download_e:
|
| 106 |
+
logging.error(f"Failed to download dataset file '{PARQUET_FILENAME}' from '{HF_DATASET_ID}': {download_e}")
|
| 107 |
+
st.error(f"Failed to download dataset '{HF_DATASET_ID}'. Check dataset ID, filename, and token permissions.")
|
| 108 |
st.stop()
|
| 109 |
|
| 110 |
+
logging.info(f"Loading Parquet file '{parquet_path}' into Pandas DataFrame...")
|
| 111 |
+
df = pd.read_parquet(parquet_path)
|
| 112 |
logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
|
| 113 |
|
| 114 |
# Verify required columns
|
| 115 |
required_cols = ['id', 'document', 'embedding', 'metadata']
|
| 116 |
if not all(col in df.columns for col in required_cols):
|
| 117 |
+
st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
|
| 118 |
+
logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
|
| 119 |
st.stop()
|
| 120 |
|
| 121 |
+
# Ensure embeddings are lists of floats
|
|
|
|
| 122 |
logging.info("Ensuring embeddings are in list format...")
|
| 123 |
+
# Check if the first embedding is already a list of floats, otherwise convert
|
| 124 |
+
if not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float):
|
| 125 |
+
df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
|
| 126 |
+
logging.info("Converted embeddings to list[float].")
|
| 127 |
+
else:
|
| 128 |
+
logging.info("Embeddings already seem to be in list[float] format.")
|
| 129 |
+
|
| 130 |
initial_rows = len(df)
|
| 131 |
+
df.dropna(subset=['embedding'], inplace=True) # Drop rows where embedding is None
|
| 132 |
if len(df) < initial_rows:
|
| 133 |
logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
|
| 134 |
|
|
|
|
| 140 |
logging.info("Initializing in-memory ChromaDB client...")
|
| 141 |
chroma_client = chromadb.Client() # In-memory client
|
| 142 |
|
|
|
|
| 143 |
try:
|
| 144 |
chroma_client.delete_collection(name=COLLECTION_NAME)
|
| 145 |
+
logging.info(f"Deleted existing in-memory collection (if any): {COLLECTION_NAME}")
|
| 146 |
except: pass
|
| 147 |
|
| 148 |
logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
|
| 149 |
+
# Create collection WITHOUT embedding function
|
| 150 |
collection = chroma_client.create_collection(
|
| 151 |
name=COLLECTION_NAME,
|
| 152 |
+
metadata={"hnsw:space": "cosine"}
|
| 153 |
)
|
| 154 |
|
| 155 |
logging.info(f"Adding {len(df)} documents to in-memory ChromaDB in batches of {ADD_BATCH_SIZE}...")
|
|
|
|
| 164 |
batch_df = df.iloc[start_idx:end_idx]
|
| 165 |
|
| 166 |
try:
|
| 167 |
+
# Normalize metadata to a list of dicts: keep dicts as-is, parse JSON strings, fall back to {} otherwise
|
| 168 |
+
metadatas_list = batch_df['metadata'].tolist()
|
| 169 |
+
if metadatas_list and isinstance(metadatas_list[0], dict):
|
| 170 |
+
pass # Already list of dicts
|
| 171 |
+
else:
|
| 172 |
+
# Attempt to parse if they are JSON strings, otherwise use empty dicts
|
| 173 |
+
parsed_metadatas = []
|
| 174 |
+
for item in metadatas_list:
|
| 175 |
+
try:
|
| 176 |
+
parsed = json.loads(item) if isinstance(item, str) else item
|
| 177 |
+
parsed_metadatas.append(parsed if isinstance(parsed, dict) else {})
|
| 178 |
+
except:
|
| 179 |
+
parsed_metadatas.append({})
|
| 180 |
+
metadatas_list = parsed_metadatas
|
| 181 |
+
|
| 182 |
collection.add(
|
| 183 |
ids=batch_df['id'].tolist(),
|
| 184 |
embeddings=batch_df['embedding'].tolist(),
|
| 185 |
documents=batch_df['document'].tolist(),
|
| 186 |
+
metadatas=metadatas_list
|
| 187 |
)
|
| 188 |
except Exception as e:
|
| 189 |
logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
|
|
|
|
| 207 |
st.error(f"Failed to load data and initialize ChromaDB: {e}")
|
| 208 |
logging.exception(f"An unexpected error occurred during data load/Chroma setup: {e}")
|
| 209 |
st.stop()
|
| 210 |
+
return None
|
| 211 |
|
| 212 |
# --- Load data and collection ---
|
| 213 |
collection = load_data_and_setup_chroma()
|
|
|
|
| 284 |
return prompt
|
| 285 |
|
| 286 |
# --- Streamlit App UI ---
|
|
|
|
| 287 |
st.title("📚 Ask the Library Guides (Dataset Embed + HF Gen)") # Updated title
|
| 288 |
|
| 289 |
# User input (only proceed if collection loaded)
|