Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -12,8 +12,8 @@ from tqdm import tqdm
|
|
| 12 |
from datasets import load_dataset
|
| 13 |
import pandas as pd
|
| 14 |
from sentence_transformers import SentenceTransformer
|
| 15 |
-
|
| 16 |
-
import chromadb.config
|
| 17 |
|
| 18 |
# --- Page Config (MUST BE FIRST Streamlit call) ---
|
| 19 |
st.set_page_config(layout="wide")
|
|
@@ -25,7 +25,7 @@ LOCAL_EMBEDDING_MODEL = 'BAAI/bge-m3' # Local model for QUERY embedding
|
|
| 25 |
HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
|
| 26 |
HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
|
| 27 |
PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
|
| 28 |
-
ADD_BATCH_SIZE = 500 # Batch size for adding to
|
| 29 |
TOP_K = 10
|
| 30 |
INITIAL_N_RESULTS = 50
|
| 31 |
MAX_NEW_TOKENS = 512
|
|
@@ -129,12 +129,18 @@ generation_client = initialize_hf_client()
|
|
| 129 |
embedding_model = load_local_embedding_model()
|
| 130 |
# ---
|
| 131 |
|
| 132 |
-
# --- Setup ChromaDB Collection (using Session State) ---
|
| 133 |
-
# This function now attempts to load or create the collection and stores it in session state
|
| 134 |
def setup_chroma_collection():
|
|
|
|
| 135 |
if 'chroma_collection' in st.session_state and st.session_state.chroma_collection is not None:
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
# Proceed with setup only if essential components are loaded
|
| 140 |
if not embedding_model or not generation_client:
|
|
@@ -147,17 +153,23 @@ def setup_chroma_collection():
|
|
| 147 |
st.error("Failed to load embedding data. Cannot initialize vector database.")
|
| 148 |
return None
|
| 149 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
try:
|
| 151 |
-
logging.info("Initializing
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
allow_reset=True # Optional: Allows resetting
|
| 157 |
-
)
|
| 158 |
)
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
# Check if collection exists and delete if it does
|
| 161 |
try:
|
| 162 |
existing_collections = [col.name for col in chroma_client.list_collections()]
|
| 163 |
if COLLECTION_NAME in existing_collections:
|
|
@@ -166,7 +178,6 @@ def setup_chroma_collection():
|
|
| 166 |
except Exception as delete_e:
|
| 167 |
logging.warning(f"Could not check/delete existing collection (might be okay): {delete_e}")
|
| 168 |
|
| 169 |
-
|
| 170 |
logging.info(f"Creating collection: {COLLECTION_NAME}")
|
| 171 |
collection_instance = chroma_client.create_collection(
|
| 172 |
name=COLLECTION_NAME,
|
|
@@ -234,7 +245,6 @@ def setup_chroma_collection():
|
|
| 234 |
return None
|
| 235 |
|
| 236 |
# --- Initialize collection ---
|
| 237 |
-
# Call the setup function which populates session state if needed
|
| 238 |
collection = setup_chroma_collection()
|
| 239 |
# ---
|
| 240 |
|
|
|
|
| 12 |
from datasets import load_dataset
|
| 13 |
import pandas as pd
|
| 14 |
from sentence_transformers import SentenceTransformer
|
| 15 |
+
import tempfile # Added for temporary directory
|
| 16 |
+
import chromadb.config # Added for Settings
|
| 17 |
|
| 18 |
# --- Page Config (MUST BE FIRST Streamlit call) ---
|
| 19 |
st.set_page_config(layout="wide")
|
|
|
|
| 25 |
HF_GENERATION_MODEL = "google/gemma-3-27b-it" # HF model for generation
|
| 26 |
HF_DATASET_ID = "Zwounds/Libguides_Embeddings" # Your HF Dataset ID
|
| 27 |
PARQUET_FILENAME = "libguides_embeddings.parquet" # Filename within the dataset
|
| 28 |
+
ADD_BATCH_SIZE = 500 # Batch size for adding to Chroma
|
| 29 |
TOP_K = 10
|
| 30 |
INITIAL_N_RESULTS = 50
|
| 31 |
MAX_NEW_TOKENS = 512
|
|
|
|
| 129 |
embedding_model = load_local_embedding_model()
|
| 130 |
# ---
|
| 131 |
|
| 132 |
+
# --- Setup ChromaDB Collection (using Session State and Temp Dir) ---
|
|
|
|
| 133 |
def setup_chroma_collection():
|
| 134 |
+
"""Loads data from HF, sets up ChromaDB in a temp dir, populates it, and returns the collection."""
|
| 135 |
if 'chroma_collection' in st.session_state and st.session_state.chroma_collection is not None:
|
| 136 |
+
# Basic check: see if collection is queryable
|
| 137 |
+
try:
|
| 138 |
+
st.session_state.chroma_collection.peek(1) # Try a lightweight operation
|
| 139 |
+
logging.info("Using existing Chroma collection from session state.")
|
| 140 |
+
return st.session_state.chroma_collection
|
| 141 |
+
except Exception as e:
|
| 142 |
+
logging.warning(f"Error accessing existing collection in session state ({e}), re-initializing.")
|
| 143 |
+
st.session_state.chroma_collection = None # Force re-init
|
| 144 |
|
| 145 |
# Proceed with setup only if essential components are loaded
|
| 146 |
if not embedding_model or not generation_client:
|
|
|
|
| 153 |
st.error("Failed to load embedding data. Cannot initialize vector database.")
|
| 154 |
return None
|
| 155 |
|
| 156 |
+
# Create a temporary directory for this session
|
| 157 |
+
# Note: tempfile.mkdtemp() never deletes this directory itself; it stays on disk until removed explicitly or the environment clears its temp storage.
|
| 158 |
+
# In HF Spaces ephemeral storage, it will likely be wiped on restart anyway.
|
| 159 |
+
temp_dir = tempfile.mkdtemp()
|
| 160 |
+
logging.info(f"Created temporary directory for ChromaDB: {temp_dir}")
|
| 161 |
+
|
| 162 |
try:
|
| 163 |
+
logging.info("Initializing ChromaDB client with temporary storage...")
|
| 164 |
+
settings = chromadb.config.Settings(
|
| 165 |
+
persist_directory=temp_dir,
|
| 166 |
+
anonymized_telemetry=False,
|
| 167 |
+
is_persistent=True # Explicitly set for PersistentClient behavior in temp dir
|
|
|
|
|
|
|
| 168 |
)
|
| 169 |
+
# Use the standard Client, but point it to the temp directory
|
| 170 |
+
chroma_client = chromadb.Client(settings=settings)
|
| 171 |
|
| 172 |
+
# Check if collection exists and delete if it does
|
| 173 |
try:
|
| 174 |
existing_collections = [col.name for col in chroma_client.list_collections()]
|
| 175 |
if COLLECTION_NAME in existing_collections:
|
|
|
|
| 178 |
except Exception as delete_e:
|
| 179 |
logging.warning(f"Could not check/delete existing collection (might be okay): {delete_e}")
|
| 180 |
|
|
|
|
| 181 |
logging.info(f"Creating collection: {COLLECTION_NAME}")
|
| 182 |
collection_instance = chroma_client.create_collection(
|
| 183 |
name=COLLECTION_NAME,
|
|
|
|
| 245 |
return None
|
| 246 |
|
| 247 |
# --- Initialize collection ---
|
|
|
|
| 248 |
collection = setup_chroma_collection()
|
| 249 |
# ---
|
| 250 |
|