Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -93,12 +93,10 @@ def load_data_and_setup_chroma():
|
|
| 93 |
# Ensure dependent resources are loaded first
|
| 94 |
if not generation_client or not embedding_model:
|
| 95 |
st.error("Required clients/models not initialized. Cannot proceed.")
|
| 96 |
-
# Potentially redundant with individual init checks, but safe
|
| 97 |
st.stop()
|
| 98 |
|
| 99 |
try:
|
| 100 |
logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
|
| 101 |
-
# Download the specific parquet file from the dataset repo
|
| 102 |
try:
|
| 103 |
parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
|
| 104 |
logging.info(f"Downloaded dataset file to: {parquet_path}")
|
|
@@ -111,24 +109,21 @@ def load_data_and_setup_chroma():
|
|
| 111 |
df = pd.read_parquet(parquet_path)
|
| 112 |
logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
|
| 113 |
|
| 114 |
-
# Verify required columns
|
| 115 |
required_cols = ['id', 'document', 'embedding', 'metadata']
|
| 116 |
if not all(col in df.columns for col in required_cols):
|
| 117 |
st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
|
| 118 |
logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
|
| 119 |
st.stop()
|
| 120 |
|
| 121 |
-
# Ensure embeddings are lists of floats
|
| 122 |
logging.info("Ensuring embeddings are in list format...")
|
| 123 |
-
|
| 124 |
-
if not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float):
|
| 125 |
df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
|
| 126 |
logging.info("Converted embeddings to list[float].")
|
| 127 |
else:
|
| 128 |
-
logging.info("Embeddings already seem to be in list[float] format.")
|
| 129 |
|
| 130 |
initial_rows = len(df)
|
| 131 |
-
df.dropna(subset=['embedding'], inplace=True)
|
| 132 |
if len(df) < initial_rows:
|
| 133 |
logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
|
| 134 |
|
|
@@ -138,7 +133,13 @@ def load_data_and_setup_chroma():
|
|
| 138 |
st.stop()
|
| 139 |
|
| 140 |
logging.info("Initializing in-memory ChromaDB client...")
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
try:
|
| 144 |
chroma_client.delete_collection(name=COLLECTION_NAME)
|
|
@@ -146,7 +147,6 @@ def load_data_and_setup_chroma():
|
|
| 146 |
except: pass
|
| 147 |
|
| 148 |
logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
|
| 149 |
-
# Create collection WITHOUT embedding function
|
| 150 |
collection = chroma_client.create_collection(
|
| 151 |
name=COLLECTION_NAME,
|
| 152 |
metadata={"hnsw:space": "cosine"}
|
|
@@ -164,47 +164,41 @@ def load_data_and_setup_chroma():
|
|
| 164 |
batch_df = df.iloc[start_idx:end_idx]
|
| 165 |
|
| 166 |
try:
|
| 167 |
-
#
|
| 168 |
-
|
| 169 |
-
if metadatas_list and isinstance(metadatas_list[0], dict):
|
| 170 |
-
pass # Already list of dicts
|
| 171 |
-
else:
|
| 172 |
-
# Attempt to parse if they are JSON strings, otherwise use empty dicts
|
| 173 |
-
parsed_metadatas = []
|
| 174 |
-
for item in metadatas_list:
|
| 175 |
-
try:
|
| 176 |
-
parsed = json.loads(item) if isinstance(item, str) else item
|
| 177 |
-
parsed_metadatas.append(parsed if isinstance(parsed, dict) else {})
|
| 178 |
-
except:
|
| 179 |
-
parsed_metadatas.append({})
|
| 180 |
-
metadatas_list = parsed_metadatas # This line has the wrong indentation
|
| 181 |
-
|
| 182 |
-
# --- Clean None values from metadata ---
|
| 183 |
cleaned_metadatas = []
|
| 184 |
-
for
|
| 185 |
cleaned_dict = {}
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
if value is None:
|
| 190 |
-
cleaned_dict[key] = ""
|
| 191 |
elif isinstance(value, (str, int, float, bool)):
|
| 192 |
-
cleaned_dict[key] = value
|
| 193 |
else:
|
| 194 |
-
# Attempt to convert
|
| 195 |
-
try:
|
| 196 |
cleaned_dict[key] = str(value)
|
| 197 |
logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
|
| 198 |
except:
|
| 199 |
logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
|
| 200 |
cleaned_metadatas.append(cleaned_dict)
|
| 201 |
-
# -----------------------------------------
|
| 202 |
|
|
|
|
| 203 |
collection.add(
|
| 204 |
ids=batch_df['id'].tolist(),
|
| 205 |
embeddings=batch_df['embedding'].tolist(),
|
| 206 |
documents=batch_df['document'].tolist(),
|
| 207 |
-
metadatas=cleaned_metadatas # Use cleaned list
|
| 208 |
)
|
| 209 |
except Exception as e:
|
| 210 |
logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
|
|
@@ -217,6 +211,13 @@ def load_data_and_setup_chroma():
|
|
| 217 |
if error_count > 0:
|
| 218 |
logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
|
| 219 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
st.success("Embeddings loaded successfully!")
|
| 221 |
return collection
|
| 222 |
|
|
|
|
| 93 |
# Ensure dependent resources are loaded first
|
| 94 |
if not generation_client or not embedding_model:
|
| 95 |
st.error("Required clients/models not initialized. Cannot proceed.")
|
|
|
|
| 96 |
st.stop()
|
| 97 |
|
| 98 |
try:
|
| 99 |
logging.info(f"Loading dataset '{HF_DATASET_ID}' from Hugging Face Hub...")
|
|
|
|
| 100 |
try:
|
| 101 |
parquet_path = hf_hub_download(repo_id=HF_DATASET_ID, filename=PARQUET_FILENAME, repo_type='dataset')
|
| 102 |
logging.info(f"Downloaded dataset file to: {parquet_path}")
|
|
|
|
| 109 |
df = pd.read_parquet(parquet_path)
|
| 110 |
logging.info(f"Dataset loaded into DataFrame with shape: {df.shape}")
|
| 111 |
|
|
|
|
| 112 |
required_cols = ['id', 'document', 'embedding', 'metadata']
|
| 113 |
if not all(col in df.columns for col in required_cols):
|
| 114 |
st.error(f"Dataset Parquet file is missing required columns. Found: {df.columns}. Required: {required_cols}")
|
| 115 |
logging.error(f"Dataset Parquet file missing required columns. Found: {df.columns}")
|
| 116 |
st.stop()
|
| 117 |
|
|
|
|
| 118 |
logging.info("Ensuring embeddings are in list format...")
|
| 119 |
+
if not df.empty and df['embedding'].iloc[0] is not None and (not isinstance(df['embedding'].iloc[0], list) or not isinstance(df['embedding'].iloc[0][0], float)):
|
|
|
|
| 120 |
df['embedding'] = df['embedding'].apply(lambda x: list(map(float, x)) if isinstance(x, (np.ndarray, list)) else None)
|
| 121 |
logging.info("Converted embeddings to list[float].")
|
| 122 |
else:
|
| 123 |
+
logging.info("Embeddings already seem to be in list[float] format or DataFrame is empty.")
|
| 124 |
|
| 125 |
initial_rows = len(df)
|
| 126 |
+
df.dropna(subset=['embedding'], inplace=True)
|
| 127 |
if len(df) < initial_rows:
|
| 128 |
logging.warning(f"Dropped {initial_rows - len(df)} rows due to invalid embedding format.")
|
| 129 |
|
|
|
|
| 133 |
st.stop()
|
| 134 |
|
| 135 |
logging.info("Initializing in-memory ChromaDB client...")
|
| 136 |
+
# Explicitly configure for in-memory using DuckDB+Parquet
|
| 137 |
+
settings = chromadb.config.Settings(
|
| 138 |
+
chroma_api_impl="local",
|
| 139 |
+
chroma_db_impl="duckdb+parquet",
|
| 140 |
+
persist_directory=None # Ensure no persistence is attempted
|
| 141 |
+
)
|
| 142 |
+
chroma_client = chromadb.Client(settings=settings)
|
| 143 |
|
| 144 |
try:
|
| 145 |
chroma_client.delete_collection(name=COLLECTION_NAME)
|
|
|
|
| 147 |
except: pass
|
| 148 |
|
| 149 |
logging.info(f"Creating in-memory collection: {COLLECTION_NAME}")
|
|
|
|
| 150 |
collection = chroma_client.create_collection(
|
| 151 |
name=COLLECTION_NAME,
|
| 152 |
metadata={"hnsw:space": "cosine"}
|
|
|
|
| 164 |
batch_df = df.iloc[start_idx:end_idx]
|
| 165 |
|
| 166 |
try:
|
| 167 |
+
# Prepare metadata for the batch
|
| 168 |
+
metadatas_list_raw = batch_df['metadata'].tolist()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
cleaned_metadatas = []
|
| 170 |
+
for item in metadatas_list_raw:
|
| 171 |
cleaned_dict = {}
|
| 172 |
+
# Handle potential non-dict items loaded from parquet/dataset
|
| 173 |
+
if isinstance(item, dict):
|
| 174 |
+
current_meta = item
|
| 175 |
+
else:
|
| 176 |
+
try: # Attempt to parse if it's a JSON string
|
| 177 |
+
current_meta = json.loads(item) if isinstance(item, str) else {}
|
| 178 |
+
except:
|
| 179 |
+
current_meta = {} # Default to empty dict if not dict or valid JSON
|
| 180 |
+
|
| 181 |
+
# Clean None values within the dictionary
|
| 182 |
+
if isinstance(current_meta, dict):
|
| 183 |
+
for key, value in current_meta.items():
|
| 184 |
if value is None:
|
| 185 |
+
cleaned_dict[key] = "" # Replace None with empty string
|
| 186 |
elif isinstance(value, (str, int, float, bool)):
|
| 187 |
+
cleaned_dict[key] = value # Keep allowed types
|
| 188 |
else:
|
| 189 |
+
try: # Attempt to convert others to string
|
|
|
|
| 190 |
cleaned_dict[key] = str(value)
|
| 191 |
logging.warning(f"Converted unexpected metadata type ({type(value)}) to string for key '{key}'.")
|
| 192 |
except:
|
| 193 |
logging.warning(f"Skipping metadata key '{key}' with unconvertible type {type(value)}.")
|
| 194 |
cleaned_metadatas.append(cleaned_dict)
|
|
|
|
| 195 |
|
| 196 |
+
# Add the batch with cleaned metadata
|
| 197 |
collection.add(
|
| 198 |
ids=batch_df['id'].tolist(),
|
| 199 |
embeddings=batch_df['embedding'].tolist(),
|
| 200 |
documents=batch_df['document'].tolist(),
|
| 201 |
+
metadatas=cleaned_metadatas # Use the cleaned list
|
| 202 |
)
|
| 203 |
except Exception as e:
|
| 204 |
logging.error(f"Error adding batch {i+1}/{num_batches} to in-memory Chroma: {e}")
|
|
|
|
| 211 |
if error_count > 0:
|
| 212 |
logging.warning(f"Encountered errors in {error_count} batches during add to Chroma.")
|
| 213 |
|
| 214 |
+
# Verify count after adding
|
| 215 |
+
final_count = collection.count()
|
| 216 |
+
logging.info(f"Final document count in Chroma collection: {final_count}")
|
| 217 |
+
if final_count == 0 and len(df) > 0:
|
| 218 |
+
st.warning("ChromaDB collection is empty after attempting to add documents. Check logs for errors.")
|
| 219 |
+
# Don't necessarily stop, but warn the user.
|
| 220 |
+
|
| 221 |
st.success("Embeddings loaded successfully!")
|
| 222 |
return collection
|
| 223 |
|