cstr committed on
Commit
5aee2b7
·
verified ·
1 Parent(s): 73fc56a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -14
app.py CHANGED
@@ -257,7 +257,7 @@ def create_indexed_database():
257
  database_uploaded = progress.get("database_uploaded", False)
258
  indexing_complete = progress.get("indexing_complete", False)
259
 
260
- # If fully complete, try to download
261
  if indexing_complete:
262
  log_progress("Fully indexed database exists in HF!", "SUCCESS")
263
  log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
@@ -271,8 +271,7 @@ def create_indexed_database():
271
  )
272
  log_progress(f"Downloaded: {indexed_path}", "SUCCESS")
273
 
274
- # CRITICAL CHECK: Verify downloaded database actually has indices
275
- log_progress("Verifying downloaded database has indices...", "INFO")
276
  has_indices, idx_count = verify_database_has_indices(indexed_path)
277
 
278
  if has_indices:
@@ -280,32 +279,67 @@ def create_indexed_database():
280
  return indexed_path
281
  else:
282
  log_progress(f" ❌ CORRUPTED: Only {idx_count} indices (expected 4+)", "ERROR")
283
- log_progress(" The uploaded database is missing indices!", "ERROR")
284
- log_progress(" This was caused by WAL mode not being checkpointed.", "ERROR")
285
  log_progress(" Forcing re-indexing...", "WARN")
286
 
287
- # Reset progress to force rebuild
288
  indexing_complete = False
289
  completed_indices = set()
290
  analyzed_tables = set()
291
  database_uploaded = False
292
-
293
- # Update remote to clear the bad state
294
  update_remote_progress([], [], database_uploaded=False, indexing_complete=False)
295
 
296
  except Exception as e:
297
  log_progress(f"Download failed: {e}", "ERROR")
298
- log_progress("Will create locally", "INFO")
299
 
300
- # Check for partial progress
301
- if completed_indices or analyzed_tables:
302
- log_progress("Resuming from checkpoint:", "INFO")
303
  log_progress(f" Completed indices: {sorted(completed_indices)}", "INFO")
304
  log_progress(f" Analyzed tables: {sorted(analyzed_tables)}", "INFO")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
- # Download original database if needed
307
  if not os.path.exists(LOCAL_DB_PATH):
308
- log_progress("Downloading original database...", "INFO")
 
 
 
 
 
 
 
309
  original_path = hf_hub_download(
310
  repo_id=ORIGINAL_REPO_ID,
311
  filename=ORIGINAL_DB_FILENAME,
 
257
  database_uploaded = progress.get("database_uploaded", False)
258
  indexing_complete = progress.get("indexing_complete", False)
259
 
260
+ # If fully complete, download and return
261
  if indexing_complete:
262
  log_progress("Fully indexed database exists in HF!", "SUCCESS")
263
  log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
 
271
  )
272
  log_progress(f"Downloaded: {indexed_path}", "SUCCESS")
273
 
274
+ # Verify indices
 
275
  has_indices, idx_count = verify_database_has_indices(indexed_path)
276
 
277
  if has_indices:
 
279
  return indexed_path
280
  else:
281
  log_progress(f" ❌ CORRUPTED: Only {idx_count} indices (expected 4+)", "ERROR")
 
 
282
  log_progress(" Forcing re-indexing...", "WARN")
283
 
284
+ # Reset progress
285
  indexing_complete = False
286
  completed_indices = set()
287
  analyzed_tables = set()
288
  database_uploaded = False
 
 
289
  update_remote_progress([], [], database_uploaded=False, indexing_complete=False)
290
 
291
  except Exception as e:
292
  log_progress(f"Download failed: {e}", "ERROR")
 
293
 
294
+ # CRITICAL FIX: If ANY progress exists, download from OUR repo (not original!)
295
+ if (completed_indices or analyzed_tables or database_uploaded) and not os.path.exists(LOCAL_DB_PATH):
296
+ log_progress("Checkpoint detected - downloading PARTIALLY INDEXED database from our repo...", "INFO")
297
  log_progress(f" Completed indices: {sorted(completed_indices)}", "INFO")
298
  log_progress(f" Analyzed tables: {sorted(analyzed_tables)}", "INFO")
299
+
300
+ try:
301
+ # Download from OUR repo (has indices!)
302
+ indexed_path = hf_hub_download(
303
+ repo_id=INDEXED_REPO_ID,
304
+ filename=INDEXED_DB_FILENAME,
305
+ repo_type="dataset",
306
+ token=HF_TOKEN
307
+ )
308
+
309
+ log_progress(f"Downloaded partial DB: {indexed_path}", "SUCCESS")
310
+
311
+ # Verify it has the expected indices
312
+ has_indices, idx_count = verify_database_has_indices(indexed_path)
313
+ log_progress(f" Verified: {idx_count} indices present", "SUCCESS" if idx_count >= len(completed_indices) else "WARN")
314
+
315
+ # Copy to working location
316
+ log_progress(f"Copying to {LOCAL_DB_PATH}...", "INFO")
317
+ start = time.time()
318
+ shutil.copy2(indexed_path, LOCAL_DB_PATH)
319
+ elapsed = time.time() - start
320
+ log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
321
+
322
+ # Successfully resumed from checkpoint
323
+ log_progress("Resuming from our partially indexed database ✅", "SUCCESS")
324
+
325
+ except Exception as e:
326
+ log_progress(f"Could not download from our repo: {e}", "WARN")
327
+ log_progress("Will fall back to original database and rebuild indices", "WARN")
328
+
329
+ # Reset since we can't continue
330
+ completed_indices = set()
331
+ analyzed_tables = set()
332
 
333
+ # Only download ORIGINAL if we have NO local DB and NO checkpoint
334
  if not os.path.exists(LOCAL_DB_PATH):
335
+ if completed_indices or analyzed_tables:
336
+ log_progress("WARNING: Should have downloaded from our repo but failed!", "ERROR")
337
+ log_progress("Clearing progress and starting from scratch...", "WARN")
338
+ completed_indices = set()
339
+ analyzed_tables = set()
340
+ update_remote_progress([], [], database_uploaded=False, indexing_complete=False)
341
+
342
+ log_progress("Downloading ORIGINAL database (no checkpoint exists)...", "INFO")
343
  original_path = hf_hub_download(
344
  repo_id=ORIGINAL_REPO_ID,
345
  filename=ORIGINAL_DB_FILENAME,