cstr committed on
Commit
73fc56a
·
verified ·
1 Parent(s): 228bc82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -47
app.py CHANGED
@@ -18,13 +18,12 @@ LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
18
 
19
  print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
20
 
21
- # Get HF token (multiple methods for compatibility)
22
  HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")
23
 
24
  if not HF_TOKEN:
25
  print("⚠️ WARNING: No HF_TOKEN found!")
26
  print(" Add HF_TOKEN in Space settings to enable checkpointing")
27
- print(" Go to: Settings > Variables and secrets > HF_TOKEN")
28
  else:
29
  print(f"βœ… HF_TOKEN found (length: {len(HF_TOKEN)})")
30
 
@@ -44,6 +43,30 @@ def log_progress(message, level="INFO"):
44
  }.get(level, "")
45
  print(f"[{timestamp}] {prefix} {message}")
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def check_remote_progress():
48
  """Check remote progress with detailed logging"""
49
  if not HF_TOKEN:
@@ -166,7 +189,7 @@ def update_remote_progress(completed_indices, analyzed_tables=None, database_upl
166
  return False
167
 
168
  def upload_database_checkpoint(message=""):
169
- """Upload database with progress reporting"""
170
  if not HF_TOKEN:
171
  log_progress("Cannot upload: No HF_TOKEN", "WARN")
172
  return False
@@ -176,6 +199,20 @@ def upload_database_checkpoint(message=""):
176
  return False
177
 
178
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  api = HfApi()
180
 
181
  db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)
@@ -220,9 +257,9 @@ def create_indexed_database():
220
  database_uploaded = progress.get("database_uploaded", False)
221
  indexing_complete = progress.get("indexing_complete", False)
222
 
223
- # If fully complete, download and return
224
  if indexing_complete:
225
- log_progress("Fully indexed database exists!", "SUCCESS")
226
  log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
227
 
228
  try:
@@ -233,38 +270,41 @@ def create_indexed_database():
233
  token=HF_TOKEN
234
  )
235
  log_progress(f"Downloaded: {indexed_path}", "SUCCESS")
236
- return indexed_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  except Exception as e:
239
  log_progress(f"Download failed: {e}", "ERROR")
240
  log_progress("Will create locally", "INFO")
241
 
242
- # NEW: If all indices complete OR partial work done, try to download indexed DB
243
- if completed_indices or analyzed_tables or database_uploaded:
244
- log_progress("Checkpoint detected - attempting to download indexed database...", "INFO")
245
  log_progress(f" Completed indices: {sorted(completed_indices)}", "INFO")
246
  log_progress(f" Analyzed tables: {sorted(analyzed_tables)}", "INFO")
247
-
248
- try:
249
- indexed_path = hf_hub_download(
250
- repo_id=INDEXED_REPO_ID,
251
- filename=INDEXED_DB_FILENAME,
252
- repo_type="dataset",
253
- token=HF_TOKEN
254
- )
255
-
256
- # Copy to local path
257
- log_progress(f"Downloaded indexed DB, copying to {LOCAL_DB_PATH}...", "INFO")
258
- shutil.copy2(indexed_path, LOCAL_DB_PATH)
259
- log_progress("Using downloaded indexed database", "SUCCESS")
260
-
261
- except Exception as e:
262
- log_progress(f"Could not download indexed DB: {e}", "WARN")
263
- log_progress("Will need to start from original and create indices", "INFO")
264
 
265
- # Only download original if we don't have local DB yet
266
  if not os.path.exists(LOCAL_DB_PATH):
267
- # Download and copy original
268
  log_progress("Downloading original database...", "INFO")
269
  original_path = hf_hub_download(
270
  repo_id=ORIGINAL_REPO_ID,
@@ -286,17 +326,12 @@ def create_indexed_database():
286
  shutil.copy2(original_path, LOCAL_DB_PATH)
287
  elapsed = time.time() - start
288
  log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
289
-
290
- # Clear completed indices since we're starting from scratch
291
- log_progress("Starting from original DB - clearing completed indices list", "WARN")
292
- completed_indices = set()
293
- analyzed_tables = set()
294
 
295
  # Connect to database
296
  conn = sqlite3.connect(LOCAL_DB_PATH)
297
  cursor = conn.cursor()
298
 
299
- # Enable optimizations
300
  cursor.execute("PRAGMA journal_mode = WAL")
301
  cursor.execute("PRAGMA synchronous = NORMAL")
302
  cursor.execute("PRAGMA cache_size = -512000")
@@ -339,7 +374,7 @@ def create_indexed_database():
339
  indexing_complete=False
340
  )
341
 
342
- # Upload checkpoint
343
  upload_database_checkpoint(f"Checkpoint: {idx_name} created ({i}/{len(indices_to_create)})")
344
 
345
  except Exception as e:
@@ -347,12 +382,11 @@ def create_indexed_database():
347
  conn.close()
348
  raise
349
 
350
- # PHASE 2: Analyze Tables (per-table with checkpoints)
351
  log_progress("="*60, "INFO")
352
  log_progress("PHASE 2: ANALYZING TABLES", "INFO")
353
  log_progress("="*60, "INFO")
354
 
355
- # Get list of tables
356
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
357
  tables = [row[0] for row in cursor.fetchall()]
358
 
@@ -365,7 +399,6 @@ def create_indexed_database():
365
 
366
  log_progress(f"[{i}/{len(tables)}] Analyzing table: {table}", "INFO")
367
 
368
- # Get table size
369
  try:
370
  cursor.execute(f"SELECT COUNT(*) FROM {table}")
371
  row_count = cursor.fetchone()[0]
@@ -376,7 +409,6 @@ def create_indexed_database():
376
  start = time.time()
377
 
378
  try:
379
- # Run ANALYZE on this specific table
380
  cursor.execute(f"ANALYZE {table}")
381
  conn.commit()
382
 
@@ -392,7 +424,7 @@ def create_indexed_database():
392
  indexing_complete=False
393
  )
394
 
395
- # Upload checkpoint after each table
396
  log_progress(f" Uploading checkpoint after analyzing {table}...", "CHECKPOINT")
397
  upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})")
398
 
@@ -400,17 +432,29 @@ def create_indexed_database():
400
  log_progress(f"Failed to analyze {table}: {e}", "ERROR")
401
  log_progress("Continuing with next table...", "WARN")
402
 
 
 
 
 
403
  conn.close()
 
404
 
405
- # PHASE 3: Final upload and completion
406
  log_progress("="*60, "INFO")
407
  log_progress("PHASE 3: FINAL UPLOAD", "INFO")
408
  log_progress("="*60, "INFO")
409
 
410
  log_progress("All indexing and analysis complete!", "SUCCESS")
411
- log_progress("Performing final upload...", "INFO")
412
 
413
- upload_database_checkpoint("Final indexed database - COMPLETE")
 
 
 
 
 
 
 
 
414
 
415
  # Mark as complete
416
  update_remote_progress(
@@ -455,10 +499,13 @@ def verify_indices():
455
  cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
456
  custom_indices = cursor.fetchall()
457
 
458
- log_progress(f"Custom indices: {len(custom_indices)}", "INFO")
459
  for idx in custom_indices:
460
  log_progress(f" βœ“ {idx[0]}", "SUCCESS")
461
 
 
 
 
462
  # Speed test
463
  log_progress("Running speed test...", "INFO")
464
  start = time.time()
@@ -469,6 +516,9 @@ def verify_indices():
469
  status = "SUCCESS" if elapsed < 1 else "WARN" if elapsed < 5 else "ERROR"
470
  log_progress(f"Query: {count} results in {elapsed:.3f}s", status)
471
 
 
 
 
472
  log_progress("="*60, "INFO")
473
 
474
  verify_indices()
@@ -645,7 +695,7 @@ def get_schema_info():
645
  # UI
646
  with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
647
  gr.Markdown(f"# 🧠 ConceptNet ({', '.join([l.upper() for l in TARGET_LANGUAGES])})")
648
- gr.Markdown(f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID}) | βœ… Per-table checkpoints")
649
 
650
  with gr.Tabs():
651
  with gr.TabItem("πŸ” Profile"):
@@ -675,7 +725,7 @@ with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
675
  schema_btn = gr.Button("πŸ“Š Load")
676
  schema_output = gr.Markdown()
677
 
678
- gr.Markdown("---\nβœ… **Per-table ANALYZE with checkpoints!** Check server logs for detailed progress.")
679
 
680
  semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
681
  query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
@@ -683,5 +733,5 @@ with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
683
  schema_btn.click(get_schema_info, None, schema_output)
684
 
685
  if __name__ == "__main__":
686
- log_progress("App ready with per-table ANALYZE checkpoints!", "SUCCESS")
687
  demo.launch(ssr_mode=False)
 
18
 
19
  print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
20
 
21
+ # Get HF token
22
  HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")
23
 
24
  if not HF_TOKEN:
25
  print("⚠️ WARNING: No HF_TOKEN found!")
26
  print(" Add HF_TOKEN in Space settings to enable checkpointing")
 
27
  else:
28
  print(f"βœ… HF_TOKEN found (length: {len(HF_TOKEN)})")
29
 
 
43
  }.get(level, "")
44
  print(f"[{timestamp}] {prefix} {message}")
45
 
46
def verify_database_has_indices(db_path, min_indices=4):
    """
    Verify that a SQLite database file actually contains the custom indices.

    Counts indices whose names start with ``idx_`` (the naming convention
    used by this app's index-creation phase) by querying ``sqlite_master``.

    Args:
        db_path: Path to the SQLite database file on disk.
        min_indices: Minimum number of ``idx_``-prefixed indices required
            for the database to be considered valid (default 4, matching
            the number of indices this app creates).

    Returns:
        Tuple ``(has_indices, index_count)``: ``has_indices`` is True when
        at least ``min_indices`` custom indices are present; on a missing
        file or any database error, returns ``(False, 0)``.
    """
    if not os.path.exists(db_path):
        return False, 0

    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()

        # Custom indices follow the idx_* naming convention used elsewhere
        # in this app; anything else (autoindexes etc.) is ignored.
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'"
        )
        custom_indices = cursor.fetchall()

        return len(custom_indices) >= min_indices, len(custom_indices)

    except Exception as e:
        # Any failure (corrupt file, locked DB, not a database) counts as
        # "not verified" rather than propagating — callers use this as a
        # health check, not a hard requirement.
        log_progress(f"Error verifying indices: {e}", "ERROR")
        return False, 0

    finally:
        # BUG FIX: the original closed the connection only on the success
        # path, leaking it whenever connect()/execute() raised. Always
        # close here.
        if conn is not None:
            conn.close()
70
  def check_remote_progress():
71
  """Check remote progress with detailed logging"""
72
  if not HF_TOKEN:
 
189
  return False
190
 
191
  def upload_database_checkpoint(message=""):
192
+ """Upload database with proper WAL checkpoint"""
193
  if not HF_TOKEN:
194
  log_progress("Cannot upload: No HF_TOKEN", "WARN")
195
  return False
 
199
  return False
200
 
201
  try:
202
+ # CRITICAL FIX: Checkpoint WAL before upload
203
+ log_progress("Checkpointing WAL to merge changes into main file...", "INFO")
204
+ conn = sqlite3.connect(LOCAL_DB_PATH)
205
+ conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
206
+ conn.close()
207
+ log_progress(" WAL checkpoint complete", "SUCCESS")
208
+
209
+ # Verify indices are actually in the file
210
+ has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
211
+ if has_indices:
212
+ log_progress(f" Verified: {idx_count} indices present in file", "SUCCESS")
213
+ else:
214
+ log_progress(f" WARNING: Only {idx_count} indices found (expected 4+)", "WARN")
215
+
216
  api = HfApi()
217
 
218
  db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)
 
257
  database_uploaded = progress.get("database_uploaded", False)
258
  indexing_complete = progress.get("indexing_complete", False)
259
 
260
+ # If fully complete, try to download
261
  if indexing_complete:
262
+ log_progress("Fully indexed database exists in HF!", "SUCCESS")
263
  log_progress(f"Downloading from {INDEXED_REPO_ID}...", "INFO")
264
 
265
  try:
 
270
  token=HF_TOKEN
271
  )
272
  log_progress(f"Downloaded: {indexed_path}", "SUCCESS")
273
+
274
+ # CRITICAL CHECK: Verify downloaded database actually has indices
275
+ log_progress("Verifying downloaded database has indices...", "INFO")
276
+ has_indices, idx_count = verify_database_has_indices(indexed_path)
277
+
278
+ if has_indices:
279
+ log_progress(f" βœ… Verified: {idx_count} indices found", "SUCCESS")
280
+ return indexed_path
281
+ else:
282
+ log_progress(f" ❌ CORRUPTED: Only {idx_count} indices (expected 4+)", "ERROR")
283
+ log_progress(" The uploaded database is missing indices!", "ERROR")
284
+ log_progress(" This was caused by WAL mode not being checkpointed.", "ERROR")
285
+ log_progress(" Forcing re-indexing...", "WARN")
286
+
287
+ # Reset progress to force rebuild
288
+ indexing_complete = False
289
+ completed_indices = set()
290
+ analyzed_tables = set()
291
+ database_uploaded = False
292
+
293
+ # Update remote to clear the bad state
294
+ update_remote_progress([], [], database_uploaded=False, indexing_complete=False)
295
 
296
  except Exception as e:
297
  log_progress(f"Download failed: {e}", "ERROR")
298
  log_progress("Will create locally", "INFO")
299
 
300
+ # Check for partial progress
301
+ if completed_indices or analyzed_tables:
302
+ log_progress("Resuming from checkpoint:", "INFO")
303
  log_progress(f" Completed indices: {sorted(completed_indices)}", "INFO")
304
  log_progress(f" Analyzed tables: {sorted(analyzed_tables)}", "INFO")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
305
 
306
+ # Download original database if needed
307
  if not os.path.exists(LOCAL_DB_PATH):
 
308
  log_progress("Downloading original database...", "INFO")
309
  original_path = hf_hub_download(
310
  repo_id=ORIGINAL_REPO_ID,
 
326
  shutil.copy2(original_path, LOCAL_DB_PATH)
327
  elapsed = time.time() - start
328
  log_progress(f"Copied in {elapsed:.1f}s", "SUCCESS")
 
 
 
 
 
329
 
330
  # Connect to database
331
  conn = sqlite3.connect(LOCAL_DB_PATH)
332
  cursor = conn.cursor()
333
 
334
+ # Enable optimizations (but we'll checkpoint WAL before upload!)
335
  cursor.execute("PRAGMA journal_mode = WAL")
336
  cursor.execute("PRAGMA synchronous = NORMAL")
337
  cursor.execute("PRAGMA cache_size = -512000")
 
374
  indexing_complete=False
375
  )
376
 
377
+ # Upload checkpoint (with WAL checkpoint!)
378
  upload_database_checkpoint(f"Checkpoint: {idx_name} created ({i}/{len(indices_to_create)})")
379
 
380
  except Exception as e:
 
382
  conn.close()
383
  raise
384
 
385
+ # PHASE 2: Analyze Tables
386
  log_progress("="*60, "INFO")
387
  log_progress("PHASE 2: ANALYZING TABLES", "INFO")
388
  log_progress("="*60, "INFO")
389
 
 
390
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
391
  tables = [row[0] for row in cursor.fetchall()]
392
 
 
399
 
400
  log_progress(f"[{i}/{len(tables)}] Analyzing table: {table}", "INFO")
401
 
 
402
  try:
403
  cursor.execute(f"SELECT COUNT(*) FROM {table}")
404
  row_count = cursor.fetchone()[0]
 
409
  start = time.time()
410
 
411
  try:
 
412
  cursor.execute(f"ANALYZE {table}")
413
  conn.commit()
414
 
 
424
  indexing_complete=False
425
  )
426
 
427
+ # Upload checkpoint
428
  log_progress(f" Uploading checkpoint after analyzing {table}...", "CHECKPOINT")
429
  upload_database_checkpoint(f"Checkpoint: {table} analyzed ({i}/{len(tables)})")
430
 
 
432
  log_progress(f"Failed to analyze {table}: {e}", "ERROR")
433
  log_progress("Continuing with next table...", "WARN")
434
 
435
+ # CRITICAL: Final WAL checkpoint before closing
436
+ log_progress("Performing final WAL checkpoint...", "INFO")
437
+ cursor.execute("PRAGMA wal_checkpoint(TRUNCATE)")
438
+ conn.commit()
439
  conn.close()
440
+ log_progress(" All changes merged to main database file", "SUCCESS")
441
 
442
+ # PHASE 3: Final upload
443
  log_progress("="*60, "INFO")
444
  log_progress("PHASE 3: FINAL UPLOAD", "INFO")
445
  log_progress("="*60, "INFO")
446
 
447
  log_progress("All indexing and analysis complete!", "SUCCESS")
 
448
 
449
+ # Verify one more time before final upload
450
+ has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
451
+ if has_indices:
452
+ log_progress(f"Final verification: {idx_count} indices confirmed in file βœ…", "SUCCESS")
453
+ else:
454
+ log_progress(f"WARNING: Only {idx_count} indices in file!", "ERROR")
455
+
456
+ log_progress("Performing final upload...", "INFO")
457
+ upload_database_checkpoint("Final indexed database - COMPLETE (with WAL checkpoint)")
458
 
459
  # Mark as complete
460
  update_remote_progress(
 
499
  cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
500
  custom_indices = cursor.fetchall()
501
 
502
+ log_progress(f"Custom indices: {len(custom_indices)}", "SUCCESS" if len(custom_indices) >= 4 else "ERROR")
503
  for idx in custom_indices:
504
  log_progress(f" βœ“ {idx[0]}", "SUCCESS")
505
 
506
+ if len(custom_indices) < 4:
507
+ log_progress("⚠️ WARNING: Expected 4 indices, something went wrong!", "ERROR")
508
+
509
  # Speed test
510
  log_progress("Running speed test...", "INFO")
511
  start = time.time()
 
516
  status = "SUCCESS" if elapsed < 1 else "WARN" if elapsed < 5 else "ERROR"
517
  log_progress(f"Query: {count} results in {elapsed:.3f}s", status)
518
 
519
+ if elapsed > 5:
520
+ log_progress("⚠️ Query is slow - indices may not be working!", "ERROR")
521
+
522
  log_progress("="*60, "INFO")
523
 
524
  verify_indices()
 
695
  # UI
696
  with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
697
  gr.Markdown(f"# 🧠 ConceptNet ({', '.join([l.upper() for l in TARGET_LANGUAGES])})")
698
+ gr.Markdown(f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID}) | βœ… Fixed WAL checkpoint issue")
699
 
700
  with gr.Tabs():
701
  with gr.TabItem("πŸ” Profile"):
 
725
  schema_btn = gr.Button("πŸ“Š Load")
726
  schema_output = gr.Markdown()
727
 
728
+ gr.Markdown("---\nβœ… **Fixed WAL checkpoint issue!** Database now properly contains indices. Will auto-rebuild if corrupted DB detected.")
729
 
730
  semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
731
  query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
 
733
  schema_btn.click(get_schema_info, None, schema_output)
734
 
735
  if __name__ == "__main__":
736
+ log_progress("App ready with WAL checkpoint fix!", "SUCCESS")
737
  demo.launch(ssr_mode=False)