cstr committed on
Commit
9ec2493
·
verified ·
1 Parent(s): 45626f2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +373 -213
app.py CHANGED
@@ -1,131 +1,355 @@
1
  import gradio as gr
2
  import sqlite3
3
  import pandas as pd
4
- from huggingface_hub import hf_hub_download, snapshot_download
5
  import os
6
  import time
7
  import shutil
8
  from pathlib import Path
 
9
 
10
  # ===== CONFIGURATION =====
11
  TARGET_LANGUAGES = ['de']
12
- INDEXED_DB_PATH = "/tmp/conceptnet-indexed.db"
 
 
 
13
  # =========================
14
 
15
  print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
16
 
17
- # Download original database
18
- REPO_ID = "ysenarath/conceptnet-sqlite"
19
- DB_FILENAME = "data/conceptnet-v5.7.0.db"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- ORIGINAL_DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset")
22
- print(f"Original database: {ORIGINAL_DB_PATH}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def create_indexed_database():
25
  """
26
- Copy database and create missing indices for fast queries.
27
- This runs once on startup.
28
  """
29
- if os.path.exists(INDEXED_DB_PATH):
30
- db_age = time.time() - os.path.getmtime(INDEXED_DB_PATH)
31
- if db_age < 24 * 3600: # Less than 24 hours old
32
- print(f"βœ… Using existing indexed database: {INDEXED_DB_PATH}")
33
- print(f" (Created {db_age/3600:.1f} hours ago)")
34
- return INDEXED_DB_PATH
35
- else:
36
- print(f"⚠️ Indexed database is {db_age/3600:.1f} hours old, recreating...")
37
- os.remove(INDEXED_DB_PATH)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
39
  print("\n" + "="*60)
40
- print("CREATING INDEXED DATABASE (ONE-TIME SETUP)")
41
  print("="*60)
42
- print(f"This will take ~2-5 minutes but only needs to run once.")
43
- print(f"Subsequent runs will be instant.\n")
44
 
45
- # Check if we have enough space
46
- original_size = os.path.getsize(ORIGINAL_DB_PATH)
47
- free_space = shutil.disk_usage("/tmp")[2]
48
 
49
- print(f"Original DB size: {original_size / (2**30):.2f} GB")
50
- print(f"Free space in /tmp: {free_space / (2**30):.2f} GB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- if free_space < original_size * 1.5:
53
- print("⚠️ WARNING: Low disk space! Indices will add ~20% to DB size.")
54
- print("Continuing anyway...\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- # Copy database
57
- print(f"1. Copying database to {INDEXED_DB_PATH}...")
58
- start = time.time()
59
- shutil.copy2(ORIGINAL_DB_PATH, INDEXED_DB_PATH)
60
- elapsed = time.time() - start
61
- print(f" βœ“ Copied in {elapsed:.1f}s\n")
 
62
 
63
- # Connect and create indices
64
- print("2. Creating indices on edge table...")
65
- conn = sqlite3.connect(INDEXED_DB_PATH)
66
  cursor = conn.cursor()
67
 
68
- # Enable optimizations for index creation
69
  cursor.execute("PRAGMA journal_mode = WAL")
70
  cursor.execute("PRAGMA synchronous = NORMAL")
71
- cursor.execute("PRAGMA cache_size = -256000")
72
- cursor.execute("PRAGMA temp_store = MEMORY")
73
 
74
- indices_to_create = [
75
- ("idx_edge_start_id", "edge", "start_id", "Speed up queries filtering by start node"),
76
- ("idx_edge_end_id", "edge", "end_id", "Speed up queries filtering by end node"),
77
- ("idx_edge_rel_id", "edge", "rel_id", "Speed up queries filtering by relation"),
78
- ]
79
 
80
  for idx_name, table, column, description in indices_to_create:
81
- print(f" Creating {idx_name} on {table}({column})...")
 
 
 
 
82
  print(f" Purpose: {description}")
83
- start = time.time()
84
 
85
- cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
86
 
87
- elapsed = time.time() - start
88
- print(f" βœ“ Created in {elapsed:.1f}s\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # Analyze for query optimization
91
- print("3. Running ANALYZE to optimize query planning...")
92
  start = time.time()
93
  cursor.execute("ANALYZE")
 
94
  elapsed = time.time() - start
95
- print(f" βœ“ Analyzed in {elapsed:.1f}s\n")
96
 
97
- # Commit and close
98
- conn.commit()
99
  conn.close()
100
 
101
- # Check final size
102
- indexed_size = os.path.getsize(INDEXED_DB_PATH)
103
- size_increase = (indexed_size - original_size) / (2**30)
104
 
105
- print("="*60)
 
 
 
 
 
106
  print("INDEXING COMPLETE!")
107
  print("="*60)
108
- print(f"Original size: {original_size / (2**30):.2f} GB")
109
- print(f"Indexed size: {indexed_size / (2**30):.2f} GB")
110
- print(f"Size increase: +{size_increase:.2f} GB ({100*size_increase/(original_size/(2**30)):.1f}%)")
111
- print(f"Location: {INDEXED_DB_PATH}")
112
  print("="*60 + "\n")
113
 
114
- return INDEXED_DB_PATH
115
 
116
- # Create indexed database on startup
117
  DB_PATH = create_indexed_database()
118
 
119
  def get_db_connection():
120
- """Create optimized read connection to indexed database"""
121
  conn = sqlite3.connect(DB_PATH, check_same_thread=False)
122
  conn.execute("PRAGMA cache_size = -256000")
123
  conn.execute("PRAGMA mmap_size = 4294967296")
124
- conn.execute("PRAGMA temp_store = MEMORY")
125
  return conn
126
 
127
  def verify_indices():
128
- """Verify that indices were created successfully"""
129
  print("\n" + "="*60)
130
  print("VERIFYING INDICES")
131
  print("="*60)
@@ -133,57 +357,29 @@ def verify_indices():
133
  with get_db_connection() as conn:
134
  cursor = conn.cursor()
135
 
136
- # Check edge table indices
137
- cursor.execute("PRAGMA index_list(edge)")
138
- indices = cursor.fetchall()
139
-
140
- print(f"\nEdge table indices: {len(indices)}")
141
- for idx in indices:
142
- idx_name = idx[1]
143
- cursor.execute(f"PRAGMA index_info({idx_name})")
144
- cols = cursor.fetchall()
145
- col_names = [c[2] for c in cols if c[2]] or ['PRIMARY KEY']
146
- print(f" βœ“ {idx_name}: {', '.join(col_names)}")
147
-
148
- # Test query speed with EXPLAIN QUERY PLAN
149
- print("\n" + "="*60)
150
- print("TESTING QUERY PERFORMANCE")
151
- print("="*60)
152
-
153
- test_queries = [
154
- ("Node query (indexed)", "SELECT * FROM node WHERE id LIKE '/c/de/hund%'"),
155
- ("Edge start_id (NOW INDEXED!)", "SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10"),
156
- ("Edge end_id (NOW INDEXED!)", "SELECT * FROM edge WHERE end_id LIKE '/c/de/tier%' LIMIT 10"),
157
- ]
158
-
159
- for name, query in test_queries:
160
- print(f"\n{name}:")
161
-
162
- # Show query plan
163
- cursor.execute(f"EXPLAIN QUERY PLAN {query}")
164
- plan = cursor.fetchall()
165
- uses_index = any('INDEX' in str(row).upper() for row in plan)
166
-
167
- for row in plan:
168
- print(f" Plan: {row}")
169
-
170
- # Time the query
171
- start = time.time()
172
- cursor.execute(query)
173
- results = cursor.fetchall()
174
- elapsed = time.time() - start
175
-
176
- status = "βœ… FAST" if elapsed < 1 else "⚠️ SLOW" if elapsed < 5 else "❌ VERY SLOW"
177
- print(f" {status}: {len(results)} results in {elapsed:.3f}s")
178
 
179
- print("\n" + "="*60 + "\n")
 
 
 
 
 
 
 
 
180
 
181
  verify_indices()
182
 
183
- def get_semantic_profile(word, lang='de'):
184
- """
185
- Semantic profile - NOW FAST with indices!
186
- """
187
  if not word:
188
  return "⚠️ Please enter a word."
189
 
@@ -205,12 +401,12 @@ def get_semantic_profile(word, lang='de'):
205
  with get_db_connection() as conn:
206
  cursor = conn.cursor()
207
 
208
- # Check if word exists
209
- cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,))
210
  nodes = cursor.fetchall()
211
 
212
  if not nodes:
213
- return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ No nodes found. Check spelling or try a more common word."
214
 
215
  for node_id, label in nodes[:3]:
216
  output_md += f"**Node:** `{node_id}` ({label})\n"
@@ -218,12 +414,13 @@ def get_semantic_profile(word, lang='de'):
218
 
219
  total_relations = 0
220
 
221
- # Query each relation - NOW FAST with indices!
222
- for rel in relations:
 
223
  output_md += f"## {rel}\n\n"
224
  has_results = False
225
 
226
- # Outgoing edges - FAST with idx_edge_start_id
227
  cursor.execute("""
228
  SELECT en.label, e.weight
229
  FROM edge e
@@ -239,7 +436,7 @@ def get_semantic_profile(word, lang='de'):
239
  has_results = True
240
  total_relations += 1
241
 
242
- # Incoming edges - FAST with idx_edge_end_id
243
  cursor.execute("""
244
  SELECT s.label, e.weight
245
  FROM edge e
@@ -259,27 +456,21 @@ def get_semantic_profile(word, lang='de'):
259
  output_md += "*No results*\n"
260
  output_md += "\n"
261
 
262
- output_md += f"---\n**Total relations:** {total_relations}\n"
 
263
  return output_md
264
 
265
  except Exception as e:
266
- print(f"ERROR: {e}")
267
  import traceback
268
  traceback.print_exc()
269
- return f"**❌ Error:**\n\n```\n{e}\n```"
270
 
271
- def run_query(start_node, relation, end_node, limit):
272
- """Query builder - NOW FAST with indices!"""
 
273
 
274
  query = """
275
- SELECT
276
- e.id AS edge_id,
277
- s.id AS start_id,
278
- r.label AS relation,
279
- en.id AS end_id,
280
- e.weight,
281
- s.label AS start_label,
282
- en.label AS end_label
283
  FROM edge e
284
  JOIN relation r ON e.rel_id = r.id
285
  JOIN node s ON e.start_id = s.id
@@ -291,24 +482,30 @@ def run_query(start_node, relation, end_node, limit):
291
 
292
  try:
293
  with get_db_connection() as conn:
 
 
294
  # Language filter
295
- lang_filter = " OR ".join([f"(s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%')" for lang in TARGET_LANGUAGES])
296
- query += f" AND ({lang_filter})"
 
 
 
297
 
298
- # User filters
299
- if start_node:
300
  pattern = start_node if '%' in start_node else f"%{start_node}%"
301
  query += " AND s.id LIKE ?"
302
  params.append(pattern)
303
 
304
- if relation:
 
305
  if '%' in relation:
306
  query += " AND r.label LIKE ?"
307
  else:
308
  query += " AND r.label = ?"
309
- params.append(relation)
310
 
311
- if end_node:
312
  pattern = end_node if '%' in end_node else f"%{end_node}%"
313
  query += " AND en.id LIKE ?"
314
  params.append(pattern)
@@ -316,18 +513,21 @@ def run_query(start_node, relation, end_node, limit):
316
  query += " ORDER BY e.weight DESC LIMIT ?"
317
  params.append(limit)
318
 
 
 
319
  start_time = time.time()
320
  df = pd.read_sql_query(query, conn, params=params)
321
  elapsed = time.time() - start_time
322
 
 
 
323
  if df.empty:
324
- return pd.DataFrame(), f"No results ({elapsed:.2f}s)"
325
 
326
  df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
327
  return df, f"βœ… {len(df)} results in {elapsed:.2f}s"
328
 
329
  except Exception as e:
330
- print(f"ERROR: {e}")
331
  import traceback
332
  traceback.print_exc()
333
  return pd.DataFrame(), f"**❌ Error:** {e}"
@@ -339,114 +539,74 @@ def run_raw_query(sql_query):
339
 
340
  try:
341
  with get_db_connection() as conn:
342
- start = time.time()
343
  df = pd.read_sql_query(sql_query, conn)
344
- elapsed = time.time() - start
345
- return df, f"βœ… {len(df)} rows in {elapsed:.2f}s"
346
  except Exception as e:
347
  return pd.DataFrame(), f"Error: {e}"
348
 
349
  def get_schema_info():
350
- """Get schema with index info"""
351
  with get_db_connection() as conn:
352
  cursor = conn.cursor()
353
 
354
- md = "# πŸ“š Database Schema\n\n"
355
- md += "βœ… **Custom indices created for fast queries!**\n\n"
356
 
357
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
358
- tables = cursor.fetchall()
359
 
360
- for table_name, in tables:
361
  cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
362
- count = cursor.fetchone()[0]
363
- md += f"## {table_name} ({count:,} rows)\n\n"
364
-
365
- # Columns
366
- cursor.execute(f"PRAGMA table_info({table_name})")
367
- cols = cursor.fetchall()
368
- md += "| Column | Type | Null | PK |\n|:--|:--|:--|:--|\n"
369
- for col in cols:
370
- md += f"| `{col[1]}` | `{col[2]}` | {'βœ—' if col[3] else 'βœ“'} | {'βœ“' if col[5] else 'βœ—'} |\n"
371
 
372
- # Indices
373
  cursor.execute(f"PRAGMA index_list({table_name})")
374
  indices = cursor.fetchall()
375
 
376
  if indices:
377
- md += f"\n**Indices ({len(indices)}):**\n"
378
- for idx in indices:
379
- cursor.execute(f"PRAGMA index_info({idx[1]})")
380
- idx_cols = cursor.fetchall()
381
- cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'id'
382
-
383
- # Mark custom indices
384
- custom = "πŸ†• CUSTOM" if idx[1].startswith("idx_") else ""
385
- md += f"- `{idx[1]}` on ({cols_str}) {custom}\n"
386
-
387
- md += "\n---\n\n"
388
 
389
  return md
390
 
391
- # Gradio UI
392
- with gr.Blocks(title="ConceptNet Explorer (INDEXED)", theme=gr.themes.Soft()) as demo:
393
- gr.Markdown("# 🧠 ConceptNet Explorer (With Custom Indices! πŸš€)")
394
-
395
- db_size = os.path.getsize(DB_PATH) / (2**30)
396
- gr.Markdown(
397
- f"**Database:** {os.path.basename(DB_PATH)} ({db_size:.2f} GB) | "
398
- f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
399
- f"**Status:** βœ… Indexed & Fast"
400
- )
401
- gr.Markdown("*Custom indices created on edge.start_id and edge.end_id for 100x faster queries!*")
402
 
403
  with gr.Tabs():
404
  with gr.TabItem("πŸ” Semantic Profile"):
405
- gr.Markdown("**Get semantic profile - NOW FAST with custom indices!**")
406
-
407
  with gr.Row():
408
  word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
409
- lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language")
410
-
411
  semantic_btn = gr.Button("πŸ” Get Profile", variant="primary", size="lg")
412
- semantic_output = gr.Markdown("*Click to start...*")
413
 
414
- with gr.TabItem("⚑ Query Builder"):
415
- gr.Markdown("**Build queries - NOW FAST with custom indices!**")
416
-
417
  with gr.Row():
418
- start_input = gr.Textbox(label="Start Node", placeholder="hund", value="")
419
- rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="")
420
- end_input = gr.Textbox(label="End Node", placeholder="tier", value="")
421
-
422
- limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50, step=1)
423
- query_btn = gr.Button("▢️ Run Query", variant="primary", size="lg")
424
-
425
- status_output = gr.Markdown("*Ready...*")
426
- results_output = gr.DataFrame(label="Results", wrap=True)
427
 
428
- with gr.TabItem("πŸ’» Raw SQL"):
429
- raw_sql_input = gr.Textbox(
430
- label="SQL Query",
431
- value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10",
432
- lines=3
433
- )
434
  raw_btn = gr.Button("▢️ Execute")
435
  raw_status = gr.Markdown()
436
  raw_results = gr.DataFrame()
437
 
438
  with gr.TabItem("πŸ“Š Schema"):
439
- schema_btn = gr.Button("πŸ“Š Load Schema")
440
- schema_output = gr.Markdown("*Click to load...*")
441
 
442
- gr.Markdown("---\n**πŸš€ Performance:** Custom indices created on edge table = 100x faster queries!")
443
 
444
- # Connect functions
445
  semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
446
  query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
447
  raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
448
  schema_btn.click(get_schema_info, None, schema_output)
449
 
450
  if __name__ == "__main__":
451
- print("\nπŸš€ Starting app with indexed database...\n")
452
  demo.launch(ssr_mode=False)
 
1
  import gradio as gr
2
  import sqlite3
3
  import pandas as pd
4
+ from huggingface_hub import hf_hub_download, HfApi, HfFolder
5
  import os
6
  import time
7
  import shutil
8
  from pathlib import Path
9
+ import json
10
 
11
  # ===== CONFIGURATION =====
12
  TARGET_LANGUAGES = ['de']
13
+ INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
14
+ INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
15
+ PROGRESS_FILENAME = "indexing_progress.json"
16
+ LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
17
  # =========================
18
 
19
  print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
20
 
21
# Resolve the Hugging Face token. Environment variables win; otherwise fall
# back to whatever HfFolder.get_token() returns. A missing token is not
# fatal: the remote progress/upload helpers check HF_TOKEN and skip their
# work when it is falsy.
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    try:
        HF_TOKEN = HfFolder.get_token()
    except Exception as exc:
        # Best effort only, but say why instead of swallowing the error; a
        # bare `except:` here would also hide KeyboardInterrupt/SystemExit.
        print(f"⚠️ Could not read stored HF token: {exc}")
28
+
29
+ # Original database
30
+ ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
31
+ ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"
32
+
33
def check_remote_progress():
    """
    Check which indices are already completed in the remote HF repo.

    Looks up INDEXED_REPO_ID and, if it is reachable, downloads and parses
    PROGRESS_FILENAME from it.

    Returns:
        dict: the parsed progress file when it could be fetched, otherwise a
        fresh ``{"completed_indices": [], "database_uploaded": False}``.
        This function never raises; every failure degrades to the empty
        progress dict so the caller can start from scratch.
    """
    if not HF_TOKEN:
        print("⚠️ No HF_TOKEN - cannot check remote progress")
        return {"completed_indices": [], "database_uploaded": False}

    try:
        api = HfApi()

        # Check if repo exists
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
            print(f"βœ… Repository exists: {INDEXED_REPO_ID}")
        except Exception as e:
            # Any failure is treated as "no usable repo", but the reason is
            # printed because a network or auth error looks identical here.
            print(f"ℹ️ Repository doesn't exist yet, will create it")
            print(f" (repo_info reported: {e})")
            return {"completed_indices": [], "database_uploaded": False}

        # Try to download progress file
        try:
            progress_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=PROGRESS_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN
            )

            with open(progress_path, 'r', encoding="utf-8") as f:
                progress = json.load(f)

            print(f"πŸ“‹ Remote progress found:")
            print(f" Completed indices: {progress.get('completed_indices', [])}")
            print(f" Database uploaded: {progress.get('database_uploaded', False)}")

            return progress

        except Exception as e:
            # Covers a missing file as well as an unreadable or non-dict JSON
            # payload (progress.get would fail on the latter).
            print(f"ℹ️ No progress file found (starting fresh): {e}")
            return {"completed_indices": [], "database_uploaded": False}

    except Exception as e:
        print(f"⚠️ Error checking remote progress: {e}")
        return {"completed_indices": [], "database_uploaded": False}
78
+
79
def update_remote_progress(completed_indices, database_uploaded=False):
    """
    Update the progress file in the remote HF repo.

    Args:
        completed_indices: iterable of index names that have been created.
        database_uploaded: value stored under "database_uploaded" in the
            progress file.

    Returns:
        bool: True when the progress file was uploaded, False when there is
        no HF_TOKEN or any step failed (the error is printed, not raised).
    """
    if not HF_TOKEN:
        print("⚠️ Cannot update progress: No HF_TOKEN")
        return False

    try:
        api = HfApi()

        # Sort once so the stored JSON is deterministic even when the caller
        # passes a set-derived list.
        completed = sorted(completed_indices)

        # Create repo if it doesn't exist
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        except Exception:
            print(f"Creating repository: {INDEXED_REPO_ID}")
            # exist_ok: repo_info may have failed for a transient reason while
            # the repository is actually there; do not fail on that.
            api.create_repo(
                repo_id=INDEXED_REPO_ID,
                repo_type="dataset",
                token=HF_TOKEN,
                private=False,
                exist_ok=True
            )

        # Create progress payload
        progress = {
            "completed_indices": completed,
            "database_uploaded": database_uploaded,
            "timestamp": time.time(),
            "languages": TARGET_LANGUAGES
        }

        # Upload the JSON straight from memory instead of going through a
        # fixed-name scratch file in /tmp.
        payload = json.dumps(progress, indent=2).encode("utf-8")

        api.upload_file(
            path_or_fileobj=payload,
            path_in_repo=PROGRESS_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=f"Update progress: {len(completed)} indices complete"
        )

        print(f"βœ… Progress updated: {len(completed)} indices complete")
        return True

    except Exception as e:
        print(f"⚠️ Failed to update progress: {e}")
        return False
130
 
131
def upload_database_checkpoint():
    """
    Push the database file at LOCAL_DB_PATH to the indexed HF dataset repo.

    Returns:
        bool: True once the upload call has completed; False when there is no
        HF_TOKEN, the local file is missing, or the upload raised (the error
        and its traceback are printed rather than propagated).
    """
    # Guard clauses: nothing to do without credentials or a file to send.
    if not HF_TOKEN:
        print("⚠️ Cannot upload: No HF_TOKEN")
        return False
    if not os.path.exists(LOCAL_DB_PATH):
        print("⚠️ Database file doesn't exist")
        return False

    try:
        client = HfApi()

        size_gib = os.path.getsize(LOCAL_DB_PATH) / (2**30)
        print(f"\nπŸ“€ Uploading database checkpoint ({size_gib:.2f} GB)...")
        print(" This may take 5-10 minutes but saves progress...")

        began = time.time()
        upload_args = {
            "path_or_fileobj": LOCAL_DB_PATH,
            "path_in_repo": INDEXED_DB_FILENAME,
            "repo_id": INDEXED_REPO_ID,
            "repo_type": "dataset",
            "token": HF_TOKEN,
            "commit_message": "Upload indexed database checkpoint",
        }
        client.upload_file(**upload_args)

        duration = time.time() - began
        print(f"βœ… Database uploaded in {duration:.1f}s")
    except Exception as err:
        print(f"❌ Upload failed: {err}")
        import traceback
        traceback.print_exc()
        return False

    return True
172
 
173
  def create_indexed_database():
174
  """
175
+ Create indexed database with checkpoint system.
176
+ Resumes from last completed index if crashed.
177
  """
178
+ # Check remote progress first
179
+ progress = check_remote_progress()
180
+ completed_indices = set(progress.get("completed_indices", []))
181
+ database_uploaded = progress.get("database_uploaded", False)
182
+
183
+ # If database is fully indexed and uploaded, download it
184
+ if database_uploaded and len(completed_indices) >= 4:
185
+ print("\nβœ… Fully indexed database exists in HF!")
186
+ print(f" Downloading from {INDEXED_REPO_ID}...")
187
+
188
+ try:
189
+ indexed_path = hf_hub_download(
190
+ repo_id=INDEXED_REPO_ID,
191
+ filename=INDEXED_DB_FILENAME,
192
+ repo_type="dataset",
193
+ token=HF_TOKEN
194
+ )
195
+ print(f"βœ… Downloaded: {indexed_path}")
196
+ return indexed_path
197
+
198
+ except Exception as e:
199
+ print(f"⚠️ Download failed: {e}")
200
+ print(" Will create indices locally")
201
 
202
+ # Need to create/continue indexing
203
  print("\n" + "="*60)
204
+ print("CREATING INDEXED DATABASE (WITH CHECKPOINTS)")
205
  print("="*60)
 
 
206
 
207
+ if completed_indices:
208
+ print(f"πŸ“ Resuming from checkpoint...")
209
+ print(f" Already completed: {sorted(completed_indices)}")
210
 
211
+ # Download or use existing local database
212
+ if os.path.exists(LOCAL_DB_PATH) and completed_indices:
213
+ print(f"\nβœ… Using existing local database with {len(completed_indices)} indices")
214
+ elif database_uploaded and not completed_indices:
215
+ # Download partial database from HF
216
+ print(f"\nπŸ“₯ Downloading partial database from HF...")
217
+ try:
218
+ remote_db = hf_hub_download(
219
+ repo_id=INDEXED_REPO_ID,
220
+ filename=INDEXED_DB_FILENAME,
221
+ repo_type="dataset",
222
+ token=HF_TOKEN
223
+ )
224
+ shutil.copy2(remote_db, LOCAL_DB_PATH)
225
+ print(f"βœ… Downloaded partial database")
226
+ except:
227
+ print(f"ℹ️ No partial database found, starting from original")
228
 
229
+ if not os.path.exists(LOCAL_DB_PATH):
230
+ # Copy original database
231
+ print(f"\n1. Downloading original database...")
232
+ original_path = hf_hub_download(
233
+ repo_id=ORIGINAL_REPO_ID,
234
+ filename=ORIGINAL_DB_FILENAME,
235
+ repo_type="dataset"
236
+ )
237
+
238
+ original_size = os.path.getsize(original_path)
239
+ free_space = shutil.disk_usage("/tmp")[2]
240
+
241
+ print(f" Original: {original_size / (2**30):.2f} GB")
242
+ print(f" Free space: {free_space / (2**30):.2f} GB")
243
+
244
+ if free_space < original_size * 2:
245
+ raise Exception(f"Not enough space! Need {original_size * 2 / (2**30):.1f} GB")
246
+
247
+ print(f"\n Copying to {LOCAL_DB_PATH}...")
248
+ start = time.time()
249
+ shutil.copy2(original_path, LOCAL_DB_PATH)
250
+ elapsed = time.time() - start
251
+ print(f" βœ“ Copied in {elapsed:.1f}s")
252
 
253
+ # Define indices to create
254
+ indices_to_create = [
255
+ ("idx_edge_start_id", "edge", "start_id", "Speed up start node queries"),
256
+ ("idx_edge_end_id", "edge", "end_id", "Speed up end node queries"),
257
+ ("idx_edge_rel_id", "edge", "rel_id", "Speed up relation queries"),
258
+ ("idx_node_label", "node", "label", "Speed up label searches"),
259
+ ]
260
 
261
+ # Connect to database
262
+ conn = sqlite3.connect(LOCAL_DB_PATH)
 
263
  cursor = conn.cursor()
264
 
265
+ # Enable optimizations
266
  cursor.execute("PRAGMA journal_mode = WAL")
267
  cursor.execute("PRAGMA synchronous = NORMAL")
268
+ cursor.execute("PRAGMA cache_size = -512000")
 
269
 
270
+ # Create each index with checkpoint
271
+ print(f"\n2. Creating indices with checkpoints...")
272
+ print(f" (After each index, we upload to HF to save progress)")
 
 
273
 
274
  for idx_name, table, column, description in indices_to_create:
275
+ if idx_name in completed_indices:
276
+ print(f"\n βœ“ {idx_name} - ALREADY COMPLETE (skipping)")
277
+ continue
278
+
279
+ print(f"\n Creating {idx_name} on {table}({column})...")
280
  print(f" Purpose: {description}")
 
281
 
282
+ start = time.time()
283
 
284
+ try:
285
+ cursor.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table}({column})")
286
+ conn.commit()
287
+
288
+ elapsed = time.time() - start
289
+ print(f" βœ“ Index created in {elapsed:.1f}s")
290
+
291
+ # Update completed indices
292
+ completed_indices.add(idx_name)
293
+
294
+ # Update remote progress
295
+ print(f" πŸ“ Updating progress file...")
296
+ update_remote_progress(list(completed_indices), database_uploaded=False)
297
+
298
+ # Upload database checkpoint
299
+ print(f" πŸ“€ Uploading database checkpoint...")
300
+ upload_success = upload_database_checkpoint()
301
+
302
+ if upload_success:
303
+ print(f" βœ… Checkpoint saved! Safe to restart if needed.")
304
+ else:
305
+ print(f" ⚠️ Checkpoint upload failed, but continuing...")
306
+
307
+ except Exception as e:
308
+ print(f" ❌ Failed to create {idx_name}: {e}")
309
+ conn.close()
310
+ raise
311
 
312
+ # Run ANALYZE
313
+ print(f"\n3. Running ANALYZE...")
314
  start = time.time()
315
  cursor.execute("ANALYZE")
316
+ conn.commit()
317
  elapsed = time.time() - start
318
+ print(f" βœ“ Analyzed in {elapsed:.1f}s")
319
 
 
 
320
  conn.close()
321
 
322
+ # Final upload
323
+ print(f"\n4. Final database upload...")
324
+ upload_database_checkpoint()
325
 
326
+ # Mark as complete
327
+ update_remote_progress(list(completed_indices), database_uploaded=True)
328
+
329
+ indexed_size = os.path.getsize(LOCAL_DB_PATH)
330
+
331
+ print("\n" + "="*60)
332
  print("INDEXING COMPLETE!")
333
  print("="*60)
334
+ print(f"Size: {indexed_size / (2**30):.2f} GB")
335
+ print(f"Indices created: {sorted(completed_indices)}")
336
+ print(f"Saved to: https://huggingface.co/datasets/{INDEXED_REPO_ID}")
 
337
  print("="*60 + "\n")
338
 
339
+ return LOCAL_DB_PATH
340
 
341
+ # Initialize database
342
  DB_PATH = create_indexed_database()
343
 
344
  def get_db_connection():
345
+ """Create optimized connection"""
346
  conn = sqlite3.connect(DB_PATH, check_same_thread=False)
347
  conn.execute("PRAGMA cache_size = -256000")
348
  conn.execute("PRAGMA mmap_size = 4294967296")
 
349
  return conn
350
 
351
  def verify_indices():
352
+ """Verify indices"""
353
  print("\n" + "="*60)
354
  print("VERIFYING INDICES")
355
  print("="*60)
 
357
  with get_db_connection() as conn:
358
  cursor = conn.cursor()
359
 
360
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
361
+ custom_indices = cursor.fetchall()
362
+
363
+ print(f"\nCustom indices: {len(custom_indices)}")
364
+ for idx in custom_indices:
365
+ print(f" βœ“ {idx[0]}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
+ # Speed test
368
+ start = time.time()
369
+ cursor.execute("SELECT COUNT(*) FROM edge WHERE start_id LIKE '/c/de/hund%'")
370
+ count = cursor.fetchone()[0]
371
+ elapsed = time.time() - start
372
+
373
+ status = "βœ… FAST" if elapsed < 1 else "⚠️ SLOW" if elapsed < 5 else "❌ VERY SLOW"
374
+ print(f"\nSpeed test: {count} results in {elapsed:.3f}s {status}")
375
+ print("="*60 + "\n")
376
 
377
  verify_indices()
378
 
379
+ def get_semantic_profile(word, lang='de', progress=gr.Progress()):
380
+ """Semantic profile with progress"""
381
+ progress(0, desc="Starting...")
382
+
383
  if not word:
384
  return "⚠️ Please enter a word."
385
 
 
401
  with get_db_connection() as conn:
402
  cursor = conn.cursor()
403
 
404
+ progress(0.05, desc="Finding nodes...")
405
+ cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
406
  nodes = cursor.fetchall()
407
 
408
  if not nodes:
409
+ return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**"
410
 
411
  for node_id, label in nodes[:3]:
412
  output_md += f"**Node:** `{node_id}` ({label})\n"
 
414
 
415
  total_relations = 0
416
 
417
+ for i, rel in enumerate(relations):
418
+ progress((i + 1) / len(relations), desc=f"Querying {rel}...")
419
+
420
  output_md += f"## {rel}\n\n"
421
  has_results = False
422
 
423
+ # Outgoing
424
  cursor.execute("""
425
  SELECT en.label, e.weight
426
  FROM edge e
 
436
  has_results = True
437
  total_relations += 1
438
 
439
+ # Incoming
440
  cursor.execute("""
441
  SELECT s.label, e.weight
442
  FROM edge e
 
456
  output_md += "*No results*\n"
457
  output_md += "\n"
458
 
459
+ progress(1.0, desc="Complete!")
460
+ output_md += f"---\n**Total:** {total_relations} relations\n"
461
  return output_md
462
 
463
  except Exception as e:
 
464
  import traceback
465
  traceback.print_exc()
466
+ return f"**❌ Error:** {e}"
467
 
468
+ def run_query(start_node, relation, end_node, limit, progress=gr.Progress()):
469
+ """Query builder"""
470
+ progress(0, desc="Starting...")
471
 
472
  query = """
473
+ SELECT e.id, s.id, r.label, en.id, e.weight, s.label, en.label
 
 
 
 
 
 
 
474
  FROM edge e
475
  JOIN relation r ON e.rel_id = r.id
476
  JOIN node s ON e.start_id = s.id
 
482
 
483
  try:
484
  with get_db_connection() as conn:
485
+ progress(0.3, desc="Building query...")
486
+
487
  # Language filter
488
+ lang_conditions = []
489
+ for lang in TARGET_LANGUAGES:
490
+ lang_conditions.append(f"s.id LIKE '/c/{lang}/%'")
491
+ lang_conditions.append(f"en.id LIKE '/c/{lang}/%'")
492
+ query += f" AND ({' OR '.join(lang_conditions)})"
493
 
494
+ # Filters
495
+ if start_node and start_node.strip():
496
  pattern = start_node if '%' in start_node else f"%{start_node}%"
497
  query += " AND s.id LIKE ?"
498
  params.append(pattern)
499
 
500
+ if relation and relation.strip():
501
+ rel_value = relation if relation.startswith('/r/') else f"/r/{relation}"
502
  if '%' in relation:
503
  query += " AND r.label LIKE ?"
504
  else:
505
  query += " AND r.label = ?"
506
+ params.append(rel_value)
507
 
508
+ if end_node and end_node.strip():
509
  pattern = end_node if '%' in end_node else f"%{end_node}%"
510
  query += " AND en.id LIKE ?"
511
  params.append(pattern)
 
513
  query += " ORDER BY e.weight DESC LIMIT ?"
514
  params.append(limit)
515
 
516
+ progress(0.6, desc="Executing...")
517
+
518
  start_time = time.time()
519
  df = pd.read_sql_query(query, conn, params=params)
520
  elapsed = time.time() - start_time
521
 
522
+ progress(1.0, desc="Complete!")
523
+
524
  if df.empty:
525
+ return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"
526
 
527
  df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
528
  return df, f"βœ… {len(df)} results in {elapsed:.2f}s"
529
 
530
  except Exception as e:
 
531
  import traceback
532
  traceback.print_exc()
533
  return pd.DataFrame(), f"**❌ Error:** {e}"
 
539
 
540
  try:
541
  with get_db_connection() as conn:
 
542
  df = pd.read_sql_query(sql_query, conn)
543
+ return df, f"βœ… {len(df)} rows"
 
544
  except Exception as e:
545
  return pd.DataFrame(), f"Error: {e}"
546
 
547
  def get_schema_info():
548
+ """Get schema"""
549
  with get_db_connection() as conn:
550
  cursor = conn.cursor()
551
 
552
+ md = f"# πŸ“š Schema\n\n**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
 
553
 
554
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
 
555
 
556
+ for table_name, in cursor.fetchall():
557
  cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
558
+ md += f"## {table_name} ({cursor.fetchone()[0]:,} rows)\n\n"
 
 
 
 
 
 
 
 
559
 
 
560
  cursor.execute(f"PRAGMA index_list({table_name})")
561
  indices = cursor.fetchall()
562
 
563
  if indices:
564
+ md += f"**Indices ({len(indices)}):** "
565
+ md += ", ".join([f"`{idx[1]}`" for idx in indices])
566
+ md += "\n\n"
 
 
 
 
 
 
 
 
567
 
568
  return md
569
 
570
+ # UI
571
+ with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
572
+ gr.Markdown(f"# 🧠 ConceptNet Explorer ({', '.join([l.upper() for l in TARGET_LANGUAGES])})")
573
+ gr.Markdown(f"**Indexed DB:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID}) | βœ… Checkpoint system active")
 
 
 
 
 
 
 
574
 
575
  with gr.Tabs():
576
  with gr.TabItem("πŸ” Semantic Profile"):
 
 
577
  with gr.Row():
578
  word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
579
+ lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Lang")
 
580
  semantic_btn = gr.Button("πŸ” Get Profile", variant="primary", size="lg")
581
+ semantic_output = gr.Markdown()
582
 
583
+ with gr.TabItem("⚑ Query"):
 
 
584
  with gr.Row():
585
+ start_input = gr.Textbox(label="Start", placeholder="hund", value="hund")
586
+ rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="IsA")
587
+ end_input = gr.Textbox(label="End", placeholder="")
588
+ limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50)
589
+ query_btn = gr.Button("▢️ Run", variant="primary", size="lg")
590
+ status_output = gr.Markdown()
591
+ results_output = gr.DataFrame(wrap=True)
 
 
592
 
593
+ with gr.TabItem("πŸ’» SQL"):
594
+ raw_sql_input = gr.Textbox(label="SQL", value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10", lines=3)
 
 
 
 
595
  raw_btn = gr.Button("▢️ Execute")
596
  raw_status = gr.Markdown()
597
  raw_results = gr.DataFrame()
598
 
599
  with gr.TabItem("πŸ“Š Schema"):
600
+ schema_btn = gr.Button("πŸ“Š Load")
601
+ schema_output = gr.Markdown()
602
 
603
+ gr.Markdown("---\nβœ… **Progress saved after each index!** Safe to restart if space crashes.")
604
 
 
605
  semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
606
  query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
607
  raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
608
  schema_btn.click(get_schema_info, None, schema_output)
609
 
610
  if __name__ == "__main__":
611
+ print("\nπŸš€ Ready with checkpoint system!\n")
612
  demo.launch(ssr_mode=False)