cstr committed on
Commit
85c7f72
·
verified ·
1 Parent(s): 5172377

gunzip wiktionary

Browse files
Files changed (1) hide show
  1. app.py +29 -9
app.py CHANGED
@@ -50,6 +50,8 @@ import re
50
  import sqlite3
51
  import json
52
  from huggingface_hub import hf_hub_download
 
 
53
 
54
  # --- Requests and gradio Import (for ConceptNet) ---
55
  try:
@@ -225,8 +227,9 @@ def log(msg):
225
  print(f"[DEBUG] {msg}")
226
 
227
  # --- Wiktionary Cache & Lock (ENGLISH) ---
228
- WIKTIONARY_DB_PATH = "en_wiktionary_normalized.db"
229
  WIKTIONARY_REPO_ID = "cstr/en-wiktionary-sqlite-full"
 
 
230
  WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
231
  WIKTIONARY_CONN_LOCK = threading.Lock()
232
  WIKTIONARY_AVAILABLE = False
@@ -1227,27 +1230,44 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
1227
  # 6d. WIKTIONARY DATABASE LOGIC (EN)
1228
  # ============================================================================
1229
  def wiktionary_download_db() -> bool:
1230
- """ Downloads the English Wiktionary DB. """
1231
  global WIKTIONARY_AVAILABLE
 
 
1232
  if os.path.exists(WIKTIONARY_DB_PATH):
1233
  print(f"✓ English Wiktionary DB '{WIKTIONARY_DB_PATH}' already exists.")
1234
  WIKTIONARY_AVAILABLE = True
1235
  return True
1236
 
1237
- print(f"English Wiktionary DB not found. Downloading from '{WIKTIONARY_REPO_ID}'...")
1238
  try:
1239
- hf_hub_download(
 
1240
  repo_id=WIKTIONARY_REPO_ID,
1241
- filename=WIKTIONARY_DB_PATH,
1242
  repo_type="dataset",
1243
- local_dir=".",
1244
- local_dir_use_symlinks=False
1245
  )
1246
- print(f"✓ English Wiktionary DB downloaded successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
 
1247
  WIKTIONARY_AVAILABLE = True
1248
  return True
1249
  except Exception as e:
1250
- print(f"✗ CRITICAL: Failed to download English Wiktionary DB: {e}")
 
1251
  return False
1252
 
1253
  def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
 
50
  import sqlite3
51
  import json
52
  from huggingface_hub import hf_hub_download
53
+ import gzip
54
+ import shutil
55
 
56
  # --- Requests and gradio Import (for ConceptNet) ---
57
  try:
 
227
  print(f"[DEBUG] {msg}")
228
 
229
  # --- Wiktionary Cache & Lock (ENGLISH) ---
 
230
  WIKTIONARY_REPO_ID = "cstr/en-wiktionary-sqlite-full"
231
+ WIKTIONARY_REMOTE_FILE = "en_wiktionary_normalized_full.db.gz" # File as seen in your screenshot
232
+ WIKTIONARY_DB_PATH = "en_wiktionary_normalized.db" # Local extracted file
233
  WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
234
  WIKTIONARY_CONN_LOCK = threading.Lock()
235
  WIKTIONARY_AVAILABLE = False
 
1230
  # 6d. WIKTIONARY DATABASE LOGIC (EN)
1231
  # ============================================================================
1232
  def wiktionary_download_db() -> bool:
1233
+ """ Downloads the compressed English Wiktionary DB and extracts it. """
1234
  global WIKTIONARY_AVAILABLE
1235
+
1236
+ # Check if the extracted DB already exists
1237
  if os.path.exists(WIKTIONARY_DB_PATH):
1238
  print(f"✓ English Wiktionary DB '{WIKTIONARY_DB_PATH}' already exists.")
1239
  WIKTIONARY_AVAILABLE = True
1240
  return True
1241
 
1242
+ print(f"English Wiktionary DB not found. Downloading '{WIKTIONARY_REMOTE_FILE}' from '{WIKTIONARY_REPO_ID}'...")
1243
  try:
1244
+ # 1. Download the .gz file
1245
+ downloaded_gz_path = hf_hub_download(
1246
  repo_id=WIKTIONARY_REPO_ID,
1247
+ filename=WIKTIONARY_REMOTE_FILE,
1248
  repo_type="dataset",
1249
+ local_dir="."
1250
+ # Removed deprecated `local_dir_use_symlinks`
1251
  )
1252
+
1253
+ # 2. Decompress the .gz file to the .db file
1254
+ print(f"Downloading complete. Extracting '{downloaded_gz_path}' to '{WIKTIONARY_DB_PATH}'...")
1255
+ with gzip.open(downloaded_gz_path, 'rb') as f_in:
1256
+ with open(WIKTIONARY_DB_PATH, 'wb') as f_out:
1257
+ shutil.copyfileobj(f_in, f_out)
1258
+
1259
+ # Optional: Cleanup the .gz file to save space
1260
+ try:
1261
+ os.remove(downloaded_gz_path)
1262
+ except OSError:
1263
+ pass
1264
+
1265
+ print(f"✓ English Wiktionary DB downloaded and extracted successfully.")
1266
  WIKTIONARY_AVAILABLE = True
1267
  return True
1268
  except Exception as e:
1269
+ print(f"✗ CRITICAL: Failed to download/extract English Wiktionary DB: {e}")
1270
+ # traceback.print_exc() # Uncomment for deep debugging
1271
  return False
1272
 
1273
  def wiktionary_get_connection() -> Optional[sqlite3.Connection]: