Spaces:
Running
Running
gunzip wiktionary
Browse files
app.py
CHANGED
|
@@ -50,6 +50,8 @@ import re
|
|
| 50 |
import sqlite3
|
| 51 |
import json
|
| 52 |
from huggingface_hub import hf_hub_download
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# --- Requests and gradio Import (for ConceptNet) ---
|
| 55 |
try:
|
|
@@ -225,8 +227,9 @@ def log(msg):
|
|
| 225 |
print(f"[DEBUG] {msg}")
|
| 226 |
|
| 227 |
# --- Wiktionary Cache & Lock (ENGLISH) ---
|
| 228 |
-
WIKTIONARY_DB_PATH = "en_wiktionary_normalized.db"
|
| 229 |
WIKTIONARY_REPO_ID = "cstr/en-wiktionary-sqlite-full"
|
|
|
|
|
|
|
| 230 |
WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
|
| 231 |
WIKTIONARY_CONN_LOCK = threading.Lock()
|
| 232 |
WIKTIONARY_AVAILABLE = False
|
|
@@ -1227,27 +1230,44 @@ def _analyze_word_with_hanta_en(word: str, top_n: int) -> Dict[str, Any]:
|
|
| 1227 |
# 6d. WIKTIONARY DATABASE LOGIC (EN)
|
| 1228 |
# ============================================================================
|
| 1229 |
def wiktionary_download_db() -> bool:
|
| 1230 |
-
""" Downloads the English Wiktionary DB. """
|
| 1231 |
global WIKTIONARY_AVAILABLE
|
|
|
|
|
|
|
| 1232 |
if os.path.exists(WIKTIONARY_DB_PATH):
|
| 1233 |
print(f"✓ English Wiktionary DB '{WIKTIONARY_DB_PATH}' already exists.")
|
| 1234 |
WIKTIONARY_AVAILABLE = True
|
| 1235 |
return True
|
| 1236 |
|
| 1237 |
-
print(f"English Wiktionary DB not found. Downloading from '{WIKTIONARY_REPO_ID}'...")
|
| 1238 |
try:
|
| 1239 |
-
|
|
|
|
| 1240 |
repo_id=WIKTIONARY_REPO_ID,
|
| 1241 |
-
filename=
|
| 1242 |
repo_type="dataset",
|
| 1243 |
-
local_dir="."
|
| 1244 |
-
local_dir_use_symlinks
|
| 1245 |
)
|
| 1246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1247 |
WIKTIONARY_AVAILABLE = True
|
| 1248 |
return True
|
| 1249 |
except Exception as e:
|
| 1250 |
-
print(f"✗ CRITICAL: Failed to download English Wiktionary DB: {e}")
|
|
|
|
| 1251 |
return False
|
| 1252 |
|
| 1253 |
def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
|
|
|
|
| 50 |
import sqlite3
|
| 51 |
import json
|
| 52 |
from huggingface_hub import hf_hub_download
|
| 53 |
+
import gzip
|
| 54 |
+
import shutil
|
| 55 |
|
| 56 |
# --- Requests and gradio Import (for ConceptNet) ---
|
| 57 |
try:
|
|
|
|
| 227 |
print(f"[DEBUG] {msg}")
|
| 228 |
|
| 229 |
# --- Wiktionary Cache & Lock (ENGLISH) ---
|
|
|
|
| 230 |
WIKTIONARY_REPO_ID = "cstr/en-wiktionary-sqlite-full"
|
| 231 |
+
WIKTIONARY_REMOTE_FILE = "en_wiktionary_normalized_full.db.gz" # Gzip-compressed DB file name requested from the dataset repo
|
| 232 |
+
WIKTIONARY_DB_PATH = "en_wiktionary_normalized.db" # Local extracted file
|
| 233 |
WIKTIONARY_CONN: Optional[sqlite3.Connection] = None
|
| 234 |
WIKTIONARY_CONN_LOCK = threading.Lock()
|
| 235 |
WIKTIONARY_AVAILABLE = False
|
|
|
|
| 1230 |
# 6d. WIKTIONARY DATABASE LOGIC (EN)
|
| 1231 |
# ============================================================================
|
| 1232 |
def wiktionary_download_db() -> bool:
    """Ensure the English Wiktionary SQLite DB exists locally.

    If ``WIKTIONARY_DB_PATH`` already exists it is used as-is. Otherwise the
    gzip-compressed file ``WIKTIONARY_REMOTE_FILE`` is downloaded from the
    ``WIKTIONARY_REPO_ID`` dataset repo into the current directory and
    decompressed to ``WIKTIONARY_DB_PATH``.

    The archive is decompressed into a ``.part`` sibling file which is moved
    onto ``WIKTIONARY_DB_PATH`` only after extraction finished. A failed or
    interrupted extraction therefore never leaves a truncated file at the
    final path, which the existence check above would otherwise accept as a
    valid database on the next call.

    Side effects:
        Sets the module-level ``WIKTIONARY_AVAILABLE`` flag to True on
        success; the flag is left untouched on failure.

    Returns:
        True if the DB already existed or was downloaded and extracted,
        False if anything went wrong (the error is printed, not raised).
    """
    global WIKTIONARY_AVAILABLE

    # Fast path: the extracted DB is already on disk.
    if os.path.exists(WIKTIONARY_DB_PATH):
        print(f"✓ English Wiktionary DB '{WIKTIONARY_DB_PATH}' already exists.")
        WIKTIONARY_AVAILABLE = True
        return True

    print(f"English Wiktionary DB not found. Downloading '{WIKTIONARY_REMOTE_FILE}' from '{WIKTIONARY_REPO_ID}'...")
    # Extraction target; same directory as the final path so the move below
    # stays on one filesystem.
    partial_db_path = f"{WIKTIONARY_DB_PATH}.part"
    try:
        # 1. Download the .gz file
        downloaded_gz_path = hf_hub_download(
            repo_id=WIKTIONARY_REPO_ID,
            filename=WIKTIONARY_REMOTE_FILE,
            repo_type="dataset",
            local_dir=".",
        )

        # 2. Decompress the .gz file, then publish it under its final name
        print(f"Downloading complete. Extracting '{downloaded_gz_path}' to '{WIKTIONARY_DB_PATH}'...")
        with gzip.open(downloaded_gz_path, 'rb') as f_in, open(partial_db_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_db_path, WIKTIONARY_DB_PATH)
    except Exception as e:
        print(f"✗ CRITICAL: Failed to download/extract English Wiktionary DB: {e}")
        # Drop any half-written extraction; it may not exist if the download
        # itself failed, hence the tolerated OSError.
        try:
            os.remove(partial_db_path)
        except OSError:
            pass
        return False
    else:
        # Best-effort cleanup of the archive to save space; a leftover .gz
        # does not affect the extracted DB, so failure here is not fatal.
        try:
            os.remove(downloaded_gz_path)
        except OSError as cleanup_err:
            print(f"Note: could not remove '{downloaded_gz_path}': {cleanup_err}")

        print("✓ English Wiktionary DB downloaded and extracted successfully.")
        WIKTIONARY_AVAILABLE = True
        return True
|
| 1272 |
|
| 1273 |
def wiktionary_get_connection() -> Optional[sqlite3.Connection]:
|