# etymology/backend/enrich_definitions.py
"""Enrich etymology database with definitions from Free Dictionary API.
Usage:
uv run python -m backend.enrich_definitions # Run full enrichment
uv run python -m backend.enrich_definitions --stats # Check progress
uv run python -m backend.enrich_definitions --test 50 # Test with 50 words
"""
from __future__ import annotations

import argparse
import asyncio
import json
import sys
import time
from datetime import datetime
from urllib.parse import quote

import duckdb

try:
    import aiohttp
except ImportError:
    print("aiohttp is required. Install with: uv add aiohttp")
    sys.exit(1)

try:
    from .database import database_path
    from .sql_loader import load_sql
except ImportError:
    from database import database_path
    from sql_loader import load_sql
API_BASE = "https://api.dictionaryapi.dev/api/v2/entries/en"
DELAY_MS = 300  # Delay between requests, in milliseconds
COMMIT_EVERY = 100  # Commit to DB every N words


async def fetch_definition(
    session: aiohttp.ClientSession, word: str
) -> tuple[str, str, str | None]:
    """Fetch definition for a single word with retry logic.

    Makes up to 3 attempts. Returns a ``(word, status, payload)`` tuple where
    ``status`` is one of ``"success"``, ``"not_found"``, or ``"error"`` and
    ``payload`` is the JSON response serialized to a string on success,
    otherwise ``None``.
    """
    # Percent-encode the word (safe="" also encodes "/") so characters like
    # slashes, spaces, or "?" cannot corrupt the request path.
    url = f"{API_BASE}/{quote(word, safe='')}"
    for attempt in range(3):
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    return (word, "success", json.dumps(data))
                elif resp.status == 404:
                    # The API answers 404 when it has no entry for the word.
                    return (word, "not_found", None)
                elif resp.status == 429:  # Rate limited
                    # Exponential backoff: 1s, 2s, 4s.
                    await asyncio.sleep(2**attempt)
                    continue
                else:
                    # Other HTTP errors: brief pause, then retry.
                    if attempt < 2:
                        await asyncio.sleep(1)
                        continue
                    return (word, "error", None)
        except Exception as e:
            # Network/parse failures: retry twice, then report and give up.
            if attempt < 2:
                await asyncio.sleep(1)
                continue
            print(f" Error fetching '{word}': {e}")
            return (word, "error", None)
    # All attempts were consumed by 429 backoffs.
    return (word, "error", None)
def get_words_to_enrich(conn: duckdb.DuckDBPyConnection) -> list[str]:
    """Return curated English words that have not been enriched yet."""
    # On the very first run definitions_raw does not exist, so a different
    # initial query is used instead of the one that filters on it.
    table_exists = conn.execute(
        load_sql("enrichment/check_table_exists.sql"), ["definitions_raw"]
    ).fetchall()
    query_name = (
        "enrichment/words_to_enrich.sql"
        if table_exists
        else "enrichment/words_to_enrich_initial.sql"
    )
    return [word for (word, *_) in conn.execute(load_sql(query_name)).fetchall()]
def ensure_definitions_table(conn: duckdb.DuckDBPyConnection) -> None:
    """Idempotently create the definitions_raw table."""
    ddl = load_sql("ingestion/09_create_definitions_raw.sql")
    conn.execute(ddl)
def _run_sql_script(conn: duckdb.DuckDBPyConnection, sql_name: str) -> None:
    """Execute every non-empty ';'-separated statement from a SQL file."""
    for statement in load_sql(sql_name).strip().split(";"):
        if statement.strip():
            conn.execute(statement)


def materialize_definitions(conn: duckdb.DuckDBPyConnection) -> None:
    """Materialize parsed definitions into a proper table.

    Extracts ALL definitions from the Free Dictionary API response,
    not just the first one. Each row represents one definition with
    indexes for entry, meaning, and definition within the JSON structure.
    The raw JSON is kept in definitions_raw for debugging/reprocessing.
    Lexeme is stored lowercase for fast equality joins.
    """
    print("Materializing definitions table...")
    _run_sql_script(conn, "enrichment/materialize_definitions.sql")
    _run_sql_script(conn, "enrichment/create_definition_indexes.sql")
    n_definitions = conn.execute("SELECT COUNT(*) FROM definitions").fetchone()[0]
    n_words = conn.execute(
        "SELECT COUNT(DISTINCT lexeme) FROM definitions"
    ).fetchone()[0]
    print(f" Materialized {n_definitions:,} definitions for {n_words:,} words")
def store_result(
    conn: duckdb.DuckDBPyConnection, word: str, status: str, api_response: str | None
) -> None:
    """Upsert one fetch result (keyed by lexeme) into definitions_raw."""
    row = [word, api_response, datetime.now().isoformat(), status]
    conn.execute(
        "INSERT OR REPLACE INTO definitions_raw (lexeme, api_response, fetched_at, status) VALUES (?, ?::JSON, ?, ?)",
        row,
    )
async def enrich_definitions(max_words: int | None = None) -> None:
    """Fetch definitions for all curated English words.

    Args:
        max_words: Optional cap on how many words to process (used by the
            --test CLI flag); None means process every remaining word.
    """
    db_path = database_path()
    print(f"Database: {db_path}\n")
    # Get words to enrich. A read-only connection is used here and closed
    # (via the with-block) before the writer connection opens below.
    with duckdb.connect(db_path.as_posix(), read_only=True) as conn:
        words = get_words_to_enrich(conn)
    if max_words:
        words = words[:max_words]
    total = len(words)
    if total == 0:
        print("All words already enriched!")
        return
    print(f"Words to enrich: {total:,}")
    # Estimate assumes one request per DELAY_MS; retries make it a lower bound.
    print(f"Estimated time: {(total * DELAY_MS) / 1000 / 60:.1f} minutes\n")
    # Counters keyed by the status strings fetch_definition can return.
    stats = {"success": 0, "not_found": 0, "error": 0}
    start_time = time.time()
    with duckdb.connect(db_path.as_posix()) as conn:
        ensure_definitions_table(conn)
        async with aiohttp.ClientSession() as session:
            # Sequential fetch loop: one request at a time, paced below.
            for i, word in enumerate(words):
                result = await fetch_definition(session, word)
                _, status, api_response = result
                stats[status] += 1
                store_result(conn, word, status, api_response)
                # Progress every 50 words
                if (i + 1) % 50 == 0 or i + 1 == total:
                    elapsed = time.time() - start_time
                    rate = (i + 1) / elapsed if elapsed > 0 else 0
                    remaining = (total - i - 1) / rate if rate > 0 else 0
                    print(
                        f"[{i + 1:,}/{total:,}] {100 * (i + 1) / total:.1f}% - {rate:.1f}/sec - ETA: {remaining / 60:.1f}m"
                    )
                # Commit periodically so an interrupted run keeps its progress
                if (i + 1) % COMMIT_EVERY == 0:
                    conn.commit()
                # Rate limiting: skip the sleep after the final word
                if i < total - 1:
                    await asyncio.sleep(DELAY_MS / 1000)
        # Final commit for the tail batch smaller than COMMIT_EVERY.
        conn.commit()
    elapsed = time.time() - start_time
    print(f"\nDone! {total:,} words in {elapsed / 60:.1f}m ({total / elapsed:.1f}/sec)")
    print(
        f" Success: {stats['success']:,}, Not found: {stats['not_found']:,}, Errors: {stats['error']:,}"
    )
    # Materialize the definitions table for fast queries (fresh connection,
    # opened after the writer above has closed).
    with duckdb.connect(db_path.as_posix()) as conn:
        materialize_definitions(conn)
def get_stats() -> None:
    """Show current enrichment statistics on stdout."""
    db_path = database_path()
    with duckdb.connect(db_path.as_posix(), read_only=True) as conn:
        tables = conn.execute(
            load_sql("enrichment/check_table_exists.sql"), ["definitions_raw"]
        ).fetchall()
        if not tables:
            print("No definitions_raw table yet. Run enrichment first.")
            return
        total_curated = conn.execute(
            "SELECT COUNT(DISTINCT lexeme) FROM v_english_curated"
        ).fetchone()[0]
        # Rows of (status, count) per enrichment status.
        stats = conn.execute(load_sql("enrichment/enrichment_stats.sql")).fetchall()
        total_enriched = sum(count for _, count in stats)
        remaining = total_curated - total_enriched
        # Guard against an empty curated view to avoid ZeroDivisionError.
        pct = 100 * total_enriched / total_curated if total_curated else 0.0
        print(f"Curated English words: {total_curated:,}")
        print(f"Enriched: {total_enriched:,} ({pct:.1f}%)")
        print(f"Remaining: {remaining:,}\n")
        print("By status:")
        for status, count in stats:
            print(f" {status}: {count:,}")
def main():
    """CLI entry point: show stats or run the (optionally capped) enrichment."""
    parser = argparse.ArgumentParser(description="Enrich etymology database with definitions")
    parser.add_argument("--stats", action="store_true", help="Show enrichment statistics")
    parser.add_argument("--test", type=int, metavar="N", help="Test mode: only process N words")
    args = parser.parse_args()
    if not args.stats:
        asyncio.run(enrich_definitions(max_words=args.test))
    else:
        get_stats()


if __name__ == "__main__":
    main()