# etymology/backend/enrich_definitions.py
"""Enrich etymology database with definitions from Free Dictionary API.
Usage:
uv run python -m backend.enrich_definitions # Run full enrichment
uv run python -m backend.enrich_definitions --stats # Check progress
uv run python -m backend.enrich_definitions --test 50 # Test with 50 words
"""
from __future__ import annotations

import argparse
import asyncio
import json
import sys
import time
from datetime import datetime
from urllib.parse import quote

import duckdb

try:
    import aiohttp
except ImportError:
    print("aiohttp is required. Install with: uv add aiohttp")
    sys.exit(1)

try:
    from .database import database_path
    from .sql_loader import load_sql
except ImportError:
    from database import database_path
    from sql_loader import load_sql
API_BASE = "https://api.dictionaryapi.dev/api/v2/entries/en"
DELAY_MS = 300  # Delay between requests, in milliseconds
COMMIT_EVERY = 100  # Commit to DB every N words


async def fetch_definition(
    session: aiohttp.ClientSession, word: str
) -> tuple[str, str, str | None]:
    """Fetch definition for a single word with retry logic.

    Makes up to 3 attempts. Returns a ``(word, status, payload)`` tuple where
    ``status`` is one of ``"success"``, ``"not_found"``, or ``"error"`` and
    ``payload`` is the JSON response serialized to a string on success,
    otherwise ``None``.
    """
    # Percent-encode the word (safe="" also encodes "/") so characters like
    # slashes, spaces, or "?" cannot corrupt the request path.
    url = f"{API_BASE}/{quote(word, safe='')}"
    for attempt in range(3):
        try:
            async with session.get(url) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    return (word, "success", json.dumps(data))
                elif resp.status == 404:
                    # The API answers 404 when it has no entry for the word.
                    return (word, "not_found", None)
                elif resp.status == 429:  # Rate limited
                    # Exponential backoff: 1s, 2s, 4s.
                    await asyncio.sleep(2**attempt)
                    continue
                else:
                    # Other HTTP errors: brief pause, then retry.
                    if attempt < 2:
                        await asyncio.sleep(1)
                        continue
                    return (word, "error", None)
        except Exception as e:
            # Network/parse failures: retry twice, then report and give up.
            if attempt < 2:
                await asyncio.sleep(1)
                continue
            print(f" Error fetching '{word}': {e}")
            return (word, "error", None)
    # All attempts were consumed by 429 backoffs.
    return (word, "error", None)
def get_words_to_enrich(conn: duckdb.DuckDBPyConnection) -> list[str]:
    """Return curated English words that have not been enriched yet."""
    # On the very first run definitions_raw does not exist, so a different
    # initial query is used instead of the one that filters on it.
    table_exists = conn.execute(
        load_sql("enrichment/check_table_exists.sql"), ["definitions_raw"]
    ).fetchall()
    query_name = (
        "enrichment/words_to_enrich.sql"
        if table_exists
        else "enrichment/words_to_enrich_initial.sql"
    )
    return [word for (word, *_) in conn.execute(load_sql(query_name)).fetchall()]
def ensure_definitions_table(conn: duckdb.DuckDBPyConnection) -> None:
    """Idempotently create the definitions_raw table."""
    ddl = load_sql("ingestion/09_create_definitions_raw.sql")
    conn.execute(ddl)
def _run_sql_script(conn: duckdb.DuckDBPyConnection, sql_name: str) -> None:
    """Execute every non-empty ';'-separated statement from a SQL file."""
    for statement in load_sql(sql_name).strip().split(";"):
        if statement.strip():
            conn.execute(statement)


def materialize_definitions(conn: duckdb.DuckDBPyConnection) -> None:
    """Materialize parsed definitions into a proper table.

    Extracts ALL definitions from the Free Dictionary API response,
    not just the first one. Each row represents one definition with
    indexes for entry, meaning, and definition within the JSON structure.
    The raw JSON is kept in definitions_raw for debugging/reprocessing.
    Lexeme is stored lowercase for fast equality joins.
    """
    print("Materializing definitions table...")
    _run_sql_script(conn, "enrichment/materialize_definitions.sql")
    _run_sql_script(conn, "enrichment/create_definition_indexes.sql")
    n_definitions = conn.execute("SELECT COUNT(*) FROM definitions").fetchone()[0]
    n_words = conn.execute(
        "SELECT COUNT(DISTINCT lexeme) FROM definitions"
    ).fetchone()[0]
    print(f" Materialized {n_definitions:,} definitions for {n_words:,} words")
def store_result(
    conn: duckdb.DuckDBPyConnection, word: str, status: str, api_response: str | None
) -> None:
    """Upsert one fetch result (keyed by lexeme) into definitions_raw."""
    row = [word, api_response, datetime.now().isoformat(), status]
    conn.execute(
        "INSERT OR REPLACE INTO definitions_raw (lexeme, api_response, fetched_at, status) VALUES (?, ?::JSON, ?, ?)",
        row,
    )
async def enrich_definitions(max_words: int | None = None) -> None:
    """Fetch definitions for all curated English words.

    Args:
        max_words: Optional cap on how many words to process (used by the
            --test CLI flag); None means process every remaining word.
    """
    db_path = database_path()
    print(f"Database: {db_path}\n")
    # Get words to enrich. A read-only connection is used here and closed
    # (via the with-block) before the writer connection opens below.
    with duckdb.connect(db_path.as_posix(), read_only=True) as conn:
        words = get_words_to_enrich(conn)
    if max_words:
        words = words[:max_words]
    total = len(words)
    if total == 0:
        print("All words already enriched!")
        return
    print(f"Words to enrich: {total:,}")
    # Estimate assumes one request per DELAY_MS; retries make it a lower bound.
    print(f"Estimated time: {(total * DELAY_MS) / 1000 / 60:.1f} minutes\n")
    # Counters keyed by the status strings fetch_definition can return.
    stats = {"success": 0, "not_found": 0, "error": 0}
    start_time = time.time()
    with duckdb.connect(db_path.as_posix()) as conn:
        ensure_definitions_table(conn)
        async with aiohttp.ClientSession() as session:
            # Sequential fetch loop: one request at a time, paced below.
            for i, word in enumerate(words):
                result = await fetch_definition(session, word)
                _, status, api_response = result
                stats[status] += 1
                store_result(conn, word, status, api_response)
                # Progress every 50 words
                if (i + 1) % 50 == 0 or i + 1 == total:
                    elapsed = time.time() - start_time
                    rate = (i + 1) / elapsed if elapsed > 0 else 0
                    remaining = (total - i - 1) / rate if rate > 0 else 0
                    print(
                        f"[{i + 1:,}/{total:,}] {100 * (i + 1) / total:.1f}% - {rate:.1f}/sec - ETA: {remaining / 60:.1f}m"
                    )
                # Commit periodically so an interrupted run keeps its progress
                if (i + 1) % COMMIT_EVERY == 0:
                    conn.commit()
                # Rate limiting: skip the sleep after the final word
                if i < total - 1:
                    await asyncio.sleep(DELAY_MS / 1000)
        # Final commit for the tail batch smaller than COMMIT_EVERY.
        conn.commit()
    elapsed = time.time() - start_time
    print(f"\nDone! {total:,} words in {elapsed / 60:.1f}m ({total / elapsed:.1f}/sec)")
    print(
        f" Success: {stats['success']:,}, Not found: {stats['not_found']:,}, Errors: {stats['error']:,}"
    )
    # Materialize the definitions table for fast queries (fresh connection,
    # opened after the writer above has closed).
    with duckdb.connect(db_path.as_posix()) as conn:
        materialize_definitions(conn)
def get_stats() -> None:
    """Show current enrichment statistics on stdout."""
    db_path = database_path()
    with duckdb.connect(db_path.as_posix(), read_only=True) as conn:
        tables = conn.execute(
            load_sql("enrichment/check_table_exists.sql"), ["definitions_raw"]
        ).fetchall()
        if not tables:
            print("No definitions_raw table yet. Run enrichment first.")
            return
        total_curated = conn.execute(
            "SELECT COUNT(DISTINCT lexeme) FROM v_english_curated"
        ).fetchone()[0]
        # Rows of (status, count) per enrichment status.
        stats = conn.execute(load_sql("enrichment/enrichment_stats.sql")).fetchall()
        total_enriched = sum(count for _, count in stats)
        remaining = total_curated - total_enriched
        # Guard against an empty curated view to avoid ZeroDivisionError.
        pct = 100 * total_enriched / total_curated if total_curated else 0.0
        print(f"Curated English words: {total_curated:,}")
        print(f"Enriched: {total_enriched:,} ({pct:.1f}%)")
        print(f"Remaining: {remaining:,}\n")
        print("By status:")
        for status, count in stats:
            print(f" {status}: {count:,}")
def main():
    """CLI entry point: show stats or run the (optionally capped) enrichment."""
    parser = argparse.ArgumentParser(description="Enrich etymology database with definitions")
    parser.add_argument("--stats", action="store_true", help="Show enrichment statistics")
    parser.add_argument("--test", type=int, metavar="N", help="Test mode: only process N words")
    args = parser.parse_args()
    if not args.stats:
        asyncio.run(enrich_definitions(max_words=args.test))
    else:
        get_stats()


if __name__ == "__main__":
    main()