jw-search / scripts /seed_sample_data.py
jw-tools's picture
deploy: latest main (lazy-ML cold start, durable launcher, web-image search, scene search) + full-app data refresh
7ea1851 verified
#!/usr/bin/env python3
"""Seed a small, representative content set for local dev / CI / demos.
The real Search-UI databases are multi-GB and built from JW.org downloads,
and the semantic index needs a sentence-transformers model that may not be
available offline. That makes it hard to exercise the search experience
end-to-end without the production data.
This script seeds just enough **keyword-searchable** content to drive the
app for real:
* writes ``json/<lang>/all_media_items.json`` so title search and result
decoration work,
* inserts subtitle text straight into the ``subtitles_fts`` FTS5 table
(bypassing embeddings, so no model download is required),
* indexes scripture references from that text (pure regex, model-free).
Keyword search, title search, and scripture search all return real results
afterwards. Semantic/hybrid search still needs the embedding model and is
intentionally out of scope here.
The sample set is fixed and deterministic so a golden snapshot taken
against it is stable across runs.
Usage::
# seed into the default throwaway ./sample-data dir (gitignored)
python scripts/seed_sample_data.py
# seed a specific throwaway location
python scripts/seed_sample_data.py --data-root /tmp/searchui-sample
# clear the sample rows first, then re-seed (scoped to sample keys only)
python scripts/seed_sample_data.py --clear
Safety: the default data root is ./sample-data, never your real database.
If you point --data-root at a location that already holds real (non-sample)
content, the script REFUSES to run unless you pass --force — it will not
overwrite a production media catalog or mix fake videos into a real index.
Nothing it writes is committed to git (sample-data/, json/ and *.db are
gitignored).
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
# Repository root, derived from this file's resolved location: the script's
# directory is one level below the root, so step up two parents from the file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Put <repo>/backend at the front of the import path so modules located there
# can be imported by bare name (presumably this is where the ``search`` module
# imported inside _seed_inner lives — confirm against the repo layout).
sys.path.insert(0, str(REPO_ROOT / "backend"))
# A fixed, representative slice of JW.org-style VOD content. Subtitle text is
# written to read naturally and to contain the vocabulary + scripture
# references the golden queries probe (kingdom, field service, New World
# Translation, John 3:16, ...). natural_key mirrors the real
# languageAgnosticNaturalKey style (pub_track_VIDEO).
#
# Each record carries four string fields:
#   natural_key      - unique id; keys the media JSON and the subtitles_fts rows
#   title            - display title copied into the media catalog entry
#   primaryCategory  - catalog category, also reused as the entry's _subcategory
#   subtitles        - plain subtitle text inserted into subtitles_fts and
#                      scanned for scripture references
# Keep the order and contents stable: the module docstring promises a fixed,
# deterministic sample set so golden snapshots stay valid across runs.
SAMPLE_ITEMS: list[dict] = [
{
"natural_key": "pub-jwbcov_E_1_VIDEO",
"title": "What Is God's Kingdom?",
"primaryCategory": "VODBibleTeachings",
"subtitles": (
"God's Kingdom is a real government in heaven. Jesus taught his "
"followers to pray for the Kingdom to come. At Daniel 2:44 we read "
"that the Kingdom will never be destroyed and will crush all human "
"governments. The Kingdom will bring peace to the whole earth."
),
},
{
"natural_key": "pub-jwbcov_E_2_VIDEO",
"title": "Why Do We Share in the Field Ministry?",
"primaryCategory": "VODBibleTeachings",
"subtitles": (
"Jehovah's Witnesses share in the field ministry because Jesus "
"commanded his disciples to preach the good news of the Kingdom. "
"Field service gives everyone an opportunity to help others learn "
"the truth. At Matthew 28:19, 20 Jesus said to make disciples of "
"people of all the nations."
),
},
{
"natural_key": "pub-jwbcov_E_3_VIDEO",
"title": "How Can You Trust the Bible?",
"primaryCategory": "VODBibleTeachings",
"subtitles": (
"The Bible has been translated into thousands of languages. The "
"New World Translation is accurate and easy to understand. Second "
"Timothy 3:16 explains that all Scripture is inspired of God and "
"beneficial for teaching. You can trust the Bible because its "
"prophecies have come true."
),
},
{
"natural_key": "pub-jwbcov_E_4_VIDEO",
"title": "God's Love for Us",
"primaryCategory": "VODBibleTeachings",
"subtitles": (
"John 3:16 tells us that God loved the world so much that he gave "
"his only-begotten Son. God's love moves us to want to know him "
"better and to show love to our neighbors. Love is the greatest "
"quality, as explained at 1 Corinthians 13."
),
},
{
"natural_key": "pub-jwbcov_E_5_VIDEO",
"title": "Enjoy Life Forever in Paradise",
"primaryCategory": "VODBibleTeachings",
"subtitles": (
"The Bible promises that the earth will become a paradise. "
"Revelation 21:4 says God will wipe out every tear, and death will "
"be no more. Psalm 37:29 promises that the righteous will possess "
"the earth and live forever on it."
),
},
{
"natural_key": "pub-jwbvod24_E_201_VIDEO",
"title": "JW Broadcasting—Bible Principles for Family Life",
"primaryCategory": "VODProgramsEvents",
"subtitles": (
"Applying Bible principles strengthens family life. Husbands and "
"wives who follow the Bible's counsel build strong marriages. "
"Ephesians 5:33 counsels husbands to love their wives and wives to "
"respect their husbands. Children who honor their parents are "
"happier."
),
},
{
"natural_key": "pub-jwbvod24_E_202_VIDEO",
"title": "Cart Witnessing in the Community",
"primaryCategory": "VODProgramsEvents",
"subtitles": (
"Public witnessing with carts lets people approach at their own "
"pace. Many take a free Bible study or a magazine. The field "
"ministry reaches people in busy public places. Volunteers stand "
"ready to answer Bible questions about the Kingdom hope."
),
},
{
"natural_key": "pub-jwbvod24_E_203_VIDEO",
"title": "Memorial of Jesus' Death",
"primaryCategory": "VODProgramsEvents",
"subtitles": (
"Each year millions gather for the Memorial of Jesus' death. Jesus "
"said at Luke 22:19 to keep doing this in remembrance of him. The "
"emblems of unleavened bread and red wine represent his body and "
"blood. The Memorial reminds us of God's love and the ransom."
),
},
{
"natural_key": "pub-ihelp_E_7_VIDEO",
"title": "Coping With Anxiety",
"primaryCategory": "VODBibleTeachings",
"subtitles": (
"Everyone feels anxious sometimes. Prayer can help us cope with "
"anxiety. Philippians 4:6, 7 encourages us to bring our worries to "
"God in prayer, and the peace of God will guard our hearts. "
"Casting our anxiety on Jehovah brings real comfort."
),
},
{
"natural_key": "pub-ihelp_E_8_VIDEO",
"title": "Finding True Happiness",
"primaryCategory": "VODBibleTeachings",
"subtitles": (
"True happiness comes from helping others and knowing God. Jesus "
"said at Acts 20:35 that there is more happiness in giving than in "
"receiving. Spiritual things bring lasting joy that money cannot "
"buy."
),
},
]
# Immutable lookup set of the sample natural keys. _describe_real_data treats
# any key outside this set as real (non-sample) content.
SAMPLE_KEYS: frozenset[str] = frozenset(item["natural_key"] for item in SAMPLE_ITEMS)
def _build_media_record(item: dict) -> dict:
    """Build the all_media_items.json entry for one sample item.

    ``item`` must provide ``title`` and ``natural_key``. ``primaryCategory``
    is optional and falls back to ``"VODBibleTeachings"``; the same value is
    used for both the ``primaryCategory`` and ``_subcategory`` fields.
    Category key, duration and publish date are fixed placeholders so the
    generated catalog is deterministic.
    """
    # Resolve the category once; it feeds two fields of the record.
    category = item.get("primaryCategory", "VODBibleTeachings")
    record = {
        "title": item["title"],
        "languageAgnosticNaturalKey": item["natural_key"],
        "primaryCategory": category,
        "_category_key": "VideoOnDemand",
        "_subcategory": category,
        "duration": 300,
        "firstPublished": "2024-01-01T00:00:00Z",
    }
    return record
def _describe_real_data(instance, language: str, media_path: Path) -> str | None:
    """Describe REAL (non-sample) content found at the seeding target.

    Used to refuse seeding over a developer's production database — the
    sample keys are a fixed known set, so anything else is real data.

    Args:
        instance: Search object exposing ``_get_db_connection()``; the
            returned connection must support ``execute(...).fetchall()`` and
            ``close()``.
        language: Language code whose ``subtitles_fts`` rows are inspected.
        media_path: Location of the ``all_media_items.json`` catalog that
            seeding would overwrite.

    Returns:
        A human-readable description of the first problem found, or ``None``
        if the target is empty or holds sample content only. The subtitle
        index is checked before the media catalog.
    """
    conn = instance._get_db_connection()
    try:
        rows = conn.execute(
            "SELECT DISTINCT natural_key FROM subtitles_fts WHERE language = ?",
            (language,),
        ).fetchall()
    finally:
        conn.close()

    real_subs = [r[0] for r in rows if r[0] not in SAMPLE_KEYS]
    if real_subs:
        return f"{len(real_subs)} non-sample subtitle row(s) already indexed for language {language}"

    if not media_path.exists():
        return None

    try:
        existing = json.loads(media_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
        # (a catalog that is not valid UTF-8); OSError covers unreadable
        # paths. Either way we cannot prove the file is sample-only.
        return f"an existing {media_path.name} that could not be parsed (refusing to overwrite blindly)"

    if not isinstance(existing, dict):
        # This script always writes a dict keyed by natural key, so any other
        # JSON shape was not produced by it. Refuse instead of iterating it:
        # a list of records would raise TypeError on the set lookup, null or
        # a number is not iterable, and a string would be counted per char.
        return (
            f"an existing {media_path.name} with an unexpected top-level "
            f"{type(existing).__name__} (refusing to overwrite blindly)"
        )

    real_media = [k for k in existing if k not in SAMPLE_KEYS]
    if real_media:
        return f"{len(real_media)} non-sample entrie(s) in existing {media_path.name}"
    return None
def seed(*, data_root: Path, language: str, clear: bool, force: bool = False) -> dict:
    """Seed the sample content into ``data_root`` and return a summary dict.

    Seeding targets throwaway dev/CI/demo roots only: if the data root
    already holds real (non-sample) content the call is refused unless
    ``force=True`` is given.

    While the work runs, ``SEARCH_UI_DATA_ROOT`` points at ``data_root`` so
    the runtime path helpers resolve DB/JSON locations there. The variable's
    previous state (set or unset) is put back afterwards, even on failure,
    so another caller in the same process (e.g. a test) does not inherit it.
    """
    env_key = "SEARCH_UI_DATA_ROOT"
    saved = os.environ.get(env_key)
    # Must be in place before _seed_inner imports anything path-dependent.
    os.environ[env_key] = str(data_root)
    try:
        summary = _seed_inner(
            data_root=data_root, language=language, clear=clear, force=force
        )
    finally:
        if saved is not None:
            os.environ[env_key] = saved
        else:
            os.environ.pop(env_key, None)
    return summary
def _seed_inner(*, data_root: Path, language: str, clear: bool, force: bool) -> dict:
    """Do the actual seeding work; expects the data-root env var to be set.

    Steps, in order: open the search DB under ``data_root``, refuse if real
    content is present (unless ``force``), optionally remove the sample rows,
    (re)insert the sample subtitles into ``subtitles_fts``, index scripture
    references from the subtitle text, then write the media catalog JSON.

    Returns a summary dict with the data root, language, item counts and the
    paths of the DB and media JSON.

    Raises:
        RuntimeError: the target already contains non-sample data and
            ``force`` is false.
    """
    import search  # noqa: E402 (imported after env setup, by design)

    search_db_path = os.path.join(str(data_root), "database.db")
    instance = search.SubtitleSearch(db_path=search_db_path)
    media_path = data_root / "json" / language / "all_media_items.json"

    # Safety guard: with --force the inspection is skipped entirely.
    real = None if force else _describe_real_data(instance, language, media_path)
    if real:
        raise RuntimeError(
            f"Refusing to seed: {data_root} already contains real data "
            f"({real}). Seeding would overwrite the media catalog and mix "
            f"fake videos into a real index.\n"
            f"Point --data-root at a throwaway directory (the default is "
            f"the gitignored ./sample-data), or pass --force if you really "
            f"mean to seed this location."
        )

    if clear:
        # Remove only the sample keys. A language-wide clear would wipe a
        # developer's entire real subtitle index if this ever ran against a
        # real data root; deleting just the sample rows gives a clean
        # re-seed without touching real content.
        for entry in SAMPLE_ITEMS:
            sample_key = entry["natural_key"]
            instance.remove_subtitle(sample_key, language)
            instance.remove_scripture_references_for_video(sample_key, language)
        print(f"Cleared {len(SAMPLE_ITEMS)} sample subtitle rows for language {language}.")

    # Subtitle text goes straight into the FTS table; delete-then-insert keeps
    # a repeated run from duplicating a row for the same key/language.
    db = instance._get_db_connection()
    try:
        for entry in SAMPLE_ITEMS:
            row_key = (entry["natural_key"], language)
            db.execute(
                "DELETE FROM subtitles_fts WHERE natural_key = ? AND language = ?",
                row_key,
            )
            db.execute(
                "INSERT INTO subtitles_fts (natural_key, language, content) VALUES (?, ?, ?)",
                row_key + (entry["subtitles"],),
            )
        db.commit()
    finally:
        db.close()
    # Every item was written once if we got here without an exception.
    indexed_subtitles = len(SAMPLE_ITEMS)

    # Scripture references (model-free regex parse + index).
    scripture_refs = sum(
        instance.index_scripture_from_text(
            entry["natural_key"], language, entry["subtitles"]
        )
        for entry in SAMPLE_ITEMS
    )

    # Media metadata for title search + result decoration. (The guard above
    # has already ensured media_path holds no real catalog entries.)
    media = {}
    for entry in SAMPLE_ITEMS:
        media[entry["natural_key"]] = _build_media_record(entry)
    media_path.parent.mkdir(parents=True, exist_ok=True)
    with media_path.open("w", encoding="utf-8") as handle:
        json.dump(media, handle, indent=2)

    summary = {
        "data_root": str(data_root),
        "language": language,
        "media_items": len(media),
        "subtitles_indexed": indexed_subtitles,
        "scripture_references": scripture_refs,
        "search_db": search_db_path,
        "media_json": str(media_path),
    }
    return summary
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse options, seed, print a summary.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour; passing a
            list lets the CLI be driven programmatically.

    Returns:
        Process exit status (0 on success). A refusal from ``seed`` is not
        caught here and propagates as ``RuntimeError``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a bullet list and indented usage
        # examples; the default formatter would re-wrap them into one
        # paragraph in --help, so print the description as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--data-root",
        default=str(REPO_ROOT / "sample-data"),
        help=(
            "Where to write the sample DB + json. Defaults to the gitignored "
            "./sample-data so it never touches a real database. Pass an "
            "explicit path to target elsewhere."
        ),
    )
    parser.add_argument("--language", default="E", help="Language code (default: E).")
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Remove the sample rows first (scoped to sample keys only).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help=(
            "Seed even if the target data root already contains real "
            "(non-sample) content. Dangerous — only for deliberate use."
        ),
    )
    args = parser.parse_args(argv)

    # Create the data root up front so the DB file can be placed inside it.
    data_root = Path(args.data_root).resolve()
    data_root.mkdir(parents=True, exist_ok=True)

    summary = seed(
        data_root=data_root,
        language=args.language,
        clear=args.clear,
        force=args.force,
    )

    print("Seeded sample content:")
    for key, value in summary.items():
        print(f" {key:22} {value}")
    print(
        "\nKeyword, title, and scripture search now return results. "
        "Semantic/hybrid search still needs the embedding model."
    )
    return 0
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())