#!/usr/bin/env python3
"""Seed a small, representative content set for local dev / CI / demos.

The real Search-UI databases are multi-GB and built from JW.org downloads, and
the semantic index needs a sentence-transformers model that may not be
available offline. That makes it hard to exercise the search experience
end-to-end without the production data.

This script seeds just enough **keyword-searchable** content to drive the app
for real:

  * writes ``json/<language>/all_media_items.json`` so title search and result
    decoration work,
  * inserts subtitle text straight into the ``subtitles_fts`` FTS5 table
    (bypassing embeddings, so no model download is required),
  * indexes scripture references from that text (pure regex, model-free).

Keyword search, title search, and scripture search all return real results
afterwards. Semantic/hybrid search still needs the embedding model and is
intentionally out of scope here.

The sample set is fixed and deterministic so a golden snapshot taken against
it is stable across runs.

Usage::

    # seed into the default throwaway ./sample-data dir (gitignored)
    python scripts/seed_sample_data.py

    # seed a specific throwaway location
    python scripts/seed_sample_data.py --data-root /tmp/searchui-sample

    # re-clear the sample rows first (scoped to sample keys only)
    python scripts/seed_sample_data.py --clear

Safety: the default data root is ./sample-data, never your real database. If
you point --data-root at a location that already holds real (non-sample)
content, the script REFUSES to run unless you pass --force — it will not
overwrite a production media catalog or mix fake videos into a real index.
Nothing it writes is committed to git (sample-data/, json/ and *.db are
gitignored).
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT / "backend"))


class SeedRefusedError(RuntimeError):
    """Raised when the target data root already holds real (non-sample) content.

    Subclasses ``RuntimeError`` so callers that already catch ``RuntimeError``
    around :func:`seed` keep working unchanged.
    """


# A fixed, representative slice of JW.org-style VOD content. Subtitle text is
# written to read naturally and to contain the vocabulary + scripture
# references the golden queries probe (kingdom, field service, New World
# Translation, John 3:16, ...). natural_key mirrors the real
# languageAgnosticNaturalKey style (pub_track_VIDEO).
SAMPLE_ITEMS: list[dict] = [
    {
        "natural_key": "pub-jwbcov_E_1_VIDEO",
        "title": "What Is God's Kingdom?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "God's Kingdom is a real government in heaven. Jesus taught his "
            "followers to pray for the Kingdom to come. At Daniel 2:44 we read "
            "that the Kingdom will never be destroyed and will crush all human "
            "governments. The Kingdom will bring peace to the whole earth."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_2_VIDEO",
        "title": "Why Do We Share in the Field Ministry?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "Jehovah's Witnesses share in the field ministry because Jesus "
            "commanded his disciples to preach the good news of the Kingdom. "
            "Field service gives everyone an opportunity to help others learn "
            "the truth. At Matthew 28:19, 20 Jesus said to make disciples of "
            "people of all the nations."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_3_VIDEO",
        "title": "How Can You Trust the Bible?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "The Bible has been translated into thousands of languages. The "
            "New World Translation is accurate and easy to understand. Second "
            "Timothy 3:16 explains that all Scripture is inspired of God and "
            "beneficial for teaching. You can trust the Bible because its "
            "prophecies have come true."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_4_VIDEO",
        "title": "God's Love for Us",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "John 3:16 tells us that God loved the world so much that he gave "
            "his only-begotten Son. God's love moves us to want to know him "
            "better and to show love to our neighbors. Love is the greatest "
            "quality, as explained at 1 Corinthians 13."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_5_VIDEO",
        "title": "Enjoy Life Forever in Paradise",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "The Bible promises that the earth will become a paradise. "
            "Revelation 21:4 says God will wipe out every tear, and death will "
            "be no more. Psalm 37:29 promises that the righteous will possess "
            "the earth and live forever on it."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_201_VIDEO",
        "title": "JW Broadcasting—Bible Principles for Family Life",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Applying Bible principles strengthens family life. Husbands and "
            "wives who follow the Bible's counsel build strong marriages. "
            "Ephesians 5:33 counsels husbands to love their wives and wives to "
            "respect their husbands. Children who honor their parents are "
            "happier."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_202_VIDEO",
        "title": "Cart Witnessing in the Community",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Public witnessing with carts lets people approach at their own "
            "pace. Many take a free Bible study or a magazine. The field "
            "ministry reaches people in busy public places. Volunteers stand "
            "ready to answer Bible questions about the Kingdom hope."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_203_VIDEO",
        "title": "Memorial of Jesus' Death",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Each year millions gather for the Memorial of Jesus' death. Jesus "
            "said at Luke 22:19 to keep doing this in remembrance of him. The "
            "emblems of unleavened bread and red wine represent his body and "
            "blood. The Memorial reminds us of God's love and the ransom."
        ),
    },
    {
        "natural_key": "pub-ihelp_E_7_VIDEO",
        "title": "Coping With Anxiety",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "Everyone feels anxious sometimes. Prayer can help us cope with "
            "anxiety. Philippians 4:6, 7 encourages us to bring our worries to "
            "God in prayer, and the peace of God will guard our hearts. "
            "Casting our anxiety on Jehovah brings real comfort."
        ),
    },
    {
        "natural_key": "pub-ihelp_E_8_VIDEO",
        "title": "Finding True Happiness",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "True happiness comes from helping others and knowing God. Jesus "
            "said at Acts 20:35 that there is more happiness in giving than in "
            "receiving. Spiritual things bring lasting joy that money cannot "
            "buy."
        ),
    },
]

# The fixed set of sample keys; anything outside it is treated as real data.
SAMPLE_KEYS: frozenset[str] = frozenset(item["natural_key"] for item in SAMPLE_ITEMS)


def _build_media_record(item: dict) -> dict:
    """Shape one item the way all_media_items.json entries look.

    ``item`` must carry ``title`` and ``natural_key``; ``primaryCategory`` is
    optional and falls back to ``"VODBibleTeachings"``. Duration and publish
    date are fixed placeholders so the output stays deterministic.
    """
    category = item.get("primaryCategory", "VODBibleTeachings")
    return {
        "title": item["title"],
        "languageAgnosticNaturalKey": item["natural_key"],
        "primaryCategory": category,
        "_category_key": "VideoOnDemand",
        "_subcategory": category,
        "duration": 300,
        "firstPublished": "2024-01-01T00:00:00Z",
    }


def _describe_real_data(instance, language: str, media_path: Path) -> str | None:
    """Return a human description of REAL (non-sample) content found at the
    target, or None if the target is empty / sample-only.

    Used to refuse seeding over a developer's production database — the sample
    keys are a fixed known set, so anything else is real data.

    ``instance`` only needs a ``_get_db_connection()`` method returning a
    DB-API connection with a ``subtitles_fts`` table. A media file that cannot
    be read, decoded, or parsed — or whose top level is not a JSON object —
    is reported as real data, because overwriting it blindly could destroy
    content this check was unable to inspect.
    """
    conn = instance._get_db_connection()
    try:
        rows = conn.execute(
            "SELECT DISTINCT natural_key FROM subtitles_fts WHERE language = ?",
            (language,),
        ).fetchall()
    finally:
        conn.close()

    real_subs = [row[0] for row in rows if row[0] not in SAMPLE_KEYS]
    if real_subs:
        return f"{len(real_subs)} non-sample subtitle row(s) already indexed for language {language}"

    if not media_path.exists():
        return None

    try:
        existing = json.loads(media_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError that read_text() raises on a non-UTF-8 file.
        return f"an existing {media_path.name} that could not be parsed (refusing to overwrite blindly)"

    if not isinstance(existing, dict):
        if not existing:
            # null / [] / "" hold no entries, so there is nothing to protect.
            return None
        # The seeder writes an object keyed by natural key; any other
        # populated layout cannot be checked against SAMPLE_KEYS.
        return (
            f"an existing {media_path.name} with an unexpected (non-object) "
            f"layout (refusing to overwrite blindly)"
        )

    real_media = [key for key in existing if key not in SAMPLE_KEYS]
    if real_media:
        return f"{len(real_media)} non-sample entrie(s) in existing {media_path.name}"
    return None


def seed(*, data_root: Path, language: str, clear: bool, force: bool = False) -> dict:
    """Seed the sample content. Returns a summary dict.

    Refuses to run if the target data root already holds real (non-sample)
    content, unless ``force=True`` — seeding is for throwaway dev/CI/demo
    roots, never for a production database.

    Raises:
        SeedRefusedError: (a ``RuntimeError``) when real content is found and
            ``force`` is false.
    """
    # Point the runtime path helpers at the chosen data root before importing
    # anything that resolves DB/JSON locations. Restore it afterwards so a
    # caller in the same process (e.g. a test) doesn't inherit it.
    _prev_data_root = os.environ.get("SEARCH_UI_DATA_ROOT")
    os.environ["SEARCH_UI_DATA_ROOT"] = str(data_root)
    try:
        return _seed_inner(
            data_root=data_root, language=language, clear=clear, force=force
        )
    finally:
        if _prev_data_root is None:
            os.environ.pop("SEARCH_UI_DATA_ROOT", None)
        else:
            os.environ["SEARCH_UI_DATA_ROOT"] = _prev_data_root


def _seed_inner(*, data_root: Path, language: str, clear: bool, force: bool) -> dict:
    """Do the actual seeding; expects SEARCH_UI_DATA_ROOT to be set by ``seed``.

    Order matters: the real-data guard runs first, then the optional scoped
    clear, then the subtitle rows, scripture references, and media JSON.
    """
    import search  # noqa: E402  (imported after env setup, by design)

    search_db_path = os.path.join(str(data_root), "database.db")
    instance = search.SubtitleSearch(db_path=search_db_path)
    media_path = data_root / "json" / language / "all_media_items.json"

    if not force:
        real = _describe_real_data(instance, language, media_path)
        if real:
            raise SeedRefusedError(
                f"Refusing to seed: {data_root} already contains real data "
                f"({real}). Seeding would overwrite the media catalog and mix "
                f"fake videos into a real index.\n"
                f"Point --data-root at a throwaway directory (the default is "
                f"the gitignored ./sample-data), or pass --force if you really "
                f"mean to seed this location."
            )

    if clear:
        # Scope the clear to the SAMPLE keys only. A language-wide clear
        # would wipe a developer's entire real subtitle index for this
        # language if the script were pointed (with --force) at a real data
        # root. Removing just the sample rows gives a clean re-seed without
        # touching real content.
        for item in SAMPLE_ITEMS:
            instance.remove_subtitle(item["natural_key"], language)
            instance.remove_scripture_references_for_video(item["natural_key"], language)
        print(f"Cleared {len(SAMPLE_ITEMS)} sample subtitle rows for language {language}.")

    conn = instance._get_db_connection()
    indexed_subtitles = 0
    try:
        for item in SAMPLE_ITEMS:
            # Delete-then-insert keeps re-runs idempotent: one row per key.
            conn.execute(
                "DELETE FROM subtitles_fts WHERE natural_key = ? AND language = ?",
                (item["natural_key"], language),
            )
            conn.execute(
                "INSERT INTO subtitles_fts (natural_key, language, content) VALUES (?, ?, ?)",
                (item["natural_key"], language, item["subtitles"]),
            )
            indexed_subtitles += 1
        conn.commit()
    finally:
        conn.close()

    # Scripture references (model-free regex parse + index).
    scripture_refs = sum(
        instance.index_scripture_from_text(
            item["natural_key"], language, item["subtitles"]
        )
        for item in SAMPLE_ITEMS
    )

    # Media metadata for title search + result decoration. (Unless --force was
    # given, the guard above has already ensured media_path holds no real
    # catalog entries.)
    media = {item["natural_key"]: _build_media_record(item) for item in SAMPLE_ITEMS}
    media_path.parent.mkdir(parents=True, exist_ok=True)
    with open(media_path, "w", encoding="utf-8") as handle:
        json.dump(media, handle, indent=2)

    return {
        "data_root": str(data_root),
        "language": language,
        "media_items": len(media),
        "subtitles_indexed": indexed_subtitles,
        "scripture_references": scripture_refs,
        "search_db": search_db_path,
        "media_json": str(media_path),
    }


def main() -> int:
    """Parse CLI arguments, seed the sample data, and print a summary.

    Returns the process exit status: 0 on success, 1 when seeding was refused
    because the target already contains real data.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains indented usage examples; the default
        # formatter would re-wrap them into one unreadable paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--data-root",
        default=str(REPO_ROOT / "sample-data"),
        help=(
            "Where to write the sample DB + json. Defaults to the gitignored "
            "./sample-data so it never touches a real database. Pass an "
            "explicit path to target elsewhere."
        ),
    )
    parser.add_argument("--language", default="E", help="Language code (default: E).")
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Remove the sample rows first (scoped to sample keys only).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help=(
            "Seed even if the target data root already contains real "
            "(non-sample) content. Dangerous — only for deliberate use."
        ),
    )
    args = parser.parse_args()

    data_root = Path(args.data_root).resolve()
    data_root.mkdir(parents=True, exist_ok=True)

    try:
        summary = seed(
            data_root=data_root,
            language=args.language,
            clear=args.clear,
            force=args.force,
        )
    except SeedRefusedError as exc:
        # An expected, user-facing refusal: show the message, not a traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    print("Seeded sample content:")
    for key, value in summary.items():
        print(f" {key:22} {value}")
    print(
        "\nKeyword, title, and scripture search now return results. "
        "Semantic/hybrid search still needs the embedding model."
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())