| |
| """Seed a small, representative content set for local dev / CI / demos. |
| |
| The real Search-UI databases are multi-GB and built from JW.org downloads, |
| and the semantic index needs a sentence-transformers model that may not be |
| available offline. That makes it hard to exercise the search experience |
| end-to-end without the production data. |
| |
| This script seeds just enough **keyword-searchable** content to drive the |
| app for real: |
| |
| * writes ``json/<lang>/all_media_items.json`` so title search and result |
| decoration work, |
| * inserts subtitle text straight into the ``subtitles_fts`` FTS5 table |
| (bypassing embeddings, so no model download is required), |
| * indexes scripture references from that text (pure regex, model-free). |
| |
| Keyword search, title search, and scripture search all return real results |
| afterwards. Semantic/hybrid search still needs the embedding model and is |
| intentionally out of scope here. |
| |
| The sample set is fixed and deterministic so a golden snapshot taken |
| against it is stable across runs. |
| |
| Usage:: |
| |
| # seed into the default throwaway ./sample-data dir (gitignored) |
| python scripts/seed_sample_data.py |
| # seed a specific throwaway location |
| python scripts/seed_sample_data.py --data-root /tmp/searchui-sample |
| # re-clear the sample rows first (scoped to sample keys only) |
| python scripts/seed_sample_data.py --clear |
| |
| Safety: the default data root is ./sample-data, never your real database. |
| If you point --data-root at a location that already holds real (non-sample) |
| content, the script REFUSES to run unless you pass --force — it will not |
| overwrite a production media catalog or mix fake videos into a real index. |
| Nothing it writes is committed to git (sample-data/, json/ and *.db are |
| gitignored). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import sys |
| from pathlib import Path |
|
|
| REPO_ROOT = Path(__file__).resolve().parent.parent |
| sys.path.insert(0, str(REPO_ROOT / "backend")) |
|
|
|
|
| |
| |
| |
| |
| |
# Fixed, deterministic sample catalog used for seeding.
#
# Each entry is a plain dict with four string fields:
#   natural_key      - identifier used as the subtitles_fts row key and as the
#                      key of the entry in all_media_items.json
#   title            - display title copied into the media record
#   primaryCategory  - category copied into the media record
#   subtitles        - subtitle text inserted into subtitles_fts and passed to
#                      the scripture indexer
#
# Most subtitle texts deliberately cite a Bible verse (e.g. "Daniel 2:44") so
# that scripture indexing has something to find.
SAMPLE_ITEMS: list[dict] = [
    {
        "natural_key": "pub-jwbcov_E_1_VIDEO",
        "title": "What Is God's Kingdom?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "God's Kingdom is a real government in heaven. Jesus taught his "
            "followers to pray for the Kingdom to come. At Daniel 2:44 we read "
            "that the Kingdom will never be destroyed and will crush all human "
            "governments. The Kingdom will bring peace to the whole earth."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_2_VIDEO",
        "title": "Why Do We Share in the Field Ministry?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "Jehovah's Witnesses share in the field ministry because Jesus "
            "commanded his disciples to preach the good news of the Kingdom. "
            "Field service gives everyone an opportunity to help others learn "
            "the truth. At Matthew 28:19, 20 Jesus said to make disciples of "
            "people of all the nations."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_3_VIDEO",
        "title": "How Can You Trust the Bible?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "The Bible has been translated into thousands of languages. The "
            "New World Translation is accurate and easy to understand. Second "
            "Timothy 3:16 explains that all Scripture is inspired of God and "
            "beneficial for teaching. You can trust the Bible because its "
            "prophecies have come true."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_4_VIDEO",
        "title": "God's Love for Us",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "John 3:16 tells us that God loved the world so much that he gave "
            "his only-begotten Son. God's love moves us to want to know him "
            "better and to show love to our neighbors. Love is the greatest "
            "quality, as explained at 1 Corinthians 13."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_5_VIDEO",
        "title": "Enjoy Life Forever in Paradise",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "The Bible promises that the earth will become a paradise. "
            "Revelation 21:4 says God will wipe out every tear, and death will "
            "be no more. Psalm 37:29 promises that the righteous will possess "
            "the earth and live forever on it."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_201_VIDEO",
        "title": "JW Broadcasting—Bible Principles for Family Life",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Applying Bible principles strengthens family life. Husbands and "
            "wives who follow the Bible's counsel build strong marriages. "
            "Ephesians 5:33 counsels husbands to love their wives and wives to "
            "respect their husbands. Children who honor their parents are "
            "happier."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_202_VIDEO",
        "title": "Cart Witnessing in the Community",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Public witnessing with carts lets people approach at their own "
            "pace. Many take a free Bible study or a magazine. The field "
            "ministry reaches people in busy public places. Volunteers stand "
            "ready to answer Bible questions about the Kingdom hope."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_203_VIDEO",
        "title": "Memorial of Jesus' Death",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Each year millions gather for the Memorial of Jesus' death. Jesus "
            "said at Luke 22:19 to keep doing this in remembrance of him. The "
            "emblems of unleavened bread and red wine represent his body and "
            "blood. The Memorial reminds us of God's love and the ransom."
        ),
    },
    {
        "natural_key": "pub-ihelp_E_7_VIDEO",
        "title": "Coping With Anxiety",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "Everyone feels anxious sometimes. Prayer can help us cope with "
            "anxiety. Philippians 4:6, 7 encourages us to bring our worries to "
            "God in prayer, and the peace of God will guard our hearts. "
            "Casting our anxiety on Jehovah brings real comfort."
        ),
    },
    {
        "natural_key": "pub-ihelp_E_8_VIDEO",
        "title": "Finding True Happiness",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "True happiness comes from helping others and knowing God. Jesus "
            "said at Acts 20:35 that there is more happiness in giving than in "
            "receiving. Spiritual things bring lasting joy that money cannot "
            "buy."
        ),
    },
]
|
|
|
|
# Immutable set of every sample item's natural key. The real-data guard treats
# any key outside this set as non-sample content.
SAMPLE_KEYS: frozenset[str] = frozenset(
    entry["natural_key"] for entry in SAMPLE_ITEMS
)
|
|
|
|
| def _build_media_record(item: dict) -> dict: |
| """Shape one item the way all_media_items.json entries look.""" |
| return { |
| "title": item["title"], |
| "languageAgnosticNaturalKey": item["natural_key"], |
| "primaryCategory": item.get("primaryCategory", "VODBibleTeachings"), |
| "_category_key": "VideoOnDemand", |
| "_subcategory": item.get("primaryCategory", "VODBibleTeachings"), |
| "duration": 300, |
| "firstPublished": "2024-01-01T00:00:00Z", |
| } |
|
|
|
|
| def _describe_real_data(instance, language: str, media_path: Path) -> str | None: |
| """Return a human description of REAL (non-sample) content found at the |
| target, or None if the target is empty / sample-only. |
| |
| Used to refuse seeding over a developer's production database — the |
| sample keys are a fixed known set, so anything else is real data. |
| """ |
| conn = instance._get_db_connection() |
| try: |
| rows = conn.execute( |
| "SELECT DISTINCT natural_key FROM subtitles_fts WHERE language = ?", |
| (language,), |
| ).fetchall() |
| finally: |
| conn.close() |
| real_subs = [r[0] for r in rows if r[0] not in SAMPLE_KEYS] |
| if real_subs: |
| return f"{len(real_subs)} non-sample subtitle row(s) already indexed for language {language}" |
|
|
| if media_path.exists(): |
| try: |
| existing = json.loads(media_path.read_text(encoding="utf-8")) |
| real_media = [k for k in existing if k not in SAMPLE_KEYS] |
| if real_media: |
| return f"{len(real_media)} non-sample entrie(s) in existing {media_path.name}" |
| except (json.JSONDecodeError, OSError): |
| return f"an existing {media_path.name} that could not be parsed (refusing to overwrite blindly)" |
| return None |
|
|
|
|
def seed(*, data_root: Path, language: str, clear: bool, force: bool = False) -> dict:
    """Seed the sample content and return a summary dict.

    Refuses to run if the target data root already holds real (non-sample)
    content, unless ``force=True``: seeding is for throwaway dev/CI/demo
    roots, never for a production database.

    The ``SEARCH_UI_DATA_ROOT`` environment variable is pointed at
    ``data_root`` for the duration of the call and put back to its previous
    state afterwards, whether seeding succeeds or raises.
    NOTE(review): presumably the backend ``search`` module reads this
    variable to locate its data; confirm against that module.

    Args:
        data_root: Directory that receives the database and JSON catalog.
        language: Language code to seed.
        clear: Remove existing sample rows before seeding.
        force: Skip the real-data safety check.

    Returns:
        The summary dict produced by ``_seed_inner``.
    """
    env_key = "SEARCH_UI_DATA_ROOT"
    saved_value = os.environ.get(env_key)
    os.environ[env_key] = str(data_root)
    try:
        summary = _seed_inner(
            data_root=data_root, language=language, clear=clear, force=force
        )
    finally:
        # Restore the caller's environment exactly: re-set the old value, or
        # remove the variable if it was not set before.
        if saved_value is not None:
            os.environ[env_key] = saved_value
        else:
            os.environ.pop(env_key, None)
    return summary
|
|
|
|
def _seed_inner(*, data_root: Path, language: str, clear: bool, force: bool) -> dict:
    """Do the actual seeding work for ``seed``.

    Steps, in order:
      1. Open a ``search.SubtitleSearch`` on ``<data_root>/database.db``.
      2. Unless ``force`` is set, raise if real (non-sample) data is found.
      3. If ``clear`` is set, remove the sample subtitle rows and scripture
         references through the search instance.
      4. Delete and re-insert one ``subtitles_fts`` row per sample item.
      5. Index scripture references from each item's subtitle text.
      6. Overwrite ``json/<language>/all_media_items.json`` with the sample
         catalog.

    Args:
        data_root: Directory that receives the database and JSON catalog.
        language: Language code to seed.
        clear: Remove existing sample rows before seeding.
        force: Skip the real-data safety check.

    Returns:
        Summary dict with the data root, language, item counts, and the
        database / JSON paths.

    Raises:
        RuntimeError: If real data is present and ``force`` is false.
    """
    # Imported here rather than at module top.
    # NOTE(review): presumably so the import happens after seed() has set
    # SEARCH_UI_DATA_ROOT; confirm against the search module.
    import search

    search_db_path = os.path.join(str(data_root), "database.db")
    instance = search.SubtitleSearch(db_path=search_db_path)
    media_path = data_root / "json" / language / "all_media_items.json"

    # Safety gate: with force the check is skipped entirely.
    real = None if force else _describe_real_data(instance, language, media_path)
    if real:
        raise RuntimeError(
            f"Refusing to seed: {data_root} already contains real data "
            f"({real}). Seeding would overwrite the media catalog and mix "
            f"fake videos into a real index.\n"
            f"Point --data-root at a throwaway directory (the default is "
            f"the gitignored ./sample-data), or pass --force if you really "
            f"mean to seed this location."
        )

    if clear:
        # Scoped to the sample keys only; other rows are never touched.
        for item in SAMPLE_ITEMS:
            sample_key = item["natural_key"]
            instance.remove_subtitle(sample_key, language)
            instance.remove_scripture_references_for_video(sample_key, language)
        print(f"Cleared {len(SAMPLE_ITEMS)} sample subtitle rows for language {language}.")

    delete_sql = "DELETE FROM subtitles_fts WHERE natural_key = ? AND language = ?"
    insert_sql = "INSERT INTO subtitles_fts (natural_key, language, content) VALUES (?, ?, ?)"

    conn = instance._get_db_connection()
    indexed_subtitles = 0
    try:
        # Delete-then-insert keeps exactly one row per sample key on re-runs.
        for indexed_subtitles, item in enumerate(SAMPLE_ITEMS, start=1):
            sample_key = item["natural_key"]
            conn.execute(delete_sql, (sample_key, language))
            conn.execute(insert_sql, (sample_key, language, item["subtitles"]))
        conn.commit()
    finally:
        conn.close()

    # NOTE(review): existing scripture references are only removed when
    # ``clear`` is set; whether a re-run without it duplicates them depends on
    # index_scripture_from_text, which is not visible here.
    scripture_refs = sum(
        instance.index_scripture_from_text(
            item["natural_key"], language, item["subtitles"]
        )
        for item in SAMPLE_ITEMS
    )

    media = {}
    for item in SAMPLE_ITEMS:
        media[item["natural_key"]] = _build_media_record(item)
    media_path.parent.mkdir(parents=True, exist_ok=True)
    with open(media_path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(media, indent=2))

    summary = {
        "data_root": str(data_root),
        "language": language,
        "media_items": len(media),
        "subtitles_indexed": indexed_subtitles,
        "scripture_references": scripture_refs,
        "search_db": search_db_path,
        "media_json": str(media_path),
    }
    return summary
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Parse command-line options, seed the sample data, and print a summary.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected; passing a list lets the CLI be driven from code.

    Returns:
        ``0`` once seeding has completed. A refusal raised by ``seed``
        propagates as ``RuntimeError``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a bullet list and usage examples; the
        # default formatter would reflow them into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--data-root",
        default=str(REPO_ROOT / "sample-data"),
        help=(
            "Where to write the sample DB + json. Defaults to the gitignored "
            "./sample-data so it never touches a real database. Pass an "
            "explicit path to target elsewhere."
        ),
    )
    parser.add_argument("--language", default="E", help="Language code (default: E).")
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Remove the sample rows first (scoped to sample keys only).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help=(
            "Seed even if the target data root already contains real "
            "(non-sample) content. Dangerous — only for deliberate use."
        ),
    )
    args = parser.parse_args(argv)

    # Created up front so the search database can be opened inside it.
    data_root = Path(args.data_root).resolve()
    data_root.mkdir(parents=True, exist_ok=True)

    summary = seed(
        data_root=data_root,
        language=args.language,
        clear=args.clear,
        force=args.force,
    )

    print("Seeded sample content:")
    for key, value in summary.items():
        print(f" {key:22} {value}")
    print(
        "\nKeyword, title, and scripture search now return results. "
        "Semantic/hybrid search still needs the embedding model."
    )
    return 0
|
|
|
|
| if __name__ == "__main__": |
| raise SystemExit(main()) |
|
|