#!/usr/bin/env python3
"""Seed a small, representative content set for local dev / CI / demos.

The real Search-UI databases are multi-GB and built from JW.org downloads,
and the semantic index needs a sentence-transformers model that may not be
available offline. That makes it hard to exercise the search experience
end-to-end without the production data.

This script seeds just enough **keyword-searchable** content to drive the
app for real:

* writes ``json/<lang>/all_media_items.json`` so title search and result
  decoration work,
* inserts subtitle text straight into the ``subtitles_fts`` FTS5 table
  (bypassing embeddings, so no model download is required),
* indexes scripture references from that text (pure regex, model-free).

Keyword search, title search, and scripture search all return real results
afterwards. Semantic/hybrid search still needs the embedding model and is
intentionally out of scope here.

The sample set is fixed and deterministic so a golden snapshot taken
against it is stable across runs.

Usage::

    # seed into the default throwaway ./sample-data dir (gitignored)
    python scripts/seed_sample_data.py

    # seed a specific throwaway location
    python scripts/seed_sample_data.py --data-root /tmp/searchui-sample

    # clear the sample rows first, then re-seed (scoped to sample keys only)
    python scripts/seed_sample_data.py --clear

Safety: the default data root is ./sample-data, never your real database.
If you point --data-root at a location that already holds real (non-sample)
content, the script REFUSES to run unless you pass --force — it will not
overwrite a production media catalog or mix fake videos into a real index.
Nothing it writes is committed to git (sample-data/, json/ and *.db are
gitignored).
"""
from __future__ import annotations
import argparse
import json
import os
import sys
from pathlib import Path
# Repository root: the resolved location of this file, two directory levels up.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Put <repo>/backend at the front of sys.path — presumably where the ``search``
# module imported lazily inside _seed_inner() lives (confirm against the repo
# layout).
sys.path.insert(0, str(REPO_ROOT / "backend"))
# A fixed, representative slice of JW.org-style VOD content. Subtitle text is
# written to read naturally and to contain the vocabulary + scripture
# references the golden queries probe (kingdom, field service, New World
# Translation, John 3:16, ...). natural_key mirrors the real
# languageAgnosticNaturalKey style (pub_track_VIDEO).
# Each entry carries:
#   natural_key      - key used for the subtitles_fts row and the catalog entry
#   title            - copied into the media catalog record
#   primaryCategory  - copied into the media catalog record
#   subtitles        - text inserted verbatim into subtitles_fts and passed to
#                      the scripture-reference indexer
# Do not edit these strings casually: the module docstring promises a fixed,
# deterministic set so golden snapshots stay stable.
SAMPLE_ITEMS: list[dict] = [
    {
        "natural_key": "pub-jwbcov_E_1_VIDEO",
        "title": "What Is God's Kingdom?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "God's Kingdom is a real government in heaven. Jesus taught his "
            "followers to pray for the Kingdom to come. At Daniel 2:44 we read "
            "that the Kingdom will never be destroyed and will crush all human "
            "governments. The Kingdom will bring peace to the whole earth."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_2_VIDEO",
        "title": "Why Do We Share in the Field Ministry?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "Jehovah's Witnesses share in the field ministry because Jesus "
            "commanded his disciples to preach the good news of the Kingdom. "
            "Field service gives everyone an opportunity to help others learn "
            "the truth. At Matthew 28:19, 20 Jesus said to make disciples of "
            "people of all the nations."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_3_VIDEO",
        "title": "How Can You Trust the Bible?",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "The Bible has been translated into thousands of languages. The "
            "New World Translation is accurate and easy to understand. Second "
            "Timothy 3:16 explains that all Scripture is inspired of God and "
            "beneficial for teaching. You can trust the Bible because its "
            "prophecies have come true."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_4_VIDEO",
        "title": "God's Love for Us",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "John 3:16 tells us that God loved the world so much that he gave "
            "his only-begotten Son. God's love moves us to want to know him "
            "better and to show love to our neighbors. Love is the greatest "
            "quality, as explained at 1 Corinthians 13."
        ),
    },
    {
        "natural_key": "pub-jwbcov_E_5_VIDEO",
        "title": "Enjoy Life Forever in Paradise",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "The Bible promises that the earth will become a paradise. "
            "Revelation 21:4 says God will wipe out every tear, and death will "
            "be no more. Psalm 37:29 promises that the righteous will possess "
            "the earth and live forever on it."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_201_VIDEO",
        "title": "JW Broadcasting—Bible Principles for Family Life",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Applying Bible principles strengthens family life. Husbands and "
            "wives who follow the Bible's counsel build strong marriages. "
            "Ephesians 5:33 counsels husbands to love their wives and wives to "
            "respect their husbands. Children who honor their parents are "
            "happier."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_202_VIDEO",
        "title": "Cart Witnessing in the Community",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Public witnessing with carts lets people approach at their own "
            "pace. Many take a free Bible study or a magazine. The field "
            "ministry reaches people in busy public places. Volunteers stand "
            "ready to answer Bible questions about the Kingdom hope."
        ),
    },
    {
        "natural_key": "pub-jwbvod24_E_203_VIDEO",
        "title": "Memorial of Jesus' Death",
        "primaryCategory": "VODProgramsEvents",
        "subtitles": (
            "Each year millions gather for the Memorial of Jesus' death. Jesus "
            "said at Luke 22:19 to keep doing this in remembrance of him. The "
            "emblems of unleavened bread and red wine represent his body and "
            "blood. The Memorial reminds us of God's love and the ransom."
        ),
    },
    {
        "natural_key": "pub-ihelp_E_7_VIDEO",
        "title": "Coping With Anxiety",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "Everyone feels anxious sometimes. Prayer can help us cope with "
            "anxiety. Philippians 4:6, 7 encourages us to bring our worries to "
            "God in prayer, and the peace of God will guard our hearts. "
            "Casting our anxiety on Jehovah brings real comfort."
        ),
    },
    {
        "natural_key": "pub-ihelp_E_8_VIDEO",
        "title": "Finding True Happiness",
        "primaryCategory": "VODBibleTeachings",
        "subtitles": (
            "True happiness comes from helping others and knowing God. Jesus "
            "said at Acts 20:35 that there is more happiness in giving than in "
            "receiving. Spiritual things bring lasting joy that money cannot "
            "buy."
        ),
    },
]

# Immutable set of every sample natural_key. The overwrite guard treats any
# key outside this set as real (non-sample) data.
SAMPLE_KEYS: frozenset[str] = frozenset(item["natural_key"] for item in SAMPLE_ITEMS)
def _build_media_record(item: dict) -> dict:
    """Build the ``all_media_items.json`` entry for one sample item.

    Args:
        item: A sample item with ``title`` and ``natural_key`` keys and an
            optional ``primaryCategory`` (falls back to
            ``"VODBibleTeachings"``).

    Returns:
        A new dict holding the catalog fields. Duration and publication date
        are fixed placeholder values; the category is mirrored into
        ``_subcategory``.

    Raises:
        KeyError: If ``title`` or ``natural_key`` is missing.
    """
    title = item["title"]
    natural_key = item["natural_key"]
    category = item.get("primaryCategory", "VODBibleTeachings")

    record = dict(
        title=title,
        languageAgnosticNaturalKey=natural_key,
        primaryCategory=category,
        _category_key="VideoOnDemand",
        _subcategory=category,
        duration=300,
        firstPublished="2024-01-01T00:00:00Z",
    )
    return record
def _describe_real_data(instance, language: str, media_path: Path) -> str | None:
    """Describe REAL (non-sample) content found at the seeding target.

    Used to refuse seeding over a developer's production database: the
    sample keys are a fixed known set, so anything else counts as real data.

    Args:
        instance: Object exposing ``_get_db_connection()``; the returned
            connection must support ``execute(...).fetchall()`` and
            ``close()`` and contain a ``subtitles_fts`` table.
        language: Language code whose subtitle rows are inspected.
        media_path: Location of the media catalog JSON that seeding would
            overwrite.

    Returns:
        A human-readable description of the first problem found, or ``None``
        when the target is empty or holds sample-only content. An existing
        catalog that cannot be read, decoded, parsed, or that is not a JSON
        object/array is reported as a problem rather than raising, so the
        caller refuses instead of overwriting blindly.
    """
    conn = instance._get_db_connection()
    try:
        rows = conn.execute(
            "SELECT DISTINCT natural_key FROM subtitles_fts WHERE language = ?",
            (language,),
        ).fetchall()
    finally:
        conn.close()

    real_subs = [r[0] for r in rows if r[0] not in SAMPLE_KEYS]
    if real_subs:
        return f"{len(real_subs)} non-sample subtitle row(s) already indexed for language {language}"

    if not media_path.exists():
        return None

    unreadable = (
        f"an existing {media_path.name} that could not be parsed (refusing to overwrite blindly)"
    )
    try:
        existing = json.loads(media_path.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers json.JSONDecodeError and also UnicodeDecodeError,
        # which read_text() raises for a file that is not valid UTF-8.
        return unreadable

    # Only a JSON object (keyed by natural key) or array can be inspected
    # entry by entry; null / scalars are not a catalog shape we understand.
    if not isinstance(existing, (dict, list)):
        return unreadable

    # Array entries may be unhashable objects; guard the set lookup with a
    # type check so such entries count as non-sample instead of raising.
    real_media = [
        k for k in existing if not (isinstance(k, str) and k in SAMPLE_KEYS)
    ]
    if real_media:
        return f"{len(real_media)} non-sample entrie(s) in existing {media_path.name}"
    return None
def seed(*, data_root: Path, language: str, clear: bool, force: bool = False) -> dict:
    """Seed the sample content and return a summary dict.

    Seeding is meant for throwaway dev/CI/demo roots, never for a production
    database: unless ``force=True``, the call is refused when the target
    already holds real (non-sample) content.

    Args:
        data_root: Directory that receives the sample DB and JSON.
        language: Language code to seed.
        clear: Remove the sample rows before re-inserting them.
        force: Skip the real-data guard.

    Returns:
        The summary dict produced by ``_seed_inner``.

    Raises:
        RuntimeError: When the guard finds real data and ``force`` is false.
    """
    env_key = "SEARCH_UI_DATA_ROOT"

    # The runtime path helpers read this variable, so it has to be set before
    # anything that resolves DB/JSON locations gets imported. Remember the
    # old value so a caller in the same process (e.g. a test) does not
    # inherit ours.
    saved = os.environ.get(env_key)
    os.environ[env_key] = str(data_root)
    try:
        summary = _seed_inner(
            data_root=data_root, language=language, clear=clear, force=force
        )
    finally:
        if saved is not None:
            os.environ[env_key] = saved
        else:
            os.environ.pop(env_key, None)
    return summary
def _seed_inner(*, data_root: Path, language: str, clear: bool, force: bool) -> dict:
    """Do the actual seeding; ``seed`` wraps this with env-var handling.

    Steps, in order: run the real-data guard (unless ``force``), optionally
    remove the sample rows, (re)insert the sample subtitle text into
    ``subtitles_fts``, index scripture references from that text, and write
    the media catalog JSON.

    Args:
        data_root: Directory holding ``database.db`` and the ``json/`` tree.
        language: Language code to seed.
        clear: Remove the sample rows first.
        force: Skip the real-data guard.

    Returns:
        Summary dict with the data root, language, item/subtitle/scripture
        counts and the paths of the DB and catalog written.

    Raises:
        RuntimeError: When the guard reports real data and ``force`` is false.
    """
    import search  # noqa: E402 (imported after env setup, by design)

    search_db_path = os.path.join(str(data_root), "database.db")
    instance = search.SubtitleSearch(db_path=search_db_path)
    media_path = data_root / "json" / language / "all_media_items.json"

    real = None if force else _describe_real_data(instance, language, media_path)
    if real:
        raise RuntimeError(
            f"Refusing to seed: {data_root} already contains real data "
            f"({real}). Seeding would overwrite the media catalog and mix "
            f"fake videos into a real index.\n"
            f"Point --data-root at a throwaway directory (the default is "
            f"the gitignored ./sample-data), or pass --force if you really "
            f"mean to seed this location."
        )

    if clear:
        # Deliberately limited to the SAMPLE keys. A language-wide clear
        # would wipe a developer's whole real subtitle index when aimed at a
        # real data root; dropping only the sample rows still gives a clean
        # re-seed and leaves real content alone.
        for sample in SAMPLE_ITEMS:
            natural_key = sample["natural_key"]
            instance.remove_subtitle(natural_key, language)
            instance.remove_scripture_references_for_video(natural_key, language)
        print(f"Cleared {len(SAMPLE_ITEMS)} sample subtitle rows for language {language}.")

    # Subtitle text goes straight into the FTS table: delete-then-insert per
    # key so re-running never duplicates a row; one commit for the batch.
    conn = instance._get_db_connection()
    try:
        for sample in SAMPLE_ITEMS:
            row_key = (sample["natural_key"], language)
            conn.execute(
                "DELETE FROM subtitles_fts WHERE natural_key = ? AND language = ?",
                row_key,
            )
            conn.execute(
                "INSERT INTO subtitles_fts (natural_key, language, content) VALUES (?, ?, ?)",
                (*row_key, sample["subtitles"]),
            )
        conn.commit()
    finally:
        conn.close()
    # Reaching this point means every item was inserted.
    indexed_subtitles = len(SAMPLE_ITEMS)

    # Scripture references (model-free regex parse + index).
    scripture_refs = sum(
        instance.index_scripture_from_text(
            sample["natural_key"], language, sample["subtitles"]
        )
        for sample in SAMPLE_ITEMS
    )

    # Media metadata for title search + result decoration. (The guard above
    # has already ensured media_path holds no real catalog entries.)
    media = {}
    for sample in SAMPLE_ITEMS:
        media[sample["natural_key"]] = _build_media_record(sample)
    media_path.parent.mkdir(parents=True, exist_ok=True)
    media_path.write_text(json.dumps(media, indent=2), encoding="utf-8")

    summary = {
        "data_root": str(data_root),
        "language": language,
        "media_items": len(media),
        "subtitles_indexed": indexed_subtitles,
        "scripture_references": scripture_refs,
        "search_db": search_db_path,
        "media_json": str(media_path),
    }
    return summary
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: parse options, seed, print a summary.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` callers are
            unaffected; passing a list allows programmatic invocation.

    Returns:
        Process exit status: ``0`` after a successful seed, ``1`` when
        ``seed`` raised ``RuntimeError`` (e.g. the real-data guard refused).
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains a bullet list and an indented usage
        # block; the default formatter would re-wrap them into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--data-root",
        default=str(REPO_ROOT / "sample-data"),
        help=(
            "Where to write the sample DB + json. Defaults to the gitignored "
            "./sample-data so it never touches a real database. Pass an "
            "explicit path to target elsewhere."
        ),
    )
    parser.add_argument("--language", default="E", help="Language code (default: E).")
    parser.add_argument(
        "--clear",
        action="store_true",
        help="Remove the sample rows first (scoped to sample keys only).",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help=(
            "Seed even if the target data root already contains real "
            "(non-sample) content. Dangerous — only for deliberate use."
        ),
    )
    args = parser.parse_args(argv)

    data_root = Path(args.data_root).resolve()
    data_root.mkdir(parents=True, exist_ok=True)

    try:
        summary = seed(
            data_root=data_root,
            language=args.language,
            clear=args.clear,
            force=args.force,
        )
    except RuntimeError as exc:
        # The refusal message is already written for end users; show it on
        # stderr with a failing exit status instead of a raw traceback.
        print(f"error: {exc}", file=sys.stderr)
        return 1

    print("Seeded sample content:")
    for key, value in summary.items():
        print(f" {key:22} {value}")
    print(
        "\nKeyword, title, and scripture search now return results. "
        "Semantic/hybrid search still needs the embedding model."
    )
    return 0
if __name__ == "__main__":
    # main() returns the exit status; SystemExit hands it to the interpreter.
    raise SystemExit(main())