"""
Build SQLite database from TWL aligned corpus JSON.

Usage:
    python3 build_db.py 2026-03-27/TWL.2026-03-27.aligned.json twl_concordancer.db
    python3 build_db.py 2026-03-27/A0000001.aligned.json twl_concordancer.db --append
"""
|
|
| import sys |
| import json |
| import sqlite3 |
| import argparse |
| from pathlib import Path |
|
|
|
|
# Schema for the concordancer database.  Hierarchy: laws -> articles ->
# paragraphs -> sentences; each child row carries redundant ancestor ids
# (law_id, article_id) so queries can filter at any level without joins.
DDL = """
CREATE TABLE IF NOT EXISTS laws (
    id INTEGER PRIMARY KEY,
    law_id TEXT UNIQUE,
    type TEXT,
    zh_name TEXT,
    en_name TEXT,
    category TEXT,
    effective_date TEXT,
    modified_date TEXT
);

CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY,
    law_id INTEGER REFERENCES laws(id),
    article_no_zh TEXT,
    article_no_en TEXT,
    article_type TEXT,
    article_index INTEGER
);

CREATE TABLE IF NOT EXISTS paragraphs (
    id INTEGER PRIMARY KEY,
    article_id INTEGER REFERENCES articles(id),
    law_id INTEGER REFERENCES laws(id),
    paragraph_index INTEGER
);

CREATE TABLE IF NOT EXISTS sentences (
    id INTEGER PRIMARY KEY,
    paragraph_id INTEGER REFERENCES paragraphs(id),
    article_id INTEGER REFERENCES articles(id),
    law_id INTEGER REFERENCES laws(id),
    zh_text TEXT NOT NULL,
    en_text TEXT NOT NULL,
    alignment_score REAL,
    zh_sentence_idx INTEGER,
    en_sentence_idx INTEGER
);

CREATE INDEX IF NOT EXISTS idx_sentences_paragraph ON sentences(paragraph_id);
CREATE INDEX IF NOT EXISTS idx_sentences_article ON sentences(article_id);
CREATE INDEX IF NOT EXISTS idx_sentences_law ON sentences(law_id);
CREATE INDEX IF NOT EXISTS idx_paragraphs_article ON paragraphs(article_id);
CREATE INDEX IF NOT EXISTS idx_articles_law ON articles(law_id);
"""


# Placeholder for FTS-sync triggers; currently empty.  (build_db still drops
# a `sentence_fts` table on fresh builds, presumably left over from an FTS
# setup that lives elsewhere or was removed — TODO confirm.)
TRIGGERS = ""


def _get_or_create_law(cur, law_id, entry_type, zh_name, en_name, category, append):
    """Insert a law row and return its rowid.

    On an IntegrityError (UNIQUE law_id conflict) in append mode, update the
    existing row's names/category in place and return its id.  Outside append
    mode the IntegrityError propagates to the caller.
    """
    try:
        cur.execute(
            "INSERT INTO laws (law_id, type, zh_name, en_name, category) VALUES (?, ?, ?, ?, ?)",
            (law_id, entry_type, zh_name, en_name, category),
        )
        return cur.lastrowid
    except sqlite3.IntegrityError:
        if not append:
            raise
        cur.execute("SELECT id FROM laws WHERE law_id = ?", (law_id,))
        row = cur.fetchone()
        if row is None:
            # Conflict was not on law_id (unexpected with this schema);
            # retry the insert so the original error resurfaces if it persists.
            cur.execute(
                "INSERT INTO laws (law_id, type, zh_name, en_name, category) VALUES (?, ?, ?, ?, ?)",
                (law_id, entry_type, zh_name, en_name, category),
            )
            return cur.lastrowid
        cur.execute(
            "UPDATE laws SET zh_name = ?, en_name = ?, category = ? WHERE id = ?",
            (zh_name, en_name, category, row[0]),
        )
        return row[0]


def _first_index(sent, key, fallback):
    """Return the first element of sent[key] when present and non-empty, else fallback."""
    indices = sent.get(key)
    return indices[0] if indices else fallback


def _insert_articles(cur, law_db_id, articles):
    """Insert one law's articles, paragraphs, and aligned sentences.

    Sentence pairs where both zh and en are empty are skipped.
    Returns (articles_inserted, paragraphs_inserted, sentences_inserted).
    """
    n_articles = n_paragraphs = n_sentences = 0
    for art_idx, art in enumerate(articles):
        cur.execute(
            "INSERT INTO articles (law_id, article_no_zh, article_no_en, article_type, article_index) VALUES (?, ?, ?, ?, ?)",
            (
                law_db_id,
                art.get("article_no_zh", ""),
                art.get("article_no_en", ""),
                art.get("article_type", ""),
                art_idx,
            ),
        )
        article_db_id = cur.lastrowid
        n_articles += 1

        for para_idx, para in enumerate(art.get("paragraphs", [])):
            cur.execute(
                "INSERT INTO paragraphs (article_id, law_id, paragraph_index) VALUES (?, ?, ?)",
                (article_db_id, law_db_id, para_idx),
            )
            paragraph_db_id = cur.lastrowid
            n_paragraphs += 1

            for sent_idx, sent in enumerate(para.get("sentences", [])):
                zh_text = sent.get("zh", "")
                en_text = sent.get("en", "")
                if not (zh_text or en_text):
                    continue  # skip fully empty alignment pairs
                cur.execute(
                    "INSERT INTO sentences (paragraph_id, article_id, law_id, zh_text, en_text, alignment_score, zh_sentence_idx, en_sentence_idx) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                    (
                        paragraph_db_id,
                        article_db_id,
                        law_db_id,
                        zh_text,
                        en_text,
                        sent.get("score", 0.0),
                        # Fall back to the positional index when the aligner
                        # recorded no explicit source-sentence indices.
                        _first_index(sent, "zh_indices", sent_idx),
                        _first_index(sent, "en_indices", sent_idx),
                    ),
                )
                n_sentences += 1
    return n_articles, n_paragraphs, n_sentences


def build_db(input_file, db_file, append=False):
    """Build an SQLite concordancer database from a TWL aligned-corpus JSON.

    Parameters
    ----------
    input_file : str or Path
        Aligned corpus JSON with top-level "laws" and/or "orders" lists.
    db_file : str or Path
        SQLite database to create (or extend when append=True).
    append : bool, keyword
        When False (default) all tables are dropped and rebuilt.  When True
        existing data is kept and laws are upserted by law_id.

    The connection is always closed, even when loading or inserting raises.
    """
    conn = sqlite3.connect(db_file)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("PRAGMA foreign_keys=ON")
        cur = conn.cursor()

        if not append:
            # Fresh build: remove any previous schema (including a legacy
            # sentence_fts table) before recreating it.
            cur.executescript(
                "DROP TABLE IF EXISTS sentences; DROP TABLE IF EXISTS paragraphs; DROP TABLE IF EXISTS articles; DROP TABLE IF EXISTS laws; DROP TABLE IF EXISTS sentence_fts;"
            )

        cur.executescript(DDL)
        cur.executescript(TRIGGERS)

        with open(input_file, encoding="utf-8") as f:
            corpus = json.load(f)

        law_count = article_count = paragraph_count = sentence_count = 0

        for entry_type, key in [("law", "laws"), ("order", "orders")]:
            for item in corpus.get(key, []):
                law_db_id = _get_or_create_law(
                    cur,
                    item.get("law_id") or item.get("order_id", ""),
                    entry_type,
                    item.get("zh_name", ""),
                    item.get("en_name", ""),
                    item.get("category", ""),
                    append,
                )
                law_count += 1

                # NOTE(review): in append mode a re-imported law keeps its old
                # articles AND gains a fresh copy below — confirm whether the
                # existing children should be deleted first.
                a, p, s = _insert_articles(cur, law_db_id, item.get("articles", []))
                article_count += a
                paragraph_count += p
                sentence_count += s

                if law_count % 100 == 0:
                    # Periodic commit keeps the WAL from growing unboundedly
                    # on large corpora and gives progress feedback.
                    conn.commit()
                    print(
                        f" Processed {law_count} {entry_type}s, {sentence_count} sentences..."
                    )

        conn.commit()

        print(f"\nDatabase built: {db_file}")
        print(f" Laws/orders: {law_count}")
        print(f" Articles: {article_count}")
        print(f" Paragraphs: {paragraph_count}")
        print(f" Sentences: {sentence_count}")

        # Sanity check: re-count sentences straight from the table.
        cur.execute("SELECT count(*) FROM sentences")
        sent_count = cur.fetchone()[0]
        print(f" Sentence records: {sent_count}")
    finally:
        conn.close()
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: build (default) or extend (--append) the
    # concordancer database from one aligned-corpus JSON file.
    cli = argparse.ArgumentParser(
        description="Build SQLite concordancer DB from TWL aligned JSON"
    )
    cli.add_argument("input_file", help="Path to aligned corpus JSON")
    cli.add_argument("db_file", help="Output SQLite database path")
    cli.add_argument(
        "--append",
        action="store_true",
        help="Append to existing database instead of replacing",
    )
    opts = cli.parse_args()
    build_db(opts.input_file, opts.db_file, append=opts.append)
|
|