# TWL / build_db.py
# Uploaded via huggingface_hub by rubentsui (commit d883c53, verified)
#!/usr/bin/env python3
"""
Build SQLite database from TWL aligned corpus JSON.
Usage:
python3 build_db.py 2026-03-27/TWL.2026-03-27.aligned.json twl_concordancer.db
python3 build_db.py 2026-03-27/A0000001.aligned.json twl_concordancer.db --append
"""
import sys
import json
import sqlite3
import argparse
from pathlib import Path
# Schema: laws -> articles -> paragraphs -> sentences, each child carrying
# its ancestors' ids so sentence queries can filter by law/article directly.
DDL = """
CREATE TABLE IF NOT EXISTS laws (
    id INTEGER PRIMARY KEY,
    law_id TEXT UNIQUE,
    type TEXT,
    zh_name TEXT,
    en_name TEXT,
    category TEXT,
    effective_date TEXT,
    modified_date TEXT
);
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY,
    law_id INTEGER REFERENCES laws(id),
    article_no_zh TEXT,
    article_no_en TEXT,
    article_type TEXT,
    article_index INTEGER
);
CREATE TABLE IF NOT EXISTS paragraphs (
    id INTEGER PRIMARY KEY,
    article_id INTEGER REFERENCES articles(id),
    law_id INTEGER REFERENCES laws(id),
    paragraph_index INTEGER
);
CREATE TABLE IF NOT EXISTS sentences (
    id INTEGER PRIMARY KEY,
    paragraph_id INTEGER REFERENCES paragraphs(id),
    article_id INTEGER REFERENCES articles(id),
    law_id INTEGER REFERENCES laws(id),
    zh_text TEXT NOT NULL,
    en_text TEXT NOT NULL,
    alignment_score REAL,
    zh_sentence_idx INTEGER,
    en_sentence_idx INTEGER
);
CREATE INDEX IF NOT EXISTS idx_sentences_paragraph ON sentences(paragraph_id);
CREATE INDEX IF NOT EXISTS idx_sentences_article ON sentences(article_id);
CREATE INDEX IF NOT EXISTS idx_sentences_law ON sentences(law_id);
CREATE INDEX IF NOT EXISTS idx_paragraphs_article ON paragraphs(article_id);
CREATE INDEX IF NOT EXISTS idx_articles_law ON articles(law_id);
"""

# No triggers at the moment; kept as a separate script so FTS triggers can be
# added later without touching the table DDL.
TRIGGERS = ""


def _upsert_law(cur, item, entry_type, append):
    """Insert one law/order row; in append mode update the existing row on conflict.

    Returns the rowid of the inserted or updated ``laws`` row. Re-raises
    sqlite3.IntegrityError when not appending (duplicate law_id is then a
    caller error).
    """
    law_id = item.get("law_id") or item.get("order_id", "")
    zh_name = item.get("zh_name", "")
    en_name = item.get("en_name", "")
    category = item.get("category", "")
    try:
        cur.execute(
            "INSERT INTO laws (law_id, type, zh_name, en_name, category) VALUES (?, ?, ?, ?, ?)",
            (law_id, entry_type, zh_name, en_name, category),
        )
        return cur.lastrowid
    except sqlite3.IntegrityError:
        if not append:
            raise
        cur.execute("SELECT id FROM laws WHERE law_id = ?", (law_id,))
        row = cur.fetchone()
        if row:
            # Refresh the metadata of the already-stored law.
            cur.execute(
                "UPDATE laws SET zh_name = ?, en_name = ?, category = ? WHERE id = ?",
                (zh_name, en_name, category, row[0]),
            )
            return row[0]
        # Conflict was not on law_id; retry the insert (propagates if it fails again).
        cur.execute(
            "INSERT INTO laws (law_id, type, zh_name, en_name, category) VALUES (?, ?, ?, ?, ?)",
            (law_id, entry_type, zh_name, en_name, category),
        )
        return cur.lastrowid


def _insert_sentences(cur, aligned_sentences, paragraph_db_id, article_db_id, law_db_id):
    """Insert the aligned sentence pairs of one paragraph; return how many were stored.

    Pairs where both sides are empty are skipped. When the JSON carries
    zh_indices/en_indices the first index is stored, otherwise the pair's
    position in the list is used.
    """
    inserted = 0
    for sent_idx, sent in enumerate(aligned_sentences):
        zh_text = sent.get("zh", "")
        en_text = sent.get("en", "")
        if not (zh_text or en_text):
            continue
        zh_sidx = sent["zh_indices"][0] if sent.get("zh_indices") else sent_idx
        en_sidx = sent["en_indices"][0] if sent.get("en_indices") else sent_idx
        cur.execute(
            "INSERT INTO sentences (paragraph_id, article_id, law_id, zh_text, en_text, alignment_score, zh_sentence_idx, en_sentence_idx) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (
                paragraph_db_id,
                article_db_id,
                law_db_id,
                zh_text,
                en_text,
                sent.get("score", 0.0),
                zh_sidx,
                en_sidx,
            ),
        )
        inserted += 1
    return inserted


def build_db(input_file, db_file, append=False):
    """Build (or append to) the SQLite concordancer DB from an aligned JSON corpus.

    Args:
        input_file: Path to the aligned corpus JSON. Top-level keys "laws"
            and/or "orders" each hold a list of entries with nested
            articles -> paragraphs -> sentences.
        db_file: Path of the SQLite database file to create or extend.
        append: When True, keep existing data and upsert laws by law_id;
            when False, drop and recreate all tables first.
    """
    conn = sqlite3.connect(db_file)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("PRAGMA foreign_keys=ON")
        cur = conn.cursor()
        if not append:
            # Full rebuild: drop children before parents so FK references stay valid.
            cur.executescript(
                "DROP TABLE IF EXISTS sentences; DROP TABLE IF EXISTS paragraphs; DROP TABLE IF EXISTS articles; DROP TABLE IF EXISTS laws; DROP TABLE IF EXISTS sentence_fts;"
            )
        cur.executescript(DDL)
        cur.executescript(TRIGGERS)
        with open(input_file, encoding="utf-8") as f:
            corpus = json.load(f)
        law_count = 0
        article_count = 0
        paragraph_count = 0
        sentence_count = 0
        for entry_type, key in [("law", "laws"), ("order", "orders")]:
            for item in corpus.get(key, []):
                law_db_id = _upsert_law(cur, item, entry_type, append)
                law_count += 1
                for art_idx, art in enumerate(item.get("articles", [])):
                    cur.execute(
                        "INSERT INTO articles (law_id, article_no_zh, article_no_en, article_type, article_index) VALUES (?, ?, ?, ?, ?)",
                        (
                            law_db_id,
                            art.get("article_no_zh", ""),
                            art.get("article_no_en", ""),
                            art.get("article_type", ""),
                            art_idx,
                        ),
                    )
                    article_db_id = cur.lastrowid
                    article_count += 1
                    for para_idx, para in enumerate(art.get("paragraphs", [])):
                        cur.execute(
                            "INSERT INTO paragraphs (article_id, law_id, paragraph_index) VALUES (?, ?, ?)",
                            (article_db_id, law_db_id, para_idx),
                        )
                        paragraph_db_id = cur.lastrowid
                        paragraph_count += 1
                        sentence_count += _insert_sentences(
                            cur,
                            para.get("sentences", []),
                            paragraph_db_id,
                            article_db_id,
                            law_db_id,
                        )
                # Periodic commit keeps the WAL small on large corpora.
                if law_count % 100 == 0:
                    conn.commit()
                    print(
                        f" Processed {law_count} {entry_type}s, {sentence_count} sentences..."
                    )
        conn.commit()
        print(f"\nDatabase built: {db_file}")
        print(f" Laws/orders: {law_count}")
        print(f" Articles: {article_count}")
        print(f" Paragraphs: {paragraph_count}")
        print(f" Sentences: {sentence_count}")
        cur.execute("SELECT count(*) FROM sentences")
        sent_count = cur.fetchone()[0]
        print(f" Sentence records: {sent_count}")
    finally:
        # Always release the connection, even if parsing/inserting failed midway.
        conn.close()
if __name__ == "__main__":
    # Command-line entry point: parse arguments, then delegate to build_db().
    arg_parser = argparse.ArgumentParser(
        description="Build SQLite concordancer DB from TWL aligned JSON"
    )
    arg_parser.add_argument("input_file", help="Path to aligned corpus JSON")
    arg_parser.add_argument("db_file", help="Output SQLite database path")
    arg_parser.add_argument(
        "--append",
        action="store_true",
        help="Append to existing database instead of replacing",
    )
    opts = arg_parser.parse_args()
    build_db(opts.input_file, opts.db_file, append=opts.append)