# TWL / build_db.py
# Uploaded via huggingface_hub by rubentsui (commit d883c53, verified)
#!/usr/bin/env python3
"""
Build SQLite database from TWL aligned corpus JSON.
Usage:
python3 build_db.py 2026-03-27/TWL.2026-03-27.aligned.json twl_concordancer.db
python3 build_db.py 2026-03-27/A0000001.aligned.json twl_concordancer.db --append
"""
import sys
import json
import sqlite3
import argparse
from pathlib import Path
# Schema: laws -> articles -> paragraphs -> sentences, each child carrying
# its ancestors' ids so sentence queries can filter by law/article directly.
DDL = """
CREATE TABLE IF NOT EXISTS laws (
    id INTEGER PRIMARY KEY,
    law_id TEXT UNIQUE,
    type TEXT,
    zh_name TEXT,
    en_name TEXT,
    category TEXT,
    effective_date TEXT,
    modified_date TEXT
);
CREATE TABLE IF NOT EXISTS articles (
    id INTEGER PRIMARY KEY,
    law_id INTEGER REFERENCES laws(id),
    article_no_zh TEXT,
    article_no_en TEXT,
    article_type TEXT,
    article_index INTEGER
);
CREATE TABLE IF NOT EXISTS paragraphs (
    id INTEGER PRIMARY KEY,
    article_id INTEGER REFERENCES articles(id),
    law_id INTEGER REFERENCES laws(id),
    paragraph_index INTEGER
);
CREATE TABLE IF NOT EXISTS sentences (
    id INTEGER PRIMARY KEY,
    paragraph_id INTEGER REFERENCES paragraphs(id),
    article_id INTEGER REFERENCES articles(id),
    law_id INTEGER REFERENCES laws(id),
    zh_text TEXT NOT NULL,
    en_text TEXT NOT NULL,
    alignment_score REAL,
    zh_sentence_idx INTEGER,
    en_sentence_idx INTEGER
);
CREATE INDEX IF NOT EXISTS idx_sentences_paragraph ON sentences(paragraph_id);
CREATE INDEX IF NOT EXISTS idx_sentences_article ON sentences(article_id);
CREATE INDEX IF NOT EXISTS idx_sentences_law ON sentences(law_id);
CREATE INDEX IF NOT EXISTS idx_paragraphs_article ON paragraphs(article_id);
CREATE INDEX IF NOT EXISTS idx_articles_law ON articles(law_id);
"""

# No triggers at the moment; kept as a separate script so FTS triggers can be
# added later without touching the table DDL.
TRIGGERS = ""


def _upsert_law(cur, item, entry_type, append):
    """Insert one law/order row; in append mode update the existing row on conflict.

    Returns the rowid of the inserted or updated ``laws`` row. Re-raises
    sqlite3.IntegrityError when not appending (duplicate law_id is then a
    caller error).
    """
    law_id = item.get("law_id") or item.get("order_id", "")
    zh_name = item.get("zh_name", "")
    en_name = item.get("en_name", "")
    category = item.get("category", "")
    try:
        cur.execute(
            "INSERT INTO laws (law_id, type, zh_name, en_name, category) VALUES (?, ?, ?, ?, ?)",
            (law_id, entry_type, zh_name, en_name, category),
        )
        return cur.lastrowid
    except sqlite3.IntegrityError:
        if not append:
            raise
        cur.execute("SELECT id FROM laws WHERE law_id = ?", (law_id,))
        row = cur.fetchone()
        if row:
            # Refresh the metadata of the already-stored law.
            cur.execute(
                "UPDATE laws SET zh_name = ?, en_name = ?, category = ? WHERE id = ?",
                (zh_name, en_name, category, row[0]),
            )
            return row[0]
        # Conflict was not on law_id; retry the insert (propagates if it fails again).
        cur.execute(
            "INSERT INTO laws (law_id, type, zh_name, en_name, category) VALUES (?, ?, ?, ?, ?)",
            (law_id, entry_type, zh_name, en_name, category),
        )
        return cur.lastrowid


def _insert_sentences(cur, aligned_sentences, paragraph_db_id, article_db_id, law_db_id):
    """Insert the aligned sentence pairs of one paragraph; return how many were stored.

    Pairs where both sides are empty are skipped. When the JSON carries
    zh_indices/en_indices the first index is stored, otherwise the pair's
    position in the list is used.
    """
    inserted = 0
    for sent_idx, sent in enumerate(aligned_sentences):
        zh_text = sent.get("zh", "")
        en_text = sent.get("en", "")
        if not (zh_text or en_text):
            continue
        zh_sidx = sent["zh_indices"][0] if sent.get("zh_indices") else sent_idx
        en_sidx = sent["en_indices"][0] if sent.get("en_indices") else sent_idx
        cur.execute(
            "INSERT INTO sentences (paragraph_id, article_id, law_id, zh_text, en_text, alignment_score, zh_sentence_idx, en_sentence_idx) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
            (
                paragraph_db_id,
                article_db_id,
                law_db_id,
                zh_text,
                en_text,
                sent.get("score", 0.0),
                zh_sidx,
                en_sidx,
            ),
        )
        inserted += 1
    return inserted


def build_db(input_file, db_file, append=False):
    """Build (or append to) the SQLite concordancer DB from an aligned JSON corpus.

    Args:
        input_file: Path to the aligned corpus JSON. Top-level keys "laws"
            and/or "orders" each hold a list of entries with nested
            articles -> paragraphs -> sentences.
        db_file: Path of the SQLite database file to create or extend.
        append: When True, keep existing data and upsert laws by law_id;
            when False, drop and recreate all tables first.
    """
    conn = sqlite3.connect(db_file)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA synchronous=NORMAL")
        conn.execute("PRAGMA foreign_keys=ON")
        cur = conn.cursor()
        if not append:
            # Full rebuild: drop children before parents so FK references stay valid.
            cur.executescript(
                "DROP TABLE IF EXISTS sentences; DROP TABLE IF EXISTS paragraphs; DROP TABLE IF EXISTS articles; DROP TABLE IF EXISTS laws; DROP TABLE IF EXISTS sentence_fts;"
            )
        cur.executescript(DDL)
        cur.executescript(TRIGGERS)
        with open(input_file, encoding="utf-8") as f:
            corpus = json.load(f)
        law_count = 0
        article_count = 0
        paragraph_count = 0
        sentence_count = 0
        for entry_type, key in [("law", "laws"), ("order", "orders")]:
            for item in corpus.get(key, []):
                law_db_id = _upsert_law(cur, item, entry_type, append)
                law_count += 1
                for art_idx, art in enumerate(item.get("articles", [])):
                    cur.execute(
                        "INSERT INTO articles (law_id, article_no_zh, article_no_en, article_type, article_index) VALUES (?, ?, ?, ?, ?)",
                        (
                            law_db_id,
                            art.get("article_no_zh", ""),
                            art.get("article_no_en", ""),
                            art.get("article_type", ""),
                            art_idx,
                        ),
                    )
                    article_db_id = cur.lastrowid
                    article_count += 1
                    for para_idx, para in enumerate(art.get("paragraphs", [])):
                        cur.execute(
                            "INSERT INTO paragraphs (article_id, law_id, paragraph_index) VALUES (?, ?, ?)",
                            (article_db_id, law_db_id, para_idx),
                        )
                        paragraph_db_id = cur.lastrowid
                        paragraph_count += 1
                        sentence_count += _insert_sentences(
                            cur,
                            para.get("sentences", []),
                            paragraph_db_id,
                            article_db_id,
                            law_db_id,
                        )
                # Periodic commit keeps the WAL small on large corpora.
                if law_count % 100 == 0:
                    conn.commit()
                    print(
                        f" Processed {law_count} {entry_type}s, {sentence_count} sentences..."
                    )
        conn.commit()
        print(f"\nDatabase built: {db_file}")
        print(f" Laws/orders: {law_count}")
        print(f" Articles: {article_count}")
        print(f" Paragraphs: {paragraph_count}")
        print(f" Sentences: {sentence_count}")
        cur.execute("SELECT count(*) FROM sentences")
        sent_count = cur.fetchone()[0]
        print(f" Sentence records: {sent_count}")
    finally:
        # Always release the connection, even if parsing/inserting failed midway.
        conn.close()
if __name__ == "__main__":
    # Command-line entry point: parse arguments, then delegate to build_db().
    arg_parser = argparse.ArgumentParser(
        description="Build SQLite concordancer DB from TWL aligned JSON"
    )
    arg_parser.add_argument("input_file", help="Path to aligned corpus JSON")
    arg_parser.add_argument("db_file", help="Output SQLite database path")
    arg_parser.add_argument(
        "--append",
        action="store_true",
        help="Append to existing database instead of replacing",
    )
    opts = arg_parser.parse_args()
    build_db(opts.input_file, opts.db_file, append=opts.append)