# TWL / concordancer.py — uploaded via huggingface_hub by rubentsui (revision 4fe179b, verified).
"""TWL Bilingual Concordancer — Streamlit App."""
import html
import re
from pathlib import Path

import regex
import streamlit as st

import db
st.set_page_config(page_title="TWL Concordancer", page_icon="⚖️", layout="wide")
DB_PATH = Path(__file__).parent / "twl_concordancer.db"
st.markdown(
"""
<style>
section[data-testid="stSidebar"] > div:first-child {
top: 0;
height: 100vh;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarContent"] {
padding-top: 0rem !important;
margin-top: 0rem !important;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarHeader"] {
min-height: 0rem !important;
height: 0.25rem !important;
padding-top: 0rem !important;
padding-bottom: 0rem !important;
margin-bottom: 0rem !important;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarUserContent"] {
padding-top: 0rem !important;
margin-top: 0rem !important;
}
div[data-testid="stMainBlockContainer"],
.main .block-container {
padding-top: 1.2rem;
}
.zh-text, .en-text {
line-height: 1.8;
padding: 6px 10px;
border-radius: 4px;
white-space: pre-wrap;
color: var(--text-color);
word-break: break-word;
}
.zh-text {
font-family: "Microsoft JhengHei", "Source Han Sans", "Noto Sans CJK TC Regular", "Hiragino Sans CNS", "LantingHei TC", "Source Han Serif", sans-serif;
font-size: 20px;
letter-spacing: 0.01em;
}
.en-text {
font-family: "Source Pro", Consolas, "LingWai TC", Menlo, "Courier New", Arial, sans-serif;
font-size: 15px;
}
.zh-text.match, .en-text.match {
background-color: color-mix(in srgb, var(--primary-color) 12%, var(--background-color));
border-left: 3px solid #f5c518;
color: var(--text-color);
}
mark {
background: #f5c518;
color: #111827;
padding: 1px 2px;
border-radius: 2px;
}
</style>
""",
unsafe_allow_html=True,
)
def _highlight(text, query, case_sensitive=False):
if not text or not query:
return html.escape(text)
escaped_text = html.escape(text)
escaped_query = html.escape(query)
return regex.sub(
rf"({regex.escape(escaped_query)})",
r'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">\1</mark>',
escaped_text,
flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1,
)
def _highlight_regex(text, pattern, case_sensitive=False):
if not text or not pattern:
return html.escape(text)
try:
compiled = regex.compile(
pattern, flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1
)
except regex.error:
return html.escape(text)
parts = []
last_end = 0
for match in compiled.finditer(text):
start, end = match.span()
if start == end:
continue
parts.append(html.escape(text[last_end:start]))
parts.append(
f'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">{html.escape(text[start:end])}</mark>'
)
last_end = end
parts.append(html.escape(text[last_end:]))
return "".join(parts)
def _join_sentences(sentences, lang):
parts = [
(s.get("zh_text", "") if lang == "zh" else s.get("en_text", "")).strip()
for s in sentences
]
parts = [p for p in parts if p]
if not parts:
return ""
if lang == "zh":
return "".join(parts)
return " ".join(parts)
if "page" not in st.session_state:
st.session_state.page = 0
if "expanded" not in st.session_state:
st.session_state.expanded = {}
if "search_signature" not in st.session_state:
st.session_state.search_signature = None
st.title("⚖️ 全國法規資料庫 華英檢索系統")
st.caption("Taiwan Law (TWL) Chinese–English Aligned Corpus - Bilingual Concordancer")
conn = db.get_conn(DB_PATH)
# Sidebar: scope filters applied to every search — law/order type, issuing
# agency, a single law, an alignment-score ceiling, and the search language.
with st.sidebar:
    st.header("搜尋範圍過濾 Filters")
    law_types = ["All", "law", "order"]
    selected_type = st.selectbox("法規/命令 Type", law_types, index=0)
    type_filter = None if selected_type == "All" else selected_type
    categories = db.list_categories(conn, type_filter)
    selected_cat = st.selectbox("機關 Category", ["All"] + categories, index=0)
    cat_filter = None if selected_cat == "All" else selected_cat
    laws = db.list_laws(conn, type_filter, cat_filter)
    # BUG FIX: the label previously omitted the " — " separator
    # (f"{l['law_id']}{l['zh_name']}"), so split(" — ") below returned the
    # whole concatenated label and the single-law filter never received a
    # valid law_id. The label format must match that split.
    law_options = ["All"] + [f"{l['law_id']} — {l['zh_name']}" for l in laws]
    selected_law = st.selectbox("單一法規/命令 Law/Order", law_options, index=0)
    law_id_filter = None
    if selected_law != "All":
        # Recover the law_id prefix from the "<law_id> — <zh_name>" label.
        law_id_filter = selected_law.split(" — ")[0]
    # Lower alignment scores are better; 1.0 means "no score filtering".
    max_score = st.slider("Max alignment score (lower = better)", 0.0, 1.0, 1.0, 0.05)
    max_score_filter = None if max_score >= 1.0 else max_score
    # Display label -> value passed to db.search_sentences(lang=...).
    lang_options = {
        "中英 / Both": "both",
        "中文 / Chinese": "zh",
        "英文 / English": "en",
    }
    selected_lang = st.radio("搜尋語言 / Search language", list(lang_options), index=0)
    lang_filter = lang_options[selected_lang]
    st.divider()
    st.caption(f"{len(laws)} laws/orders in database")
with st.form("search_form", clear_on_submit=False):
col1, col2, col3 = st.columns([4, 1, 1])
with col1:
query = st.text_input(
"Search", placeholder="Enter keyword or regex…", key="search_query"
)
with col2:
use_regex = st.checkbox("Regex", value=False)
case_sensitive = st.checkbox("Case sensitive", value=False)
submitted = st.form_submit_button("Submit", use_container_width=True)
with col3:
per_page = st.selectbox("Per page", [10, 25, 50, 100], index=1)
article_filter = None
if law_id_filter:
articles = db.get_law_articles(conn, law_id_filter)
art_options = ["All"] + [
f"{a['article_no_zh']} / {a['article_no_en']}"
for a in articles
if a["article_no_zh"] or a["article_no_en"]
]
selected_art = st.selectbox("Article", art_options, index=0)
if selected_art != "All":
parts = selected_art.split(" / ")
article_filter = parts[0] if parts else None
search_signature = (
query,
use_regex,
case_sensitive,
per_page,
cat_filter,
law_id_filter,
article_filter,
max_score_filter,
lang_filter,
)
if st.session_state.search_signature != search_signature:
st.session_state.page = 0
st.session_state.expanded = {}
st.session_state.search_signature = search_signature
if query:
results, total = db.search_sentences(
conn,
query,
use_regex=use_regex,
case_sensitive=case_sensitive,
law_id=law_id_filter,
category=cat_filter,
article_no=article_filter,
max_score=max_score_filter,
lang=lang_filter,
limit=per_page,
offset=st.session_state.page * per_page,
)
st.write(f"**{total}** sentence pair{'s' if total != 1 else ''} found")
if total > per_page:
total_pages = (total + per_page - 1) // per_page
cols = st.columns([1, 4, 1])
with cols[0]:
if st.button(
"← Previous",
disabled=st.session_state.page == 0,
use_container_width=True,
):
st.session_state.page -= 1
st.session_state.expanded = {}
st.rerun()
with cols[1]:
st.write(f"Page {st.session_state.page + 1} of {total_pages}")
with cols[2]:
if st.button(
"Next →",
disabled=(st.session_state.page + 1) * per_page >= total,
use_container_width=True,
):
st.session_state.page += 1
st.session_state.expanded = {}
st.rerun()
for row in results:
sid = row["id"]
score = row["alignment_score"]
law_ref = f"{row['law_id']} {row['zh_name']}"
art_ref = (
f"{row['article_no_zh']} / {row['article_no_en']}"
if row["article_no_zh"] or row["article_no_en"]
else ""
)
with st.container(border=True):
st.markdown(
f"`{law_ref}`{' | ' + art_ref if art_ref else ''} | Score: `{score:.4f}`"
)
zh_text = row["zh_text"] or ""
en_text = row["en_text"] or ""
if query and use_regex:
zh_display = _highlight_regex(zh_text, query, case_sensitive=case_sensitive)
en_display = _highlight_regex(en_text, query, case_sensitive=case_sensitive)
elif query and not use_regex:
zh_display = _highlight(zh_text, query, case_sensitive=case_sensitive)
en_display = _highlight(en_text, query, case_sensitive=case_sensitive)
else:
zh_display = html.escape(zh_text)
en_display = html.escape(en_text)
col_zh, col_en = st.columns([2, 3])
with col_zh:
st.markdown(
f'<div class="zh-text">{zh_display}</div>', unsafe_allow_html=True
)
with col_en:
st.markdown(
f'<div class="en-text">{en_display}</div>', unsafe_allow_html=True
)
exp_col1, exp_col2 = st.columns(2)
with exp_col1:
if st.button("▸ Paragraph", key=f"para_{sid}"):
st.session_state.expanded[
f"para_{sid}"
] = not st.session_state.expanded.get(f"para_{sid}", False)
with exp_col2:
if st.button("▸ Article", key=f"art_{sid}"):
st.session_state.expanded[
f"art_{sid}"
] = not st.session_state.expanded.get(f"art_{sid}", False)
if st.session_state.expanded.get(f"para_{sid}"):
para = db.get_paragraph(conn, sid)
if para:
with st.container(border=True):
st.markdown(
f"**Paragraph** ({para['article_no_zh']} / {para['article_no_en']})"
)
para_zh = _join_sentences(para["sentences"], "zh")
para_en = _join_sentences(para["sentences"], "en")
if query and use_regex:
para_zh_display = _highlight_regex(
para_zh, query, case_sensitive=case_sensitive
)
para_en_display = _highlight_regex(
para_en, query, case_sensitive=case_sensitive
)
elif query and not use_regex:
para_zh_display = _highlight(
para_zh, query, case_sensitive=case_sensitive
)
para_en_display = _highlight(
para_en, query, case_sensitive=case_sensitive
)
else:
para_zh_display = html.escape(para_zh)
para_en_display = html.escape(para_en)
c1, c2 = st.columns([2, 3])
with c1:
st.markdown(
f'<div class="zh-text match">{para_zh_display}</div>',
unsafe_allow_html=True,
)
with c2:
st.markdown(
f'<div class="en-text match">{para_en_display}</div>',
unsafe_allow_html=True,
)
if st.session_state.expanded.get(f"art_{sid}"):
article = db.get_article(conn, sid)
if article:
with st.container(border=True):
st.markdown(
f"**Article** ({article['article_no_zh']} / {article['article_no_en']})"
)
for pi, para in enumerate(article["paragraphs"]):
st.markdown(f"*Paragraph {pi + 1}*")
art_zh = _join_sentences(para["sentences"], "zh")
art_en = _join_sentences(para["sentences"], "en")
if query and use_regex:
art_zh_display = _highlight_regex(
art_zh, query, case_sensitive=case_sensitive
)
art_en_display = _highlight_regex(
art_en, query, case_sensitive=case_sensitive
)
elif query and not use_regex:
art_zh_display = _highlight(
art_zh, query, case_sensitive=case_sensitive
)
art_en_display = _highlight(
art_en, query, case_sensitive=case_sensitive
)
else:
art_zh_display = html.escape(art_zh)
art_en_display = html.escape(art_en)
contains_match = any(
s["id"] == sid for s in para["sentences"]
)
c1, c2 = st.columns([2, 3])
with c1:
st.markdown(
f'<div class="zh-text{" match" if contains_match else ""}">{art_zh_display}</div>',
unsafe_allow_html=True,
)
with c2:
st.markdown(
f'<div class="en-text{" match" if contains_match else ""}">{art_en_display}</div>',
unsafe_allow_html=True,
)
elif not query:
st.info("Enter a search term above to find aligned sentence pairs.")
st.divider()
st.caption("TWL Concordancer | Taiwan Law Bilingual Corpus")
conn.close()