# TWL / concordancer.py — uploaded via huggingface_hub by rubentsui (revision 4fe179b, verified).
"""TWL Bilingual Concordancer — Streamlit App."""
import html
import re
from pathlib import Path

import regex
import streamlit as st

import db
st.set_page_config(page_title="TWL Concordancer", page_icon="⚖️", layout="wide")
DB_PATH = Path(__file__).parent / "twl_concordancer.db"
st.markdown(
"""
<style>
section[data-testid="stSidebar"] > div:first-child {
top: 0;
height: 100vh;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarContent"] {
padding-top: 0rem !important;
margin-top: 0rem !important;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarHeader"] {
min-height: 0rem !important;
height: 0.25rem !important;
padding-top: 0rem !important;
padding-bottom: 0rem !important;
margin-bottom: 0rem !important;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarUserContent"] {
padding-top: 0rem !important;
margin-top: 0rem !important;
}
div[data-testid="stMainBlockContainer"],
.main .block-container {
padding-top: 1.2rem;
}
.zh-text, .en-text {
line-height: 1.8;
padding: 6px 10px;
border-radius: 4px;
white-space: pre-wrap;
color: var(--text-color);
word-break: break-word;
}
.zh-text {
font-family: "Microsoft JhengHei", "Source Han Sans", "Noto Sans CJK TC Regular", "Hiragino Sans CNS", "LantingHei TC", "Source Han Serif", sans-serif;
font-size: 20px;
letter-spacing: 0.01em;
}
.en-text {
font-family: "Source Pro", Consolas, "LingWai TC", Menlo, "Courier New", Arial, sans-serif;
font-size: 15px;
}
.zh-text.match, .en-text.match {
background-color: color-mix(in srgb, var(--primary-color) 12%, var(--background-color));
border-left: 3px solid #f5c518;
color: var(--text-color);
}
mark {
background: #f5c518;
color: #111827;
padding: 1px 2px;
border-radius: 2px;
}
</style>
""",
unsafe_allow_html=True,
)
def _highlight(text, query, case_sensitive=False):
if not text or not query:
return html.escape(text)
escaped_text = html.escape(text)
escaped_query = html.escape(query)
return regex.sub(
rf"({regex.escape(escaped_query)})",
r'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">\1</mark>',
escaped_text,
flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1,
)
def _highlight_regex(text, pattern, case_sensitive=False):
if not text or not pattern:
return html.escape(text)
try:
compiled = regex.compile(
pattern, flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1
)
except regex.error:
return html.escape(text)
parts = []
last_end = 0
for match in compiled.finditer(text):
start, end = match.span()
if start == end:
continue
parts.append(html.escape(text[last_end:start]))
parts.append(
f'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">{html.escape(text[start:end])}</mark>'
)
last_end = end
parts.append(html.escape(text[last_end:]))
return "".join(parts)
def _join_sentences(sentences, lang):
parts = [
(s.get("zh_text", "") if lang == "zh" else s.get("en_text", "")).strip()
for s in sentences
]
parts = [p for p in parts if p]
if not parts:
return ""
if lang == "zh":
return "".join(parts)
return " ".join(parts)
if "page" not in st.session_state:
st.session_state.page = 0
if "expanded" not in st.session_state:
st.session_state.expanded = {}
if "search_signature" not in st.session_state:
st.session_state.search_signature = None
st.title("⚖️ 全國法規資料庫 華英檢索系統")
st.caption("Taiwan Law (TWL) Chinese–English Aligned Corpus - Bilingual Concordancer")
conn = db.get_conn(DB_PATH)
# Sidebar: scope filters applied to every search — law/order type, issuing
# agency, a single law, an alignment-score ceiling, and the search language.
with st.sidebar:
    st.header("搜尋範圍過濾 Filters")
    law_types = ["All", "law", "order"]
    selected_type = st.selectbox("法規/命令 Type", law_types, index=0)
    type_filter = None if selected_type == "All" else selected_type
    categories = db.list_categories(conn, type_filter)
    selected_cat = st.selectbox("機關 Category", ["All"] + categories, index=0)
    cat_filter = None if selected_cat == "All" else selected_cat
    laws = db.list_laws(conn, type_filter, cat_filter)
    # BUG FIX: the label previously omitted the " — " separator
    # (f"{l['law_id']}{l['zh_name']}"), so split(" — ") below returned the
    # whole concatenated label and the single-law filter never received a
    # valid law_id. The label format must match that split.
    law_options = ["All"] + [f"{l['law_id']} — {l['zh_name']}" for l in laws]
    selected_law = st.selectbox("單一法規/命令 Law/Order", law_options, index=0)
    law_id_filter = None
    if selected_law != "All":
        # Recover the law_id prefix from the "<law_id> — <zh_name>" label.
        law_id_filter = selected_law.split(" — ")[0]
    # Lower alignment scores are better; 1.0 means "no score filtering".
    max_score = st.slider("Max alignment score (lower = better)", 0.0, 1.0, 1.0, 0.05)
    max_score_filter = None if max_score >= 1.0 else max_score
    # Display label -> value passed to db.search_sentences(lang=...).
    lang_options = {
        "中英 / Both": "both",
        "中文 / Chinese": "zh",
        "英文 / English": "en",
    }
    selected_lang = st.radio("搜尋語言 / Search language", list(lang_options), index=0)
    lang_filter = lang_options[selected_lang]
    st.divider()
    st.caption(f"{len(laws)} laws/orders in database")
with st.form("search_form", clear_on_submit=False):
col1, col2, col3 = st.columns([4, 1, 1])
with col1:
query = st.text_input(
"Search", placeholder="Enter keyword or regex…", key="search_query"
)
with col2:
use_regex = st.checkbox("Regex", value=False)
case_sensitive = st.checkbox("Case sensitive", value=False)
submitted = st.form_submit_button("Submit", use_container_width=True)
with col3:
per_page = st.selectbox("Per page", [10, 25, 50, 100], index=1)
article_filter = None
if law_id_filter:
articles = db.get_law_articles(conn, law_id_filter)
art_options = ["All"] + [
f"{a['article_no_zh']} / {a['article_no_en']}"
for a in articles
if a["article_no_zh"] or a["article_no_en"]
]
selected_art = st.selectbox("Article", art_options, index=0)
if selected_art != "All":
parts = selected_art.split(" / ")
article_filter = parts[0] if parts else None
search_signature = (
query,
use_regex,
case_sensitive,
per_page,
cat_filter,
law_id_filter,
article_filter,
max_score_filter,
lang_filter,
)
if st.session_state.search_signature != search_signature:
st.session_state.page = 0
st.session_state.expanded = {}
st.session_state.search_signature = search_signature
if query:
results, total = db.search_sentences(
conn,
query,
use_regex=use_regex,
case_sensitive=case_sensitive,
law_id=law_id_filter,
category=cat_filter,
article_no=article_filter,
max_score=max_score_filter,
lang=lang_filter,
limit=per_page,
offset=st.session_state.page * per_page,
)
st.write(f"**{total}** sentence pair{'s' if total != 1 else ''} found")
if total > per_page:
total_pages = (total + per_page - 1) // per_page
cols = st.columns([1, 4, 1])
with cols[0]:
if st.button(
"← Previous",
disabled=st.session_state.page == 0,
use_container_width=True,
):
st.session_state.page -= 1
st.session_state.expanded = {}
st.rerun()
with cols[1]:
st.write(f"Page {st.session_state.page + 1} of {total_pages}")
with cols[2]:
if st.button(
"Next →",
disabled=(st.session_state.page + 1) * per_page >= total,
use_container_width=True,
):
st.session_state.page += 1
st.session_state.expanded = {}
st.rerun()
for row in results:
sid = row["id"]
score = row["alignment_score"]
law_ref = f"{row['law_id']} {row['zh_name']}"
art_ref = (
f"{row['article_no_zh']} / {row['article_no_en']}"
if row["article_no_zh"] or row["article_no_en"]
else ""
)
with st.container(border=True):
st.markdown(
f"`{law_ref}`{' | ' + art_ref if art_ref else ''} | Score: `{score:.4f}`"
)
zh_text = row["zh_text"] or ""
en_text = row["en_text"] or ""
if query and use_regex:
zh_display = _highlight_regex(zh_text, query, case_sensitive=case_sensitive)
en_display = _highlight_regex(en_text, query, case_sensitive=case_sensitive)
elif query and not use_regex:
zh_display = _highlight(zh_text, query, case_sensitive=case_sensitive)
en_display = _highlight(en_text, query, case_sensitive=case_sensitive)
else:
zh_display = html.escape(zh_text)
en_display = html.escape(en_text)
col_zh, col_en = st.columns([2, 3])
with col_zh:
st.markdown(
f'<div class="zh-text">{zh_display}</div>', unsafe_allow_html=True
)
with col_en:
st.markdown(
f'<div class="en-text">{en_display}</div>', unsafe_allow_html=True
)
exp_col1, exp_col2 = st.columns(2)
with exp_col1:
if st.button("▸ Paragraph", key=f"para_{sid}"):
st.session_state.expanded[
f"para_{sid}"
] = not st.session_state.expanded.get(f"para_{sid}", False)
with exp_col2:
if st.button("▸ Article", key=f"art_{sid}"):
st.session_state.expanded[
f"art_{sid}"
] = not st.session_state.expanded.get(f"art_{sid}", False)
if st.session_state.expanded.get(f"para_{sid}"):
para = db.get_paragraph(conn, sid)
if para:
with st.container(border=True):
st.markdown(
f"**Paragraph** ({para['article_no_zh']} / {para['article_no_en']})"
)
para_zh = _join_sentences(para["sentences"], "zh")
para_en = _join_sentences(para["sentences"], "en")
if query and use_regex:
para_zh_display = _highlight_regex(
para_zh, query, case_sensitive=case_sensitive
)
para_en_display = _highlight_regex(
para_en, query, case_sensitive=case_sensitive
)
elif query and not use_regex:
para_zh_display = _highlight(
para_zh, query, case_sensitive=case_sensitive
)
para_en_display = _highlight(
para_en, query, case_sensitive=case_sensitive
)
else:
para_zh_display = html.escape(para_zh)
para_en_display = html.escape(para_en)
c1, c2 = st.columns([2, 3])
with c1:
st.markdown(
f'<div class="zh-text match">{para_zh_display}</div>',
unsafe_allow_html=True,
)
with c2:
st.markdown(
f'<div class="en-text match">{para_en_display}</div>',
unsafe_allow_html=True,
)
if st.session_state.expanded.get(f"art_{sid}"):
article = db.get_article(conn, sid)
if article:
with st.container(border=True):
st.markdown(
f"**Article** ({article['article_no_zh']} / {article['article_no_en']})"
)
for pi, para in enumerate(article["paragraphs"]):
st.markdown(f"*Paragraph {pi + 1}*")
art_zh = _join_sentences(para["sentences"], "zh")
art_en = _join_sentences(para["sentences"], "en")
if query and use_regex:
art_zh_display = _highlight_regex(
art_zh, query, case_sensitive=case_sensitive
)
art_en_display = _highlight_regex(
art_en, query, case_sensitive=case_sensitive
)
elif query and not use_regex:
art_zh_display = _highlight(
art_zh, query, case_sensitive=case_sensitive
)
art_en_display = _highlight(
art_en, query, case_sensitive=case_sensitive
)
else:
art_zh_display = html.escape(art_zh)
art_en_display = html.escape(art_en)
contains_match = any(
s["id"] == sid for s in para["sentences"]
)
c1, c2 = st.columns([2, 3])
with c1:
st.markdown(
f'<div class="zh-text{" match" if contains_match else ""}">{art_zh_display}</div>',
unsafe_allow_html=True,
)
with c2:
st.markdown(
f'<div class="en-text{" match" if contains_match else ""}">{art_en_display}</div>',
unsafe_allow_html=True,
)
elif not query:
st.info("Enter a search term above to find aligned sentence pairs.")
st.divider()
st.caption("TWL Concordancer | Taiwan Law Bilingual Corpus")
conn.close()