| """TWL Bilingual Concordancer — Streamlit App.""" |
|
|
import html
import re
from pathlib import Path


import regex
import streamlit as st


import db
|
|
# Streamlit requires set_page_config to be the first st.* call of the script.
st.set_page_config(page_title="TWL Concordancer", page_icon="⚖️", layout="wide")


# SQLite corpus database expected to sit next to this script.
DB_PATH = Path(__file__).parent / "twl_concordancer.db"
|
|
# Inject global CSS overrides:
#  * collapse Streamlit's default sidebar/main padding,
#  * style the bilingual text panes (.zh-text / .en-text) and their
#    ".match" variant used to flag the hit sentence in expanded context,
#  * colour <mark> (keyword hits) yellow.
# NOTE(review): the selectors target Streamlit-internal data-testid
# attributes, which may change between Streamlit releases — verify after
# upgrading Streamlit.
st.markdown(
    """
<style>
section[data-testid="stSidebar"] > div:first-child {
top: 0;
height: 100vh;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarContent"] {
padding-top: 0rem !important;
margin-top: 0rem !important;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarHeader"] {
min-height: 0rem !important;
height: 0.25rem !important;
padding-top: 0rem !important;
padding-bottom: 0rem !important;
margin-bottom: 0rem !important;
}
section[data-testid="stSidebar"] div[data-testid="stSidebarUserContent"] {
padding-top: 0rem !important;
margin-top: 0rem !important;
}
div[data-testid="stMainBlockContainer"],
.main .block-container {
padding-top: 1.2rem;
}
.zh-text, .en-text {
line-height: 1.8;
padding: 6px 10px;
border-radius: 4px;
white-space: pre-wrap;
color: var(--text-color);
word-break: break-word;
}
.zh-text {
font-family: "Microsoft JhengHei", "Source Han Sans", "Noto Sans CJK TC Regular", "Hiragino Sans CNS", "LantingHei TC", "Source Han Serif", sans-serif;
font-size: 20px;
letter-spacing: 0.01em;
}
.en-text {
font-family: "Source Pro", Consolas, "LingWai TC", Menlo, "Courier New", Arial, sans-serif;
font-size: 15px;
}
.zh-text.match, .en-text.match {
background-color: color-mix(in srgb, var(--primary-color) 12%, var(--background-color));
border-left: 3px solid #f5c518;
color: var(--text-color);
}
mark {
background: #f5c518;
color: #111827;
padding: 1px 2px;
border-radius: 2px;
}
</style>
""",
    unsafe_allow_html=True,
)
|
|
|
|
| def _highlight(text, query, case_sensitive=False): |
| if not text or not query: |
| return html.escape(text) |
| escaped_text = html.escape(text) |
| escaped_query = html.escape(query) |
| return regex.sub( |
| rf"({regex.escape(escaped_query)})", |
| r'<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">\1</mark>', |
| escaped_text, |
| flags=regex.V1 if case_sensitive else regex.IGNORECASE | regex.V1, |
| ) |
|
|
|
|
def _highlight_regex(text, pattern, case_sensitive=False):
    """HTML-escape *text* and wrap every regex match of *pattern* in <mark>.

    Matches are found in the raw text; the pieces between and inside
    matches are escaped individually so the output stays valid HTML.
    Falls back to plain escaping when *text*/*pattern* is empty or the
    pattern fails to compile.  Zero-width matches are ignored.
    """
    if not text or not pattern:
        return html.escape(text)
    flags = regex.V1
    if not case_sensitive:
        flags |= regex.IGNORECASE
    try:
        matcher = regex.compile(pattern, flags=flags)
    except regex.error:
        # Invalid user pattern: render the text without any highlighting.
        return html.escape(text)

    pieces = []
    cursor = 0
    for hit in matcher.finditer(text):
        begin, finish = hit.span()
        if begin == finish:
            # Skip zero-width matches — nothing visible to highlight.
            continue
        pieces.append(html.escape(text[cursor:begin]))
        pieces.append(
            '<mark style="background:#fef08a;padding:1px 2px;border-radius:2px">'
            + html.escape(hit.group(0))
            + "</mark>"
        )
        cursor = finish
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)
|
|
|
|
| def _join_sentences(sentences, lang): |
| parts = [ |
| (s.get("zh_text", "") if lang == "zh" else s.get("en_text", "")).strip() |
| for s in sentences |
| ] |
| parts = [p for p in parts if p] |
| if not parts: |
| return "" |
| if lang == "zh": |
| return "".join(parts) |
| return " ".join(parts) |
|
|
|
|
| if "page" not in st.session_state: |
| st.session_state.page = 0 |
| if "expanded" not in st.session_state: |
| st.session_state.expanded = {} |
| if "search_signature" not in st.session_state: |
| st.session_state.search_signature = None |
|
|
| st.title("⚖️ 全國法規資料庫 華英檢索系統") |
| st.caption("Taiwan Law (TWL) Chinese–English Aligned Corpus - Bilingual Concordancer") |
|
|
| conn = db.get_conn(DB_PATH) |
|
|
# Sidebar: filters that narrow the search scope.  Each filter cascades
# into the next (type -> category -> individual law/order).
with st.sidebar:
    st.header("搜尋範圍過濾 Filters")


    # Law vs. administrative order; "All" disables the filter (-> None).
    # NOTE(review): type_filter only narrows the option lists below — it
    # is never passed to db.search_sentences; confirm that is intended.
    law_types = ["All", "law", "order"]
    selected_type = st.selectbox("法規/命令 Type", law_types, index=0)
    type_filter = None if selected_type == "All" else selected_type


    # Issuing-agency category, restricted by the chosen type.
    categories = db.list_categories(conn, type_filter)
    selected_cat = st.selectbox("機關 Category", ["All"] + categories, index=0)
    cat_filter = None if selected_cat == "All" else selected_cat


    # Single law/order.  Option labels are "<law_id> — <Chinese name>",
    # so splitting on " — " recovers the id.
    laws = db.list_laws(conn, type_filter, cat_filter)
    law_options = ["All"] + [f"{l['law_id']} — {l['zh_name']}" for l in laws]
    selected_law = st.selectbox("單一法規/命令 Law/Order", law_options, index=0)
    law_id_filter = None
    if selected_law != "All":
        law_id_filter = selected_law.split(" — ")[0]


    # Alignment-score ceiling; the slider at 1.0 means "no limit".
    max_score = st.slider("Max alignment score (lower = better)", 0.0, 1.0, 1.0, 0.05)
    max_score_filter = None if max_score >= 1.0 else max_score


    # Which language column(s) the query is matched against.
    lang_options = {
        "中英 / Both": "both",
        "中文 / Chinese": "zh",
        "英文 / English": "en",
    }
    selected_lang = st.radio("搜尋語言 / Search language", list(lang_options), index=0)
    lang_filter = lang_options[selected_lang]


    st.divider()
    st.caption(f"{len(laws)} laws/orders in database")
|
|
# Main search form: keyword/regex input, match options and page size.
# Streamlit re-runs the script on submit and the widget values are read
# directly afterwards, so `submitted` itself is otherwise unused.
with st.form("search_form", clear_on_submit=False):
    col1, col2, col3 = st.columns([4, 1, 1])
    with col1:
        query = st.text_input(
            "Search", placeholder="Enter keyword or regex…", key="search_query"
        )
    with col2:
        use_regex = st.checkbox("Regex", value=False)
        case_sensitive = st.checkbox("Case sensitive", value=False)
        submitted = st.form_submit_button("Submit", use_container_width=True)
    with col3:
        per_page = st.selectbox("Per page", [10, 25, 50, 100], index=1)


# Optional article-level filter, offered only once a single law/order is
# selected.  Option labels are "<zh article no> / <en article no>"; the
# Chinese number (the part before " / ") is what gets passed to search.
article_filter = None
if law_id_filter:
    articles = db.get_law_articles(conn, law_id_filter)
    art_options = ["All"] + [
        f"{a['article_no_zh']} / {a['article_no_en']}"
        for a in articles
        if a["article_no_zh"] or a["article_no_en"]
    ]
    selected_art = st.selectbox("Article", art_options, index=0)
    if selected_art != "All":
        parts = selected_art.split(" / ")
        article_filter = parts[0] if parts else None


# Reset to the first page (and collapse all expanded contexts) whenever
# any search parameter changed since the previous rerun.
search_signature = (
    query,
    use_regex,
    case_sensitive,
    per_page,
    cat_filter,
    law_id_filter,
    article_filter,
    max_score_filter,
    lang_filter,
)
if st.session_state.search_signature != search_signature:
    st.session_state.page = 0
    st.session_state.expanded = {}
    st.session_state.search_signature = search_signature
|
|
if query:
    # Run the paged search: `total` is the overall hit count, `results`
    # holds only the current page (limit/offset).
    results, total = db.search_sentences(
        conn,
        query,
        use_regex=use_regex,
        case_sensitive=case_sensitive,
        law_id=law_id_filter,
        category=cat_filter,
        article_no=article_filter,
        max_score=max_score_filter,
        lang=lang_filter,
        limit=per_page,
        offset=st.session_state.page * per_page,
    )


    st.write(f"**{total}** sentence pair{'s' if total != 1 else ''} found")


    # Pagination controls, shown only when there is more than one page.
    if total > per_page:
        total_pages = (total + per_page - 1) // per_page  # ceiling division
        cols = st.columns([1, 4, 1])
        with cols[0]:
            if st.button(
                "← Previous",
                disabled=st.session_state.page == 0,
                use_container_width=True,
            ):
                st.session_state.page -= 1
                st.session_state.expanded = {}  # collapse contexts on page change
                st.rerun()
        with cols[1]:
            st.write(f"Page {st.session_state.page + 1} of {total_pages}")
        with cols[2]:
            if st.button(
                "Next →",
                disabled=(st.session_state.page + 1) * per_page >= total,
                use_container_width=True,
            ):
                st.session_state.page += 1
                st.session_state.expanded = {}  # collapse contexts on page change
                st.rerun()


    # One bordered card per aligned sentence pair.
    for row in results:
        sid = row["id"]
        score = row["alignment_score"]
        law_ref = f"{row['law_id']} {row['zh_name']}"
        art_ref = (
            f"{row['article_no_zh']} / {row['article_no_en']}"
            if row["article_no_zh"] or row["article_no_en"]
            else ""
        )


        with st.container(border=True):
            # Header line: law reference, article reference, alignment score.
            st.markdown(
                f"`{law_ref}`{' | ' + art_ref if art_ref else ''} | Score: `{score:.4f}`"
            )


            zh_text = row["zh_text"] or ""
            en_text = row["en_text"] or ""


            # Highlight the query in both languages (regex or literal mode).
            # Note: `query` is always truthy inside this branch, so the
            # `query and` guards below are redundant but harmless.
            if query and use_regex:
                zh_display = _highlight_regex(zh_text, query, case_sensitive=case_sensitive)
                en_display = _highlight_regex(en_text, query, case_sensitive=case_sensitive)
            elif query and not use_regex:
                zh_display = _highlight(zh_text, query, case_sensitive=case_sensitive)
                en_display = _highlight(en_text, query, case_sensitive=case_sensitive)
            else:
                zh_display = html.escape(zh_text)
                en_display = html.escape(en_text)


            # Side-by-side bilingual panes (Chinese | English).
            col_zh, col_en = st.columns([2, 3])
            with col_zh:
                st.markdown(
                    f'<div class="zh-text">{zh_display}</div>', unsafe_allow_html=True
                )
            with col_en:
                st.markdown(
                    f'<div class="en-text">{en_display}</div>', unsafe_allow_html=True
                )


            # Toggle buttons that expand/collapse the surrounding context.
            exp_col1, exp_col2 = st.columns(2)
            with exp_col1:
                if st.button("▸ Paragraph", key=f"para_{sid}"):
                    st.session_state.expanded[
                        f"para_{sid}"
                    ] = not st.session_state.expanded.get(f"para_{sid}", False)
            with exp_col2:
                if st.button("▸ Article", key=f"art_{sid}"):
                    st.session_state.expanded[
                        f"art_{sid}"
                    ] = not st.session_state.expanded.get(f"art_{sid}", False)


            # Expanded view: the full paragraph containing this sentence,
            # rendered with the same query highlighting as the hit itself.
            if st.session_state.expanded.get(f"para_{sid}"):
                para = db.get_paragraph(conn, sid)
                if para:
                    with st.container(border=True):
                        st.markdown(
                            f"**Paragraph** ({para['article_no_zh']} / {para['article_no_en']})"
                        )
                        para_zh = _join_sentences(para["sentences"], "zh")
                        para_en = _join_sentences(para["sentences"], "en")
                        if query and use_regex:
                            para_zh_display = _highlight_regex(
                                para_zh, query, case_sensitive=case_sensitive
                            )
                            para_en_display = _highlight_regex(
                                para_en, query, case_sensitive=case_sensitive
                            )
                        elif query and not use_regex:
                            para_zh_display = _highlight(
                                para_zh, query, case_sensitive=case_sensitive
                            )
                            para_en_display = _highlight(
                                para_en, query, case_sensitive=case_sensitive
                            )
                        else:
                            para_zh_display = html.escape(para_zh)
                            para_en_display = html.escape(para_en)
                        c1, c2 = st.columns([2, 3])
                        with c1:
                            st.markdown(
                                f'<div class="zh-text match">{para_zh_display}</div>',
                                unsafe_allow_html=True,
                            )
                        with c2:
                            st.markdown(
                                f'<div class="en-text match">{para_en_display}</div>',
                                unsafe_allow_html=True,
                            )


            # Expanded view: the full article, one sub-section per paragraph.
            # The paragraph containing the hit sentence gets the ".match"
            # CSS variant so it stands out.
            if st.session_state.expanded.get(f"art_{sid}"):
                article = db.get_article(conn, sid)
                if article:
                    with st.container(border=True):
                        st.markdown(
                            f"**Article** ({article['article_no_zh']} / {article['article_no_en']})"
                        )
                        for pi, para in enumerate(article["paragraphs"]):
                            st.markdown(f"*Paragraph {pi + 1}*")
                            art_zh = _join_sentences(para["sentences"], "zh")
                            art_en = _join_sentences(para["sentences"], "en")
                            if query and use_regex:
                                art_zh_display = _highlight_regex(
                                    art_zh, query, case_sensitive=case_sensitive
                                )
                                art_en_display = _highlight_regex(
                                    art_en, query, case_sensitive=case_sensitive
                                )
                            elif query and not use_regex:
                                art_zh_display = _highlight(
                                    art_zh, query, case_sensitive=case_sensitive
                                )
                                art_en_display = _highlight(
                                    art_en, query, case_sensitive=case_sensitive
                                )
                            else:
                                art_zh_display = html.escape(art_zh)
                                art_en_display = html.escape(art_en)
                            # Does this paragraph contain the hit sentence?
                            contains_match = any(
                                s["id"] == sid for s in para["sentences"]
                            )
                            c1, c2 = st.columns([2, 3])
                            with c1:
                                st.markdown(
                                    f'<div class="zh-text{" match" if contains_match else ""}">{art_zh_display}</div>',
                                    unsafe_allow_html=True,
                                )
                            with c2:
                                st.markdown(
                                    f'<div class="en-text{" match" if contains_match else ""}">{art_en_display}</div>',
                                    unsafe_allow_html=True,
                                )


elif not query:  # equivalent to a plain `else:` — query is falsy here
    st.info("Enter a search term above to find aligned sentence pairs.")
|
|
st.divider()
st.caption("TWL Concordancer | Taiwan Law Bilingual Corpus")


# NOTE(review): closes the connection at the end of every script rerun —
# assumes db.get_conn returns a fresh connection each time (i.e. is not
# cached with st.cache_resource); confirm against db.py.
conn.close()
|
|