NTU / src /stwebm_parquet_wa.py
rubentsui's picture
Update src/stwebm_parquet_wa.py
bd8e0e2 verified
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 11 19:51:02 2022
Modified on Sun Aug 24 22:45:00 2025 to streamline code
@author: ruben
"""
import streamlit as st
from biconWebStmParquetWa import regex_search as rs
def init_session_state():
    """Seed ``st.session_state`` with a default for every key not yet set.

    Keys that already exist are left untouched, so values persist across
    Streamlit reruns of the script.
    """
    # A fresh dict (and a fresh list for 'success_messages') is built on
    # every call, so no mutable default is shared between sessions.
    defaults = {
        'queryresults': None,
        'page': 1,
        'maxpage': 0,
        'minpage': 1,
        'datasize': 0,
        'chunksize': 20,
        'success_messages': [],
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
def reset_session_state():
    """Put the query-related session values back to their initial state.

    Overwrites the stored results, paging counters, result count and status
    messages unconditionally. ``chunksize`` is deliberately not touched here.
    """
    fresh_values = (
        ('queryresults', None),
        ('page', 1),
        ('maxpage', 0),
        ('minpage', 1),
        ('datasize', 0),
        ('success_messages', []),
    )
    for key, value in fresh_values:
        st.session_state[key] = value
def buildTable(page):
    """Build the HTML table for one page of query results.

    Reads ``st.session_state.queryresults``, a list of pages where each page
    is a list of tab-separated strings with four fields: corpus, score,
    English text and Chinese text.

    Args:
        page: 1-based page number to render.

    Returns:
        An HTML ``<table>`` string. Each result produces two rows: one with
        the corpus and score, one with the running result number and the two
        text fields. The table is empty when there are no results or when
        ``page`` is outside the available pages.
    """
    pages = st.session_state.queryresults
    opening = '<table width="100%">'
    closing = '</table>'

    # Guard against missing results and out-of-range pages; without this a
    # page of 0 would silently wrap around to the last page via index -1.
    if not pages or not 1 <= page <= len(pages):
        return opening + closing

    # Number rows from the actual sizes of the preceding pages rather than
    # from the current chunk size, which may have changed since the results
    # were split into pages.
    first_index = sum(len(chunk) for chunk in pages[:page - 1])

    parts = [opening]
    for offset, row in enumerate(pages[page - 1]):
        try:
            corpus, score, en, zh = row.split('\t')
        except (ValueError, AttributeError, TypeError):
            # Malformed row (wrong field count or not a string): skip it but
            # keep the numbering of the remaining rows unchanged.
            continue
        index = first_index + offset
        parts.append(
            '<tr>'
            f'<td>{corpus}</td><td colspan=2>{score}</td>'
            '</tr>'
            '<tr>'
            f'<td>{index + 1}</td><td width="45%" valign="top">{en}</td><td width="50%" valign="top">{zh}</td>'
            '</tr>'
        )
    parts.append(closing)
    return ''.join(parts)
def _paginate(rows, per_page):
    """Split ``rows`` into consecutive lists of at most ``per_page`` items."""
    return [rows[i:i + per_page] for i in range(0, len(rows), per_page)]


def main():
    """Run the Streamlit app: draw the controls, run searches, show a page.

    The sidebar collects the query, corpora, result limit, case sensitivity
    and rows-per-page. Pressing the search button calls ``rs`` once per
    selected corpus and stores the combined results, split into pages, in
    ``st.session_state``. The First/Prev/Next/Last buttons move the current
    page, and the current page is rendered through ``buildTable``.
    """
    appTitle = '臺大法規雙語查詢系統'
    sources = ('NTURegs', 'VOA')
    st.set_page_config(
        page_title=appTitle,
        layout='wide',
        initial_sidebar_state='auto',
        menu_items={
            'Get Help': 'https://streamlit.io/',
            'Report a bug': 'https://github.com',
            'About': f'**{appTitle}**\nCopyright (c) Ruben G. Tsui'
        }
    )
    page_style = '''
<style>
.css-o18uir.e16nr0p33 {
margin-top: -125px;
}
.reportview-container .css-1lcbmhc .css-1outpf7 {
padding-top: -125px;
}
.reportview-container .main .block-container{
padding-top: 0rem;
padding-right: 0rem;
padding-left: 0rem;
padding-bottom: 0rem;}
p.europe {
font-family: Source Pro, Consolas, LingWai TC, Menlo, Courier New, Arial;
font-size: 16px;
}
p.cjk {
font-family: Microsoft Jhenghei, Source Han Sans, Noto Sans CJK TC Regular, Hiragino Sans CNS, LantingHei TC, Source Han Serif;
font-size: 18px;
}
</style>
'''
    st.markdown(page_style, unsafe_allow_html=True)
    st.sidebar.subheader(appTitle)
    # Reserve the results area before the sidebar widgets are created.
    table_placeholder = st.empty()

    with st.sidebar:
        query = st.text_area('輸入搜尋字串').strip()
        multicorpora = st.multiselect('選擇語料庫(可複選)', sources, ['NTURegs'])
        colc, cold = st.columns([1, 1])
        with colc:
            submit_button = st.button('搜尋')
        with cold:
            # The selection is not read anywhere: every search below is
            # issued with literal=False.
            st.radio("Regex search", ["Yes", "Always"], horizontal=True)
        cola, colb = st.columns([1, 1])
        with cola:
            size = st.selectbox('筆數上限', [10, 20, 50, 100, 200, 500, 5000], index=2)
        with colb:
            case_sensitive = st.radio("Case sensitive", ["No", "Yes"], horizontal=True)
        st.session_state.chunksize = st.slider("每頁筆數", 1, 50, 20)

        # Page navigation buttons.
        # NOTE(review): placed inside the sidebar on the assumption that all
        # controls live there - confirm against the intended layout.
        col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
        with col1:
            first_button = st.button('First')
        with col2:
            prev_button = st.button('Prev')
        with col3:
            next_button = st.button('Next')
        with col4:
            last_button = st.button('Last')

    # The rows-per-page slider can change between reruns. Re-split the stored
    # results with the current chunk size so page contents, row numbering and
    # maxpage stay consistent. Skipped on submit, which rebuilds everything.
    if st.session_state.queryresults is not None and not submit_button:
        stored_rows = [
            row for chunk in st.session_state.queryresults for row in chunk
        ]
        st.session_state.queryresults = _paginate(
            stored_rows, st.session_state.chunksize
        )
        st.session_state.maxpage = len(st.session_state.queryresults)

    # Navigation
    if next_button and st.session_state.page < st.session_state.maxpage:
        st.session_state.page += 1
    if prev_button and st.session_state.page > st.session_state.minpage:
        st.session_state.page -= 1
    if first_button:
        st.session_state.page = st.session_state.minpage
    if last_button:
        # With zero results maxpage is 0; never go below the first page.
        st.session_state.page = max(
            st.session_state.maxpage, st.session_state.minpage
        )

    if submit_button:
        reset_session_state()
        all_results = []
        success_messages = ['No. of matches found: ']
        for c in multicorpora:
            # rs identifies a corpus by its position in ``sources``.
            selectedCorpus = sources.index(c)
            results = rs(
                query,
                c=selectedCorpus,
                max_matches=size,
                case_sensitive=(case_sensitive == "Yes"),
                literal=False,
            )
            success_messages.append(f'[{c}]: {len(results)}')
            all_results.extend(results)
        st.session_state.success_messages = success_messages
        slices = _paginate(all_results, st.session_state.chunksize)
        st.session_state.datasize = len(all_results)
        st.session_state.maxpage = len(slices)
        st.session_state.queryresults = slices

    # Keep the current page inside [minpage, maxpage]; it can fall outside
    # after the number of pages shrinks.
    highest_page = max(st.session_state.maxpage, st.session_state.minpage)
    st.session_state.page = min(
        max(st.session_state.page, st.session_state.minpage), highest_page
    )

    if st.session_state.queryresults is not None:
        table_placeholder.empty()
        with table_placeholder.container():
            if st.session_state.success_messages:
                messages = " | ".join(st.session_state.success_messages)
                st.markdown(f'''
<div style="
border: 1px solid yellow;
padding: 5px;
border-radius: 5px;
font-size: 0.9em;
margin-bottom: 10px;
background-color: black;
color: yellow;
margin-top: -25px;">
{messages}
</div>
''', unsafe_allow_html=True)
            st.markdown(f"page {st.session_state.page} of {st.session_state.maxpage}")
            table = buildTable(st.session_state.page)
            st.markdown(table, unsafe_allow_html=True)
# Script entry point: make sure the session defaults exist, then draw the app.
if "__main__" == __name__:
    init_session_state()
    main()