NTU

Running

App Files Files Community

NTU / src /stwebm_parquet_wa.py

rubentsui

Update src/stwebm_parquet_wa.py

bd8e0e2 verified 13 days ago

raw

history blame contribute delete

6.93 kB

	# -*- coding: utf-8 -*-
	"""
	Created on Sun Dec 11 19:51:02 2022
	Modified on Sun Aug 24 22:45:00 2025 to streamline code
	@author: ruben
	"""

	import streamlit as st
	from biconWebStmParquetWa import regex_search as rs

	def init_session_state():
	    """Seed ``st.session_state`` with a default for every key not yet set.

	    Keys that already exist are left untouched, so values survive Streamlit
	    reruns of the script.
	    """
	    defaults = {
	        'queryresults': None,    # paginated search results, None until a search runs
	        'page': 1,               # page currently displayed
	        'maxpage': 0,            # number of pages available
	        'minpage': 1,            # first valid page number
	        'datasize': 0,           # total number of result rows
	        'chunksize': 20,         # rows per page
	        'success_messages': [],  # per-corpus match-count messages
	    }
	    for key, value in defaults.items():
	        if key not in st.session_state:
	            st.session_state[key] = value

	def reset_session_state():
	    """Overwrite the search-related session values with their initial values.

	    Unlike ``init_session_state`` this assigns unconditionally. ``chunksize``
	    is not modified here.
	    """
	    initial_values = (
	        ('queryresults', None),
	        ('page', 1),
	        ('maxpage', 0),
	        ('minpage', 1),
	        ('datasize', 0),
	        ('success_messages', []),
	    )
	    for key, value in initial_values:
	        st.session_state[key] = value

	def buildTable(page):
	    """Build the HTML table for one page of search results.

	    Reads ``st.session_state.queryresults``, a list of pages where each page
	    is a list of strings holding four tab-separated fields (corpus, score,
	    English text, Chinese text).

	    Args:
	        page: 1-based page number to render.

	    Returns:
	        An HTML ``<table>`` string with two ``<tr>`` rows per result. The
	        table is empty when there are no results or ``page`` is out of range.
	    """
	    pages = st.session_state.queryresults or []
	    if not 1 <= page <= len(pages):
	        return '<table width="100%"></table>'

	    # Number rows from the actual sizes of the preceding pages rather than
	    # from the current chunksize: the slider may have changed since the
	    # results were sliced, which previously dropped or misnumbered rows.
	    first_index = sum(len(chunk) for chunk in pages[:page - 1])

	    parts = ['<table width="100%">']
	    for index, row in enumerate(pages[page - 1], start=first_index):
	        try:
	            corpus, score, en, zh = row.split('\t')
	        except (ValueError, AttributeError):
	            # Best effort: skip rows that are not a 4-field tab-separated string.
	            continue
	        # NOTE(review): fields are inserted unescaped; presumably they may
	        # carry highlight markup from the search backend - confirm before
	        # adding HTML escaping.
	        parts.append(
	            '<tr>'
	            f'<td>{corpus}</td><td colspan=2>{score}</td>'
	            '</tr>'
	            '<tr>'
	            f'<td>{index + 1}</td><td width="45%" valign="top">{en}</td><td width="50%" valign="top">{zh}</td>'
	            '</tr>'
	        )
	    parts.append('</table>')
	    return ''.join(parts)

	def _paginate(items, size):
	    """Split *items* into consecutive lists of at most *size* elements."""
	    return [items[i:i + size] for i in range(0, len(items), size)]


	def main():
	    """Run the Streamlit app: draw the controls, run searches, show one page.

	    On every script run this renders the sidebar widgets, applies any
	    First/Prev/Next/Last navigation to ``st.session_state.page``, performs a
	    new search when the search button was pressed, and finally renders the
	    current page of stored results (if any) into a placeholder.
	    """
	    appTitle = '臺大法規雙語查詢系統'
	    sources = ('NTURegs', 'VOA')

	    st.set_page_config(
	        page_title=appTitle,
	        layout='wide',
	        initial_sidebar_state='auto',
	        menu_items={
	            'Get Help': 'https://streamlit.io/',
	            'Report a bug': 'https://github.com',
	            'About': f'{appTitle}\nCopyright (c) Ruben G. Tsui',
	        }
	    )

	    page_style = '''
	<style>
	.css-o18uir.e16nr0p33 {
	margin-top: -125px;
	}
	.reportview-container .css-1lcbmhc .css-1outpf7 {
	padding-top: -125px;
	}
	.reportview-container .main .block-container{
	padding-top: 0rem;
	padding-right: 0rem;
	padding-left: 0rem;
	padding-bottom: 0rem;}
	p.europe {
	font-family: Source Pro, Consolas, LingWai TC, Menlo, Courier New, Arial;
	font-size: 16px;
	}
	p.cjk {
	font-family: Microsoft Jhenghei, Source Han Sans, Noto Sans CJK TC Regular, Hiragino Sans CNS, LantingHei TC, Source Han Serif;
	font-size: 18px;
	}
	</style>
	'''
	    st.markdown(page_style, unsafe_allow_html=True)

	    st.sidebar.subheader(appTitle)

	    table_placeholder = st.empty()

	    # Remember the page size the stored results were sliced with, so a
	    # slider change can be detected after the widgets are drawn.
	    previous_chunksize = st.session_state.chunksize

	    # NOTE(review): the nesting of the widgets below (in particular the
	    # navigation buttons living inside the sidebar) is reconstructed -
	    # confirm against the deployed layout.
	    with st.sidebar:
	        query = st.text_area('輸入搜尋字串').strip()
	        multicorpora = st.multiselect('選擇語料庫（可複選）', sources, ['NTURegs'])

	        colc, cold = st.columns([1, 1])
	        with colc:
	            submit_button = st.button('搜尋')
	        with cold:
	            # The selection is not consulted below: every search is issued
	            # with literal=False.
	            st.radio("Regex search", ["Yes", "Always"], horizontal=True)

	        cola, colb = st.columns([1, 1])
	        with cola:
	            size = st.selectbox('筆數上限', [10, 20, 50, 100, 200, 500, 5000], index=2)
	        with colb:
	            case_sensitive = st.radio("Case sensitive", ["No", "Yes"], horizontal=True)

	        st.session_state.chunksize = st.slider("每頁筆數", 1, 50, 20)

	        # Page-navigation buttons
	        col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
	        with col1:
	            first_button = st.button('First')
	        with col2:
	            prev_button = st.button('Prev')
	        with col3:
	            next_button = st.button('Next')
	        with col4:
	            last_button = st.button('Last')

	    # If the page size changed since the stored results were sliced, re-chunk
	    # them so the pages, page count and current page stay consistent.
	    if (st.session_state.queryresults is not None
	            and st.session_state.chunksize != previous_chunksize):
	        flat_results = [row for chunk in st.session_state.queryresults for row in chunk]
	        st.session_state.queryresults = _paginate(flat_results, st.session_state.chunksize)
	        st.session_state.maxpage = len(st.session_state.queryresults)
	        st.session_state.page = max(
	            st.session_state.minpage,
	            min(st.session_state.page, st.session_state.maxpage),
	        )

	    # Navigation
	    if next_button and st.session_state.page < st.session_state.maxpage:
	        st.session_state.page += 1
	    if prev_button and st.session_state.page > st.session_state.minpage:
	        st.session_state.page -= 1
	    if first_button:
	        st.session_state.page = st.session_state.minpage
	    # With no pages (maxpage == 0) "Last" must not move the page below minpage.
	    if last_button and st.session_state.maxpage >= st.session_state.minpage:
	        st.session_state.page = st.session_state.maxpage

	    if submit_button:
	        reset_session_state()
	        all_results = []
	        success_messages = ['No. of matches found: ']
	        for c in multicorpora:
	            # The corpus is passed to the search function as its index in `sources`.
	            selectedCorpus = sources.index(c)
	            results = rs(
	                query,
	                c=selectedCorpus,
	                max_matches=size,
	                case_sensitive=(case_sensitive == "Yes"),
	                literal=False,
	            )
	            success_messages.append(f'[{c}]: {len(results)}')
	            all_results.extend(results)

	        st.session_state.success_messages = success_messages

	        slices = _paginate(all_results, st.session_state.chunksize)
	        st.session_state.datasize = len(all_results)
	        st.session_state.maxpage = len(slices)
	        st.session_state.queryresults = slices

	    if st.session_state.queryresults is not None:
	        table_placeholder.empty()
	        with table_placeholder.container():
	            if st.session_state.success_messages:
	                # "\\|" yields a literal backslash-pipe without relying on an
	                # invalid escape sequence.
	                messages = " \\| ".join(st.session_state.success_messages)
	                st.markdown(f'''
	<div style="
	border: 1px solid yellow;
	padding: 5px;
	border-radius: 5px;
	font-size: 0.9em;
	margin-bottom: 10px;
	background-color: black;
	color: yellow;
	margin-top: -25px;">
	{messages}
	</div>
	''', unsafe_allow_html=True)

	            st.markdown(f"page {st.session_state.page} of {st.session_state.maxpage}")
	            table = buildTable(st.session_state.page)
	            st.markdown(table, unsafe_allow_html=True)

	if __name__ == "__main__":
	    # Session defaults must exist before main() reads them.
	    init_session_state()
	    main()