rubentsui committed on
Commit
ce5a2cb
·
verified ·
1 Parent(s): 89c8272

Upload 3 files

Browse files
src/NTURegswa.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a69e6661df8d4df83e0d7525b7d425091f33fc95e6e6f5a0b06ede7b9ab686fe
3
+ size 2103648
src/biconWebStmParquetWa.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/home/rubentsui/anaconda3/bin/python
2
+ # coding: utf-8
3
+
4
+ import regex as re
5
+ import sys
6
+ #import time, datetime
7
+ import gzip, bz2, lzma
8
+ from collections import Counter
9
+ import itertools
10
+ from lemminflect import getAllInflections #, getAllLemmas
11
+ from opencc import OpenCC
12
+ import polars as pl
13
+ import struct
14
+
15
+ openCC = OpenCC('t2s')
16
+
17
def file_open(filepath):
    """Open *filepath* for reading as UTF-8 text, decompressing by extension.

    Paths ending in ``.gz``, ``.bz2`` or ``.xz`` are opened through the
    matching compression module in text mode; any other path is opened as
    a plain text file. The open file object is returned.
    """
    openers = (
        ('.gz', gzip.open),
        ('.bz2', bz2.open),
        ('.xz', lzma.open),
    )
    for suffix, opener in openers:
        if filepath.endswith(suffix):
            return opener(filepath, 'rt', encoding='utf8')
    return open(filepath, 'r', encoding='utf8')
27
+
28
+
29
class color:
    """Markup snippets used when highlighting tokens.

    Some attributes are ANSI terminal escape sequences, others are HTML
    fragments. ``END`` is the closing text for a ``<font>`` tag followed
    by a ``<b>`` tag.
    """

    # ANSI terminal escape sequences
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    YELLOW = '\033[93m'
    UNDERLINE = '\033[4m'

    # HTML opening tags
    BLUE = '<font color="blue">'
    GREEN = '<font color="#36f307">'
    RED = '<font color="red">'
    BOLD = '<b>'
    CURSOR_UP = '<font color="blue"><b>'

    # HTML closing tags matching <font ...><b>
    END = '</b></font>'
41
+
42
+
43
def flatten(l):
    """Return a flat list of the items of the nested list *l*.

    Only ``list`` instances are descended into; tuples, strings and other
    objects are kept as single items. Item order is preserved.
    """
    flat = []
    pending = [iter(l)]  # stack of partially consumed iterators
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator resumes after the child is done.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            pending.pop()
    return flat
51
+
52
def getInflections(s):
    '''
    Return the distinct inflected forms of the lemma s.

    Every form found in the mapping returned by getAllInflections(s) is
    collected. When nothing is collected, [s] is returned so the caller
    always gets at least one form. The order of the result is not defined.
    '''
    forms = {form for group in getAllInflections(s).values() for form in group}
    return list(forms) if forms else [s]
64
+
65
+
66
def mergeDicts(D1, D2):
    '''
    Merge two dicts of numbers, adding the values of keys present in both.

    Input example:
    D1 = {'a':2, 'b':3, 'c': 1}
    D2 = {'b':5, 'c': 0, 'd': 7}
    Output:
    D = {'a':2, 'b': 3+5, 'c': 1+0, 'd': 7}

    Neither input is modified. Keys keep their first-seen order (D1 first,
    then keys only in D2).
    '''
    # Summed explicitly rather than with Counter(D1) + Counter(D2):
    # Counter addition silently discards keys whose total is zero or negative.
    merged = dict(D1)
    for key, value in D2.items():
        merged[key] = merged.get(key, 0) + value
    return merged
75
+
76
+
77
def sortTuples(L):
    '''
    L: list of 2-tuples in the form [(1,2), (1,3), (4,3), (3,10), (4,5), (9,2)]
    Return a new list sorted by the 1st number in each tuple, then by the
    2nd number, both in ascending order. L itself is not modified.
    '''
    # Tuples compare element by element, which is exactly the order wanted.
    return sorted(L)
83
+
84
def buildLexicon(matched_tokens, alignment, e, z):
    '''
    Group the target tokens aligned to each matched source token.

    alignment is a collection of (i, j) index pairs linking e[i] to z[j].
    Pairs are visited in sorted order; for every pair whose e[i] is in
    matched_tokens, z[j] is collected under e[i]. The collected tokens are
    joined with single spaces and given a count of 1, e.g.
    (1, 3), (1,4), (1, 5) becomes {e[1]: {' '.join([z[3], z[4], z[5]]): 1}}
    '''
    grouped = {}
    for src_idx, tgt_idx in sorted(alignment):
        source = e[src_idx]
        if source in matched_tokens:
            grouped.setdefault(source, []).append(z[tgt_idx])
    return {source: {' '.join(targets): 1} for source, targets in grouped.items()}
102
+
103
+
104
+
105
def sentAlignHighlight(s, alignment, e, z):
    # s = collection of matched source tokens
    # alignment = iterable of pairs (i, j) where e[i] is mapped to z[j]
    # e = list of source tokens
    # z = list of target tokens
    # Returns (source sentence, target sentence) as space-joined strings in
    # which every matched source token is wrapped in red bold markup and
    # every target token aligned to it is wrapped in green bold markup.
    marked_source = list(e)
    marked_target = list(z)
    hits = [(i, j) for (i, j) in alignment if e[i] in s]
    for i, j in hits:
        # Always wrap the original token, so repeated hits never nest markup.
        marked_source[i] = f'{color.RED}{color.BOLD}{e[i]}{color.END}'
        marked_target[j] = f'{color.GREEN}{color.BOLD}{z[j]}{color.END}'
    return ' '.join(marked_source), ' '.join(marked_target)
121
+
122
+
123
def bindata2tuplelist(binary_data):
    '''
    Decode a bytes object into a list of 2-tuples of integers.

    binary_data is read as consecutive 4-byte records, each holding two
    big-endian unsigned 16-bit integers (struct format '>HH'). Trailing
    bytes that do not fill a whole record are ignored.
    '''
    record_size = struct.calcsize('>HH')  # 4 bytes: H + H
    usable = len(binary_data) - len(binary_data) % record_size
    return list(struct.iter_unpack('>HH', binary_data[:usable]))
136
+
137
+
138
# Corpus codes; the position of a code is the integer `c` used as key of C and C2.
_CORPUS_CODES = (
    'TWP', 'Patten', 'UNPC', 'FIN', 'QING', 'TWL', 'NTURegs', 'FTV', 'SAT',
    'CIA', 'NEJM', 'VOA', 'NYT', 'BBC', 'Quixote', 'Wiki', 'TEST',
)

# One "[index] code" listing line per corpus.
corpora = [f"[{idx}] {code}" for idx, code in enumerate(_CORPUS_CODES)]

# index -> corpus file name with the ".xz" suffix, e.g. 0 -> "TWP.xz"
C = {idx: code + ".xz" for idx, code in enumerate(_CORPUS_CODES)}
# index -> bare corpus code, e.g. 0 -> "TWP"
C2 = dict(enumerate(_CORPUS_CODES))
160
+
161
+
162
def _compile_token_patterns(phrase, flags=0):
    '''
    Compile one whole-word regex per whitespace-separated token of phrase.

    Each regex matches any inflection of its token as returned by
    getInflections(). Inflections are escaped so that regex metacharacters
    typed by the user are matched literally instead of breaking (or
    hijacking) the pattern.
    '''
    patterns = []
    for token in phrase.split():
        alternatives = '|'.join(re.escape(form) for form in getInflections(token))
        patterns.append(re.compile(fr"\b({alternatives})\b", flags=flags))
    return patterns


def _format_summary(cnt, lexicon):
    '''
    Build the summary text: the match count followed by one
    "count<TAB>translation" entry per non-empty translation, most frequent
    first within each source token.
    '''
    parts = [f"No. of matches: {cnt}\n<br>\n"]
    for translations in lexicon.values():
        for target in sorted(translations, key=translations.get, reverse=True):
            if target:
                parts.append(f"{translations[target]}\t{target}\n<br>\n")
    return ''.join(parts)


def _search_corpus(s, c, max_matches, stats_only, search_zh):
    '''
    Scan corpus file C[c] for lines whose source side contains every token
    of the phrase s, and return (html_lines, summary).

    Each corpus line is expected to hold "score<TAB>en<TAB>zh"; lines with
    fewer than three fields are skipped. The source side is zh when
    search_zh is true, otherwise en (matched case-insensitively).
    '''
    corpus = C[c]
    corpus_code = C2[c]

    patterns = _compile_token_patterns(s, 0 if search_zh else re.IGNORECASE)
    if not patterns:
        # Still return a (results, summary) pair so callers can unpack it.
        return [], f"Sorry, zero matches found for search phrase [{s}].\n<br>\n"

    if search_zh:
        source_class, target_class = 'chinese', 'europe'
    else:
        source_class, target_class = 'europe', 'chinese'

    html_text = []
    lexicon = {}
    cnt = 0
    with file_open(corpus) as fi:
        for line in fi:
            fields = line.strip().split('\t', maxsplit=2)
            if len(fields) < 3:
                continue
            score, en, zh = fields
            en = en.replace('``', '‘‘').replace("''", '’’')
            source_text, target_text = (zh, en) if search_zh else (en, zh)

            per_pattern = [p.findall(source_text) for p in patterns]
            if all(per_pattern):  # every token of the phrase occurs in this line
                cnt += 1
                matched_tokens = flatten(per_pattern)
                source_tokens = source_text.split()
                target_tokens = target_text.split()
                # NOTE(review): align_word is neither defined nor imported in
                # the visible part of this module; it must be provided
                # elsewhere or this call raises NameError.
                a = align_word(source_text, target_text)
                if not stats_only:
                    source_marked, target_marked = sentAlignHighlight(
                        matched_tokens, a, source_tokens, target_tokens)
                    html_text.append(
                        f'[{corpus_code}]\t{score}'
                        f'\t<p class="{source_class}">{source_marked}</p>'
                        f'\t<p class="{target_class}">{target_marked}</p>'
                    )
                per_line = buildLexicon(matched_tokens, a, source_tokens, target_tokens)
                for k, v in per_line.items():
                    lexicon[k] = mergeDicts(lexicon[k], v) if k in lexicon else v

            if cnt >= max_matches:
                break

    return html_text, _format_summary(cnt, lexicon)


def sz(s, c=0, max_matches=50, stats_only=False):
    '''
    s: Chinese search phrase (tokens separated by white space)
    c: corpus index (key of C / C2)
    max_matches: stop after this many matching lines
    stats_only: when true, no highlighted result lines are produced

    Returns (html_text, summary): html_text is a list of
    "[corpus]<TAB>score<TAB><p chinese><TAB><p europe>" strings and summary
    is the translation-frequency text. For a phrase without tokens the
    result is ([], message).
    '''
    return _search_corpus(s, c, max_matches, stats_only, search_zh=True)


def se(s, c=0, max_matches=50, stats_only=False):
    '''
    s: English search phrase (tokens separated by white space); matching
       is case-insensitive and covers all inflections of each token
    c: corpus index (key of C / C2)
    max_matches: stop after this many matching lines
    stats_only: when true, no highlighted result lines are produced

    Returns (html_text, summary): html_text is a list of
    "[corpus]<TAB>score<TAB><p europe><TAB><p chinese>" strings and summary
    is the translation-frequency text. For a phrase without tokens the
    result is ([], message).
    '''
    return _search_corpus(s, c, max_matches, stats_only, search_zh=False)
332
+
333
+
334
# Matches one character in the range U+4E00 (一) to U+9FA5 (龥).
regex_zh = re.compile(r"[一-龥]")


def s(ss, c=0, max_matches=100, stats_only=False):
    """Dispatch the phrase *ss* to the Chinese (sz) or English (se) search.

    A phrase containing at least one character matched by ``regex_zh`` goes
    to ``sz``, anything else to ``se``; for corpus index 99 the choice is
    swapped. The chosen function's return value is returned unchanged.
    """
    has_chinese = bool(regex_zh.findall(ss))
    swapped = c in [99]  # ROCLaws has en, zh reversed
    if has_chinese:
        actual_search_function = se if swapped else sz
        print(f"Search by Chinese: actual search function = [{actual_search_function}]")
    else:
        actual_search_function = sz if swapped else se

    return actual_search_function(ss, c=c, max_matches=max_matches, stats_only=stats_only)
350
+
351
+
352
def tokenIndices(ss, i, j):
    '''
    Given: input indices (i, j) of the string ss (tokens separated by white space),
    Return: (start, stop) such that ss.split()[start:stop] are the tokens
    that overlap the substring ss[i:j].

    A token only partly covered by the span is included, so a match that
    begins or ends in the middle of a token still selects that token.
    An empty span (j <= i) yields an empty range.
    '''
    def inside_token(pos):
        # True when pos cuts a token in two (non-space on both sides).
        return 0 < pos < len(ss) and not ss[pos - 1].isspace() and not ss[pos].isspace()

    start = len(ss[:i].split())
    if j <= i:
        return start, start

    if inside_token(i):
        # The fragment before i was counted as a token; it is the first hit.
        start -= 1

    tokens_after = len(ss[j:].split())
    if inside_token(j):
        # The fragment after j belongs to the last token touched by the span.
        tokens_after -= 1

    return start, len(ss.split()) - tokens_after  # these are the list indices
362
+
363
+
364
def _aligned_hits(pattern, text, links):
    '''
    Return (source_indices, target_indices) of the tokens to highlight.

    source_indices are the token positions of text touched by any match of
    pattern; target_indices are the positions linked to them by the
    (source index, target index) pairs in links.
    '''
    source_hits = set()
    for m in pattern.finditer(text):
        k, q = tokenIndices(text, m.start(), m.end())
        source_hits.update(range(k, q))
    target_hits = {tgt for (src, tgt) in links if src in source_hits}
    return source_hits, target_hits


def _mark_tokens(tokens, indices, colour):
    '''
    Wrap tokens[i] in colour + bold markup, in place, for each i in indices.
    Indices outside the token list are skipped.
    '''
    for i in indices:
        if 0 <= i < len(tokens):
            tokens[i] = f"{colour}{color.BOLD}{tokens[i]}{color.END}"


def regex_search(ss, c=0, max_matches=100, stats_only=False, literal=False):
    '''
    Search one corpus parquet file for the pattern ss and highlight the hits.

    ss: search pattern; if it contains a character matched by regex_zh the
        'zh' column is searched, otherwise the 'en' column
    c: corpus index (key of C2); rows are read from "<corpus code>wa.parquet"
    max_matches: maximum number of result rows returned
    stats_only: accepted for signature parity with s(); not used here
    literal: when true, ss is matched as plain text instead of as a regex

    Returns a list of tab-separated strings
    "[corpus]<TAB>score<TAB><p>source</p><TAB><p>target</p>". Matched
    source tokens are marked red and the tokens aligned to them (according
    to the 'word_alignments' column) are marked green.
    '''
    zhSearch = bool(regex_zh.findall(ss))
    corpus_code = C2[c]

    if literal:
        # The text itself is the needle; wrapping it in "(...)" would make
        # polars look for literal parentheses.
        column_pattern = ss
        p = re.compile(re.escape(ss))
    else:
        column_pattern = fr"({ss})"
        # NOTE(review): polars evaluates the pattern with its own regex
        # engine while highlighting uses the module imported here as `re`;
        # syntax supported by only one of them will behave differently.
        p = re.compile(column_pattern)

    df = pl.read_parquet(f"{corpus_code}wa.parquet")
    column = 'zh' if zhSearch else 'en'
    res = df.filter(
        pl.col(column).str.contains(column_pattern, literal=literal)
    )
    # Cap before converting so at most max_matches rows are materialised.
    query_results = res.head(max(max_matches, 0)).to_dict(as_series=False)

    results = []
    rows = zip(query_results['en'], query_results['zh'], query_results['word_alignments'])
    for en, zh, packed in rows:
        enList = en.split()
        zhList = zh.split()
        # Decoded pairs are used as (en token index, zh token index).
        was = bindata2tuplelist(packed)

        if zhSearch:
            links = [(z, e) for (e, z) in was]
            source_hits, target_hits = _aligned_hits(p, zh, links)
            _mark_tokens(zhList, source_hits, color.RED)
            _mark_tokens(enList, target_hits, color.GREEN)
        else:
            source_hits, target_hits = _aligned_hits(p, en, was)
            _mark_tokens(enList, source_hits, color.RED)
            _mark_tokens(zhList, target_hits, color.GREEN)

        zhSub = ' '.join(zhList)
        enSub = ' '.join(enList)
        if zhSearch:
            score = '_score_'
            lineOut = f'[{corpus_code}]\t{score}\t<p class="chinese">{zhSub}</p>\t<p class="europe">{enSub}</p>'
        else:
            score = 1
            lineOut = f'[{corpus_code}]\t{score}\t<p class="europe">{enSub}</p>\t<p class="chinese">{zhSub}</p>'
        results.append(lineOut)

    return results
443
+
444
+ #'''
445
+ #EXAMPLES of REGEX RESEARCH
446
+ #(take[sn]|taking|took) .{1,20} for granted
447
+ #'''
448
+
449
if __name__ == '__main__':

    # Print a usage banner, then one line per corpus index with its code
    # (the file name from C with the 3-character ".xz" suffix removed).
    print('\n\n'+'='*100)
    print("""Usage:
Chinese search phrase
s('打擊 犯罪', c=0)
English search phrase
s('preemptive strike', c=0, max_matches=200)
Type C (Capital "C") followed by the <Enter> key to see a list of corpora available.
""")
    for c in C:
        print(f"c={c}: {C[c][:-3]}")
461
+
462
+
src/stwebm_parquet_wa.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sun Dec 11 19:51:02 2022
4
+
5
+ @author: ruben
6
+ """
7
+
8
+ import streamlit as st
9
+ #from streamlit.components.v1 import html
10
+ #import streamlit.components.v1 as components
11
+ import pandas as pd
12
+ #import numpy as np
13
+ #from datetime import datetime
14
+
15
+ from biconWebStmParquetWa import s as search, regex_search as rs
16
+
17
+ # Session variables
18
# Session variables: each key is given its default only when it is missing.
_SESSION_DEFAULTS = {
    'queryresults': None,
    'page': 1,        # starts at 1, not 0
    'maxpage': 0,
    'minpage': 1,
    'datasize': 0,
    'chunksize': 3,
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default


#if 'table' not in st.session_state:
#    st.session_state.table = ''
table = 'Empty Table'
40
+
41
+
42
def buildTable(page):
    """Render one page of the stored query results as an HTML table.

    Reads ``queryresults`` (a list of row chunks), ``datasize`` and
    ``chunksize`` from ``st.session_state``. Every row is a string of four
    tab-separated fields: corpus tag, score, first text, second text.
    Rows that cannot be read or do not have exactly four fields are skipped.

    page: 1-based page number.
    Returns the table markup as a single string.
    """
    slices = st.session_state.queryresults
    datasize = st.session_state.datasize
    n = st.session_state.chunksize

    parts = ['<table width="100%">\n']
    for j in range(n):
        index = (page-1)*n + j
        if index >= datasize:
            break
        try:
            corpus, score, left, right = slices[page-1][j].split('\t')
        except (ValueError, IndexError):
            # ValueError: the row does not split into exactly four fields.
            # IndexError: the stored chunk has fewer rows than the current
            # chunk size (or the page has no chunk).
            continue
        parts.append('<tr>\n')
        parts.append(f'<td>{corpus}</td><td colspan=2>{score}</td>\n')
        parts.append('</tr>\n')
        parts.append('<tr>\n')
        parts.append(f'<td>{index+1}</td><td width="45%" valign="top">{left}</td><td width="50%" valign="top">{right}</td>')
        parts.append('</tr>\n')
    parts.append('</table>')
    return ''.join(parts)
63
+
64
+
65
+
66
# Application title. An earlier value, '國教院華英雙語索引典系統2.0β版',
# was assigned first and immediately overwritten by this one.
appTitle = '華英雙語索引典系統2.0β版'

# Corpus codes offered in the corpus selector; a code's position is the
# corpus index passed to the search functions.
sources = (
    'TWP', 'Patten', 'UNPC', 'FIN', 'QING', 'TWL', 'NTURegs', 'FTV', 'SAT', 'CIA',
    'NEJM', 'VOA', 'NYT', 'BBC', 'Quixote', 'Wiki', 'TEST',
)

# NOTE(review): 16 labels for 17 sources; there appears to be no label at
# the position of 'FIN', so the two tuples cannot be zipped as they stand.
corpus_labels = (
    '光華雜誌', '彭定康', '聯合國平行語料庫', '清史', '台灣法律(全國法規資料庫)',
    '臺大法規', '民視英語新聞', '科學人', '美國華人史', '新英格蘭醫學期刊',
    '美國之音', '紐約時報中文網', 'BBC', '唐吉柯德', '維基百科', '測試',
)

# One "<code>.xz" file name per source.
files = [f'{code}.xz' for code in sources]
73
+
74
+
75
# Entries passed as the page's menu_items.
_menu_items = {
    'Get Help': 'https://streamlit.io/',
    'Report a bug': 'https://github.com',
    'About': f'**{appTitle}**\nCopyright (c) Ruben G. Tsui',
}

st.set_page_config(
    page_title=appTitle,
    #page_icon=icon,
    layout='wide',
    initial_sidebar_state='auto',
    menu_items=_menu_items,
)
86
+
87
# Custom CSS injected into the page. This is a plain (non-f) string, so the
# braces must be single: the doubled "{{ }}" left over from an f-string
# version produced blocks the browser discards as invalid.
# Content lines are deliberately not indented so Markdown keeps the whole
# <style> element as raw HTML.
page_style = '''
<style>
.css-o18uir.e16nr0p33 {
margin-top: -125px;
}
.reportview-container .css-1lcbmhc .css-1outpf7 {
padding-top: -125px;
}
.reportview-container .main .block-container {
padding-top: 0rem;
padding-right: 0rem;
padding-left: 0rem;
padding-bottom: 0rem;
}
.europe {
font-family: Consolas, Menlo, Courier New, Arial;
font-size: 14px;
}
.chinese {
/* font-family: Xingkai TC; */
font-family: Microsoft Jhenghei, Source Han Sans, Hiragino Sans CNS, LantingHei TC, Source Han Serif;
font-size: 36px;
font-weight: lighter;
}
</style>
'''
st.markdown(page_style, unsafe_allow_html=True)
113
+
114
# Sidebar: query box, corpus selection and search options.
st.sidebar.subheader(appTitle)

with st.sidebar:

    #query = st.sidebar.text_input('輸入搜尋字串').strip()
    query = st.sidebar.text_area('輸入搜尋字串').strip()
    multicorpora = st.multiselect('選擇語料庫(可複選)', sources, ['TWP', 'FTV'])

    # Row 1: search button | regex on/off
    colc, cold = st.sidebar.columns([1, 1])
    submit_button = colc.button('搜尋')
    regex_search = cold.radio("Regex search", ["No", "Yes"], horizontal=True)

    # Row 2: maximum number of matches | stats only on/off
    cola, colb = st.sidebar.columns([1, 1])
    size = cola.selectbox('筆數上限', [10,20,50,100,200,500,5000], index=2)
    stats_only = colb.radio("Stats only", ["No", "Yes"], horizontal=True)

    # Rows per page.
    # NOTE(review): assumed to belong inside the sidebar block — confirm.
    st.session_state.chunksize = st.slider("每頁筆數", 1, 20, 20)
143
+
144
# Build user interface: four equally wide pager buttons.
col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
first_button = col1.button('First')
prev_button = col2.button('Prev')
next_button = col3.button('Next')
last_button = col4.button('Last')

# Navigation: move the current page, staying within [minpage, maxpage]
# for the relative moves.
if next_button and st.session_state.page < st.session_state.maxpage:
    st.session_state.page += 1

if prev_button and st.session_state.page > st.session_state.minpage:
    st.session_state.page -= 1

if first_button:
    st.session_state.page = st.session_state.minpage

if last_button:
    st.session_state.page = st.session_state.maxpage

page = st.session_state.page

n = st.session_state.chunksize  # chunk size (no. of rows per chunk)
174
+
175
# Logic to query the corpora when the search button is pressed
divider = '-'*80
if submit_button:

    # reset certain parameters
    st.session_state.queryresults = None
    st.session_state.page = 1   # starts at 1, not 0
    st.session_state.maxpage = 0
    st.session_state.minpage = 1
    st.session_state.datasize = 0
    #st.session_state.chunksize = 3

    selectedCorpora = multicorpora
    use_regex = (regex_search == 'Yes')
    stats_flag = (stats_only == "Yes")

    all_results = []     # results from all corpora selected
    all_summaries = []   # one summary per corpus (phrase search only)
    for c in selectedCorpora:
        selectedCorpus = sources.index(c)
        if use_regex:
            results = rs(query, c=selectedCorpus, max_matches=size, stats_only=stats_flag, literal=False)
            st.success(f'No. of matches found in [{c}]: {len(results)}')
        else:
            outcome = search(query, c=selectedCorpus, max_matches=size, stats_only=stats_flag)
            # The search may return None instead of a (results, summary)
            # pair, e.g. for an empty query; treat that as "no matches".
            results, summary = outcome if outcome is not None else ([], '')
            st.markdown(f'Corpus [{c}]: {len(results)} matches')
            all_summaries.append(summary)
        all_results.extend(results)

    # Cut the combined results into pages of n rows each.
    datasize = len(all_results)
    slices = [all_results[i:i+n] for i in range(0, datasize, n)]
    pagesize = len(slices)   # total no. of pages available

    st.session_state.datasize = datasize
    st.session_state.maxpage = pagesize
    st.session_state.queryresults = slices
229
+
230
+
231
+ #st.write(st.session_state)
232
+
233
# Show the page indicator and the current page of results once a query
# has stored results in the session.
if st.session_state.queryresults is not None:
    st.markdown(f"page {st.session_state.page} of {st.session_state.maxpage}")
    table = buildTable(st.session_state.page)
    st.markdown(table, unsafe_allow_html=True)
241
+ #st.markdown(all_summaries, unsafe_allow_html=True)
242
+ #table = buildTable(st.session_state.page)
243
+ #st.markdown(table, unsafe_allow_html=True)
244
+
245
+ #st.write("That's all, folks!")
246
+ #st.write(table)
247
+