Upload 3 files
- src/NTURegswa.parquet +3 -0
- src/biconWebStmParquetWa.py +462 -0
- src/stwebm_parquet_wa.py +247 -0
src/NTURegswa.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a69e6661df8d4df83e0d7525b7d425091f33fc95e6e6f5a0b06ede7b9ab686fe
size 2103648
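
Only the LFS pointer above is committed; the parquet itself lives in Git LFS. A minimal sketch for inspecting it (assuming the real file has been fetched with "git lfs pull"; the column names are an assumption inferred from regex_search() in biconWebStmParquetWa.py below, which reads files named {corpus_code}wa.parquet):

import polars as pl

df = pl.read_parquet("src/NTURegswa.parquet")
print(df.columns)  # expected: ['en', 'zh', 'word_alignments'] (assumption, from regex_search())
print(df.height)   # number of aligned sentence pairs
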
src/biconWebStmParquetWa.py
ADDED
@@ -0,0 +1,462 @@
#!/home/rubentsui/anaconda3/bin/python
# coding: utf-8

import regex as re
import sys
#import time, datetime
import gzip, bz2, lzma
from collections import Counter
import itertools
from lemminflect import getAllInflections #, getAllLemmas
from opencc import OpenCC
import polars as pl
import struct

openCC = OpenCC('t2s')  # Traditional-to-Simplified Chinese converter

def file_open(filepath):
    # Open a text file for reading, choosing the decompressor by file extension
    if filepath.endswith('.gz'):
        return gzip.open(filepath, 'rt', encoding='utf8')
    elif filepath.endswith('.bz2'):
        return bz2.open(filepath, 'rt', encoding='utf8')
    elif filepath.endswith('.xz'):
        return lzma.open(filepath, 'rt', encoding='utf8')
    else:
        return open(filepath, 'r', encoding='utf8')


class color:
    # ANSI escape codes (terminal use) mixed with HTML tags (web front end);
    # the highlighting functions below use only the HTML values
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '<font color="blue">'
    GREEN = '<font color="#36f307">'
    YELLOW = '\033[93m'
    RED = '<font color="red">'
    BOLD = '<b>'
    UNDERLINE = '\033[4m'
    END = '</b></font>'
    CURSOR_UP = '<font color="blue"><b>' #+ "\033[F" #'\033[1;1H'


def flatten(l):  # flatten a nested list
    def flatten0(l):
        for i in l:
            if isinstance(i, list):
                yield from flatten0(i)
            else:
                yield i
    return list(flatten0(l))

def getInflections(s):
    '''
    Get all inflections of the lemma s: Verb, Noun or Adjective.
    (lemminflect only inflects English; for other scripts this returns [s].)
    '''
    infl = getAllInflections(s)
    phr = []
    for t in infl.values():
        phr.extend(list(t))
    if not phr:
        return [s]
    else:
        return list(set(phr))


def mergeDicts(D1, D2):
    '''
    Merge two count dicts, summing values for shared keys.
    Input example:
        D1 = {'a': 2, 'b': 3, 'c': 1}
        D2 = {'b': 5, 'c': 0, 'd': 7}
    Output:
        D = {'a': 2, 'b': 3+5, 'c': 1+0, 'd': 7}
    '''
    return dict(Counter(D1) + Counter(D2))


def sortTuples(L):
    '''
    L: list of 2-tuples in the form [(1,2), (1,3), (4,3), (3,10), (4,5), (9,2)]
    Sort by 1st number in tuple, then by 2nd number, both in ascending order.
    '''
    return sorted(L)  # was a stub returning None; tuples already sort in exactly this order

def buildLexicon(matched_tokens, alignment, e, z):
    '''
    Collect, for each matched source token, the target tokens aligned to it:
    alignment (1, 3), (1, 4), (1, 5) becomes {e[1]: {' '.join([z[3], z[4], z[5]]): 1}}
    '''
    alignment = sorted(alignment)
    L = dict()
    for (i, j) in alignment:
        if e[i] in matched_tokens:
            s = e[i]; t = z[j]
            if s not in L:
                L[s] = [t]
            else:
                L[s].append(t)
    for k in L:
        v = ' '.join(L[k])
        L[k] = {v: 1}

    return L


def sentAlignHighlight(s, alignment, e, z):
    # s = list of matched source tokens
    # alignment = list of (i, j) pairs, each mapping e[i] to z[j]
    # e = source-side tokens; z = target-side tokens
    sep = ' '
    e_marked = list(e)
    z_marked = list(z)
    for (i, j) in alignment:
        if e[i] in s:
            e_marked[i] = f'{color.RED}{color.BOLD}{e[i]}{color.END}'
            z_marked[j] = f'{color.GREEN}{color.BOLD}{z[j]}{color.END}'
            #e_marked[i] = f'{color.CYAN}{color.BOLD}{e[i]}{color.END}'
            #z_marked[j] = f'{color.YELLOW}{color.BOLD}{z[j]}{color.END}'

    return sep.join(e_marked), sep.join(z_marked)


def bindata2tuplelist(binary_data):
    '''
    Unpack binary word-alignment data (as stored in the *wa.parquet files)
    into a list of (i, j) index tuples.
    '''
    unpacked_list = []
    num_tuples = len(binary_data) // 4  # each tuple is 4 bytes: two big-endian uint16 ('>HH')
    for i in range(num_tuples):
        offset = i * 4
        tup = struct.unpack('>HH', binary_data[offset:offset+4])
        unpacked_list.append(tup)

    return unpacked_list

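
# The packing direction is not part of this upload; this is a minimal sketch of
# the presumed inverse (hypothetical helper, mirroring the '>HH' layout above):
def tuplelist2bindata(tuple_list):
    '''
    Pack a list of (i, j) index tuples into bytes, two big-endian unsigned
    16-bit ints per tuple, so that bindata2tuplelist(tuplelist2bindata(T)) == T.
    '''
    return b''.join(struct.pack('>HH', i, j) for (i, j) in tuple_list)
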
corpora = """
[0] TWP
[1] Patten
[2] UNPC
[3] FIN
[4] QING
[5] TWL
[6] NTURegs
[7] FTV
[8] SAT
[9] CIA
[10] NEJM
[11] VOA
[12] NYT
[13] BBC
[14] Quixote
[15] Wiki
[16] TEST
""".strip().split("\n")

C = {k: c.split()[-1]+".xz" for k, c in enumerate(corpora)}   # corpus index -> .xz corpus file
C2 = {k: c.split()[-1] for k, c in enumerate(corpora)}        # corpus index -> corpus code


def sz(s, c=0, max_matches=50, stats_only=False):
    '''
    s: Chinese search phrase
    '''

    corpus = C[c]
    corpus_code = C2[c]
    html_text = []
    Lexicon = dict()

    # build the search-phrase (sp) regexps, one per whitespace-separated token
    sp = s.split()
    regex_phr = []
    for s0 in sp:
        inflections = getInflections(s0)
        regex_phr.append(re.compile(fr"\b({'|'.join(inflections)})\b"))

    if not regex_phr:
        html_text.append(f"Sorry, zero matches found for search phrase [{s}].\n")
        return html_text, ''  # was `return None`, which callers cannot unpack

    cnt = 0
    raw_cnt = 0
    with file_open(corpus) as fi:
        for line in fi:
            raw_cnt += 1
            if line.strip().count('\t') < 2:  # expect score<TAB>en<TAB>zh
                continue
            score, en, zh = line.strip().split('\t', maxsplit=2)
            en = en.replace('``', '‘‘').replace("''", '’’')  # normalize PTB-style quote tokens
            e = en.split()
            z = zh.split()
            en_marked, zh_marked = None, None

            MATCHED_ALL = True
            matches_list = []
            for r in regex_phr:
                matches = r.findall(zh)
                if matches:
                    matches_list.extend(matches)
                MATCHED_ALL &= (len(matches) > 0)  # every token of the phrase must match
            matches_list = list(set(matches_list))
            if MATCHED_ALL:
                cnt += 1
                # NOTE: align_word() is not defined in this file; it must be supplied by an
                # external word aligner (cf. the commented-out myaligner calls below)
                #alignments = myaligner.get_word_aligns(en, zh)
                #a = alignments[align_method]
                a = align_word(zh, en)
                zh_marked, en_marked = sentAlignHighlight(flatten(matches_list), a, z, e)
                if stats_only:
                    pass
                else:
                    lineOut = f'[{corpus_code}]\t{score}\t<p class="chinese">{zh_marked}</p>\t<p class="europe">{en_marked}</p>'
                    html_text.append(lineOut)
                L = buildLexicon(flatten(matches_list), a, z, e)
                for k in L:
                    if k in Lexicon:
                        Lexicon[k] = mergeDicts(Lexicon[k], L[k])
                    else:
                        Lexicon[k] = L[k]

            if cnt >= max_matches: break

    summary = f"No. of matches: {cnt}\n<br>\n"
    for k1 in Lexicon:
        v1 = Lexicon[k1]
        pairs = [(k2, v1[k2]) for k2 in sorted(v1, key=v1.get, reverse=True)]
        for k2, v2 in pairs:
            if k2:
                summary += f"{v2}\t{k2}\n<br>\n"

    return html_text, summary


def se(s, c=0, max_matches=50, stats_only=False):
    '''
    s: English search phrase
    '''
    corpus = C[c]
    corpus_code = C2[c]
    html_text = []
    Lexicon = dict()

    # build the search-phrase (sp) regexps
    sp = s.split()
    regex_phr = None
    if len(sp) == 1:  # single-word search phrase
        inflections = getInflections(s)
        num = len(inflections)
        if num > 0:
            regex_phr = [re.compile(fr"\b({'|'.join(inflections)})\b", flags=re.IGNORECASE)]
    else:  # multi-word search phrase
        regex_phr = []
        for s0 in sp:
            inflections = getInflections(s0)
            regex_phr.append(re.compile(fr"\b({'|'.join(inflections)})\b", flags=re.IGNORECASE))

    if not regex_phr:
        html_text.append(f"Sorry, zero matches found for search phrase [{s}].\n<br>\n")
        return html_text, ''  # was `return None`, which callers cannot unpack

    cnt = 0
    raw_cnt = 0
    with file_open(corpus) as fi:
        for line in fi:
            raw_cnt += 1
            if line.strip().count('\t') < 2:
                continue
            score, en, zh = line.strip().split('\t', maxsplit=2)
            en = en.replace('``', '‘‘').replace("''", '’’')
            e = en.split()
            z = zh.split()
            en_marked, zh_marked = None, None

            MATCHED_ALL = True
            matches_list = []
            for r in regex_phr:
                matches = r.findall(en)
                matches_list.append(matches)
                MATCHED_ALL &= (len(matches) > 0)
            if MATCHED_ALL:
                cnt += 1
                a = align_word(en, zh)  # external helper; see the NOTE in sz()
                en_marked, zh_marked = sentAlignHighlight(flatten(matches_list), a, e, z)
                if stats_only:
                    pass
                else:
                    lineOut = f'[{corpus_code}]\t{score}\t<p class="europe">{en_marked}</p>\t<p class="chinese">{zh_marked}</p>'
                    html_text.append(lineOut)
                L = buildLexicon(flatten(matches_list), a, e, z)
                for k in L:
                    if k in Lexicon:
                        Lexicon[k] = mergeDicts(Lexicon[k], L[k])
                    else:
                        Lexicon[k] = L[k]

            if cnt >= max_matches: break

    summary = f"No. of matches: {cnt}\n<br>\n"
    for k1 in Lexicon:
        v1 = Lexicon[k1]
        pairs = [(k2, v1[k2]) for k2 in sorted(v1, key=v1.get, reverse=True)]
        for k2, v2 in pairs:
            if k2:
                summary += f"{v2}\t{k2}\n<br>\n"

    return html_text, summary


regex_zh = re.compile(r"[一-龥]")  # rough test for Chinese (CJK) characters

def s(ss, c=0, max_matches=100, stats_only=False):
    # Dispatch to the Chinese (sz) or English (se) search according to the query script
    if regex_zh.findall(ss):  # Chinese characters found
        actual_search_function = sz
        if c in [99]:  # ROCLaws has en, zh reversed
            actual_search_function = se
        print(f"Search by Chinese: actual search function = [{actual_search_function}]")
    else:  # non-Chinese
        actual_search_function = se
        if c in [99]:  # ROCLaws has en, zh reversed
            actual_search_function = sz

    return actual_search_function(ss, c=c, max_matches=max_matches, stats_only=stats_only)


def tokenIndices(ss, i, j):
    '''
    Given: character indices (i, j) into the string ss (tokens separated by single spaces),
    Return: the indices into ss.split() that correspond to the substring ss[i:j]
    '''
    L = ss.split()
    part1 = ss[:i].split()
    part3 = ss[j:].split()
    return len(part1), len(L) - len(part3)  # these are the list indices

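
# Worked example for tokenIndices() (illustrative only):
#   ss = "the cat sat", so ss[4:7] == "cat"
#   ss[:4].split() -> ['the']   (1 token precedes the span)
#   ss[7:].split() -> ['sat']   (1 token follows the span)
#   => tokenIndices(ss, 4, 7) == (1, 2), and ss.split()[1:2] == ['cat']
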
def regex_search(ss, c=0, max_matches=100, stats_only=False, literal=False):
    # Regex search over the pre-aligned parquet corpora ({corpus_code}wa.parquet),
    # whose word alignments were precomputed and stored as packed binary
    zhSearch = False
    if regex_zh.findall(ss):  # Chinese characters found
        zhSearch = True

    corpus = C[c]
    corpus_code = C2[c]
    results = []

    df = pl.read_parquet(f"{corpus_code}wa.parquet")
    if zhSearch:
        column = 'zh'
    else:
        column = 'en'
    res = df.filter(
        # NOTE: with literal=True the wrapping parentheses would be matched literally
        pl.col(column).str.contains(fr"({ss})", literal=literal)
    )
    query_results = res.to_dict(as_series=False)
    length = len(query_results['en'])
    p = re.compile(fr"({ss})")
    cnt = 0
    for i in range(length):
        cnt += 1
        en = query_results['en'][i]
        zh = query_results['zh'][i]
        enList = en.split()
        zhList = zh.split()
        # alignment pairs are stored as (en_idx, zh_idx)
        was = bindata2tuplelist(query_results['word_alignments'][i])
        if zhSearch:
            a = was
            for m in p.finditer(zh):
                ii = m.start()
                jj = m.end()
                k, q = tokenIndices(zh, ii, jj)
                for idx in range(k, q):
                    zhList[idx] = f"{color.RED}{color.BOLD}{zhList[idx]}{color.END}"

                    idxT = [e for (e, z) in a if z == idx]  # aligned English indices
                    for iT in idxT:
                        enList[iT] = f"{color.GREEN}{color.BOLD}{enList[iT]}{color.END}"

            zhSub = ' '.join(zhList)
            enSub = ' '.join(enList)
            score = '_score_'
            lineOut = f'[{corpus_code}]\t{score}\t<p class="chinese">{zhSub}</p>\t<p class="europe">{enSub}</p>'
            results.append(lineOut)
        else:
            a = was

            for m in p.finditer(en):
                ii = m.start()
                jj = m.end()
                k, q = tokenIndices(en, ii, jj)
                for idx in range(k, q):
                    enList[idx] = f"{color.RED}{color.BOLD}{enList[idx]}{color.END}"
                    idxT = [z for (e, z) in a if e == idx]  # aligned Chinese indices
                    for iT in idxT:
                        zhList[iT] = f"{color.GREEN}{color.BOLD}{zhList[iT]}{color.END}"

            zhSub = ' '.join(zhList)
            enSub = ' '.join(enList)

            score = 1
            lineOut = f'[{corpus_code}]\t{score}\t<p class="europe">{enSub}</p>\t<p class="chinese">{zhSub}</p>'
            results.append(lineOut)

        if cnt >= max_matches: break  # was `>`; `>=` matches sz()/se() above

    return results

#'''
#EXAMPLES OF REGEX SEARCH
#(take[sn]|taking|took) .{1,20} for granted
#'''

if __name__ == '__main__':

    print('\n\n'+'='*100)
    print("""Usage:
    Chinese search phrase
        s('打擊 犯罪', c=0)
    English search phrase
        s('preemptive strike', c=0, max_matches=200)
    Type C (capital "C") followed by the <Enter> key to see a list of corpora available.
    """)
    for c in C:
        print(f"c={c}: {C[c][:-3]}")

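A quick interactive sketch of the module's two entry points (illustrative, not part of the upload): regex_search() works standalone against the parquet files, while s() additionally needs the external align_word() helper noted in sz().

from biconWebStmParquetWa import s, regex_search

# Raw regex search over a pre-aligned parquet corpus, using the module's own example pattern
rows = regex_search(r'(take[sn]|taking|took) .{1,20} for granted', c=6)  # c=6 -> NTURegs, included in this upload

# Inflection-aware phrase search (requires align_word() to be provided)
hits, summary = s('preemptive strike', c=0, max_matches=200)
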
src/stwebm_parquet_wa.py
ADDED
@@ -0,0 +1,247 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 11 19:51:02 2022

@author: ruben
"""

import streamlit as st
#from streamlit.components.v1 import html
#import streamlit.components.v1 as components
import pandas as pd
#import numpy as np
#from datetime import datetime

from biconWebStmParquetWa import s as search, regex_search as rs

# Session variables
if 'queryresults' not in st.session_state:
    st.session_state.queryresults = None

if 'page' not in st.session_state:
    st.session_state.page = 1  # starts at 1, not 0

if 'maxpage' not in st.session_state:
    st.session_state.maxpage = 0

if 'minpage' not in st.session_state:
    st.session_state.minpage = 1

if 'datasize' not in st.session_state:
    st.session_state.datasize = 0

if 'chunksize' not in st.session_state:
    st.session_state.chunksize = 3


#if 'table' not in st.session_state:
#    st.session_state.table = ''
table = 'Empty Table'


def buildTable(page):
    # Render one page of query results as an HTML table:
    # each hit takes two rows (corpus/score, then the sentence pair)
    slices = st.session_state.queryresults
    datasize = st.session_state.datasize
    table = '<table width="100%">\n'
    n = st.session_state.chunksize
    for j in range(n):
        index = (page-1)*n + j
        if index >= datasize: break
        try:
            corpus, score, en, zh = slices[page-1][j].split('\t')
        except (ValueError, IndexError):  # malformed or missing row
            continue
        table += '<tr>\n'
        table += f'<td>{corpus}</td><td colspan=2>{score}</td>\n'
        table += '</tr>\n'
        table += '<tr>\n'
        table += f'<td>{index+1}</td><td width="45%" valign="top">{en}</td><td width="50%" valign="top">{zh}</td>'
        table += '</tr>\n'
    table += '</table>'
    return table


#appTitle = '國教院華英雙語索引典系統2.0β版'  # longer variant of the title, immediately overridden
appTitle = '華英雙語索引典系統2.0β版'  # "Chinese-English Bilingual Concordancer, v2.0β"
sources = ('TWP', 'Patten', 'UNPC', 'FIN', 'QING', 'TWL', 'NTURegs', 'FTV', 'SAT', 'CIA',
           'NEJM', 'VOA', 'NYT', 'BBC', 'Quixote', 'Wiki', 'TEST')
# Chinese display labels; NOTE: 16 labels vs. 17 sources, so the mapping drifts
# after UNPC (FIN's label appears to be missing); currently unused below
corpus_labels = ('光華雜誌', '彭定康', '聯合國平行語料庫', '清史', '台灣法律(全國法規資料庫)', '臺大法規', '民視英語新聞', '科學人', '美國華人史', '新英格蘭醫學期刊', '美國之音', '紐約時報中文網', 'BBC', '唐吉柯德', '維基百科', '測試')

files = [c + '.xz' for c in sources]


st.set_page_config(
    page_title=appTitle,
    #page_icon=icon,
    layout='wide',
    initial_sidebar_state='auto',
    menu_items={
        'Get Help': 'https://streamlit.io/',
        'Report a bug': 'https://github.com',
        'About': f'**{appTitle}**\nCopyright (c) Ruben G. Tsui'
    }
)

page_style = '''
<style>
.css-o18uir.e16nr0p33 {
    margin-top: -125px;
}
.reportview-container .css-1lcbmhc .css-1outpf7 {
    padding-top: -125px;
}
.reportview-container .main .block-container {
    padding-top: 0rem;
    padding-right: 0rem;
    padding-left: 0rem;
    padding-bottom: 0rem;
}
.europe {
    font-family: Consolas, Menlo, Courier New, Arial;
    font-size: 14px;
}
.chinese {
    /* font-family: Xingkai TC; */
    font-family: Microsoft Jhenghei, Source Han Sans, Hiragino Sans CNS, LantingHei TC, Source Han Serif;
    font-size: 36px;
    font-weight: lighter;
}
</style>
'''
st.markdown(page_style, unsafe_allow_html=True)

# Sidebar
st.sidebar.subheader(appTitle)

with st.sidebar:

    #query = st.sidebar.text_input('輸入搜尋字串').strip()
    query = st.sidebar.text_area('輸入搜尋字串').strip()  # "Enter search string"
    multicorpora = st.multiselect('選擇語料庫(可複選)', sources, ['TWP', 'FTV'])  # "Choose corpora (multiple allowed)"

    colc, cold = st.sidebar.columns([1, 1])
    with colc:
        submit_button = st.button('搜尋')  # "Search"
    with cold:
        regex_search = st.radio(
            "Regex search",
            ["No", "Yes"], horizontal=True
        )

    cola, colb = st.sidebar.columns([1, 1])
    with cola:
        size = st.selectbox('筆數上限', [10, 20, 50, 100, 200, 500, 5000], index=2)  # "Max. no. of hits"
    with colb:
        stats_only = st.radio(
            "Stats only",
            ["No", "Yes"], horizontal=True
        )

    st.session_state.chunksize = st.slider("每頁筆數", 1, 20, 20)  # "Hits per page"

# Build user interface
col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
with col1:
    first_button = st.button('First')
with col2:
    prev_button = st.button('Prev')
with col3:
    next_button = st.button('Next')
with col4:
    last_button = st.button('Last')

# Navigation
if next_button:
    if st.session_state.page < st.session_state.maxpage:
        st.session_state.page += 1

if prev_button:
    if st.session_state.page > st.session_state.minpage:
        st.session_state.page -= 1

if first_button:
    st.session_state.page = st.session_state.minpage

if last_button:
    st.session_state.page = st.session_state.maxpage

page = st.session_state.page


n = st.session_state.chunksize  # chunk size (no. of rows per page)

# Query the corpora when the search button is pressed
divider = '-'*80
if submit_button:

    # reset certain parameters
    st.session_state.queryresults = None
    st.session_state.page = 1  # starts at 1, not 0
    st.session_state.maxpage = 0
    st.session_state.minpage = 1
    st.session_state.datasize = 0
    #st.session_state.chunksize = 3

    #selectedCorpus = corpora.index(True)
    selectedCorpora = multicorpora

    if regex_search == 'Yes':

        all_results = []  # results from all corpora selected
        for c in selectedCorpora:
            selectedCorpus = sources.index(c)
            results = rs(query, c=selectedCorpus, max_matches=size, stats_only=(stats_only == "Yes"), literal=False)
            st.success(f'No. of matches found in [{c}]: {len(results)}')
            all_results.extend(results)

        datasize = len(all_results)
        slices = [all_results[i:i+n] for i in range(0, datasize, n)]
        pagesize = len(slices)  # total no. of pages available
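        # Worked example of the paging (illustrative): with n == 3 and 8 results,
        # slices holds pages of 3, 3 and 2 rows, pagesize == 3, and
        # st.session_state.page runs 1..3 (1-based, as initialized above)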

        st.session_state.datasize = datasize
        st.session_state.maxpage = pagesize
        st.session_state.queryresults = slices

    else:

        all_results = []  # results from all corpora selected
        all_summaries = []
        for c in selectedCorpora:
            selectedCorpus = sources.index(c)
            results, summary = search(query, c=selectedCorpus, max_matches=size, stats_only=(stats_only == "Yes"))
            st.markdown(f'Corpus [{c}]: {len(results)} matches')
            all_results.extend(results)
            all_summaries.append(summary)

        datasize = len(all_results)
        slices = [all_results[i:i+n] for i in range(0, datasize, n)]
        pagesize = len(slices)  # total no. of pages available

        st.session_state.datasize = datasize
        st.session_state.maxpage = pagesize
        st.session_state.queryresults = slices


#st.write(st.session_state)

if st.session_state.queryresults is not None:  # was `!= None`
    st.markdown(f"page {st.session_state.page} of {st.session_state.maxpage}")
    table = buildTable(st.session_state.page)
    st.markdown(table, unsafe_allow_html=True)
    #st.markdown(all_summaries, unsafe_allow_html=True)

#st.write("That's all, folks!")
#st.write(table)