Spaces:
Sleeping
Sleeping
| import sys, os | |
| import regex as re | |
| from datetime import datetime, timedelta | |
| from pathlib import Path, PurePath | |
| from math import ceil | |
| from random import seed as seed | |
| import numpy as np | |
| #import sqlite3 | |
| #from tqdm import tqdm | |
| import torch | |
| from sentence_splitter import SentenceSplitter, split_text_into_sentences | |
| #from nltk import word_tokenize | |
| import unicodedata | |
| import pysbd | |
| #import opencc | |
| import pandas as pd | |
| import openpyxl | |
| from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font | |
| from openpyxl.utils.dataframe import dataframe_to_rows | |
| from dp_utils import make_alignment_types, read_alignments, \ | |
| read_in_embeddings, make_doc_embedding, vecalign, yield_overlaps | |
| from score import score_multiple, log_final_scores | |
| #from sentence_transformers import SentenceTransformer, models, util | |
| #s2tw = opencc.OpenCC('s2tw.json') | |
| # Support for llama-cpp-python | |
| from llama_cpp import Llama | |
| #%% | |
# Reference point for reporting how long the model takes to load.
start_time = datetime.now()

# Device selection: index 2 picks 'cpu' (cpu only).
dev = ['cuda', 'mps', 'cpu'][2]
# -1 is handed to llama.cpp for the GPU back-ends, 0 when running on CPU.
n_gpu_layers = -1 if dev in ['cuda', 'mps'] else 0

# `m` indexes the three parallel tables below (display name, GGUF path, short tag).
m = 2
model_name = [
    'Alibaba-NLP/gte-multilingual-base',
    'ibm-granite/granite-embedding-278m-multilingual',
    'LaBSEq80',
    'LaBSEfp16',
    'google/embeddinggemma-300m',
    'paraphrase-multilingual-MiniLM-L12-v2',
][m]
model_path = ['', '', 'src/labse.Q8_0.gguf', '', '', ''][m]
model_name_short = [
    'alibaba-gte-multilingual',
    'ibm-granite',
    'LaBSE-gguf-q80',
    'LaBSE-gguf-fp16',
    'embeddinggemma-300m',
    'paraphrase',
][m]
#%%
print(f"Now running bitext mining with transformer model [{model_path}] on device [{dev}]...", flush=True)
# Embedding-mode llama.cpp model shared by the rest of the module.
# Options left disabled: seed=1337, n_ctx=2048, pooling_type=1.
llm = Llama(
    model_path=model_path,
    embedding=True,
    n_gpu_layers=n_gpu_layers,
    verbose=False,
)
print(f"Finished loading model: {model_name}.", flush=True)
# Despite its name, `end_time` holds the elapsed timedelta since `start_time`.
end_time = datetime.now() - start_time
print(f"Model-loading time: {end_time.seconds} secs", flush=True)
| #%% | |
def encodeVectors(ss):
    """Embed every string in ``ss`` with the module-level ``llm``.

    Each string is passed on its own to ``llm.embed(..., normalize=True)`` and
    the result is wrapped with ``torch.as_tensor``.

    Returns:
        A list with one tensor per input string, in input order.
    """
    # Disabled alternatives:
    #return torch.as_tensor([llm.embed(s, normalize=True) for s in ss])
    #return torch.as_tensor(llm.embed(ss, normalize=True))
    vectors = []
    for sentence in ss:
        embedding = llm.embed(sentence, normalize=True)
        vectors.append(torch.as_tensor(embedding))
    return vectors
| #%% | |
def print_alignments(alignments, scores=None, file=sys.stdout):
    """Print one alignment per line to ``file``.

    Args:
        alignments: iterable of ``(x, y)`` pairs.
        scores: optional iterable paired with ``alignments``; when given, each
            line is ``x:y:score`` with the score printed to six decimals,
            otherwise each line is ``x:y``.
        file: writable text stream (defaults to standard output).
    """
    if scores is None:
        rendered = ('%s:%s' % (src, tgt) for src, tgt in alignments)
    else:
        rendered = (
            '%s:%s:%.6f' % (src, tgt, value)
            for (src, tgt), value in zip(alignments, scores)
        )
    for line in rendered:
        print(line, file=file)
def file_open(filepath):
    """Open a UTF-8 text file for reading, decompressing by file extension.

    Args:
        filepath: path (``str`` or path-like) of the file to open. Names ending
            in ``.gz``, ``.bz2`` or ``.xz`` are opened through the matching
            compression module; anything else is opened as plain text.

    Returns:
        A text-mode file object decoding UTF-8. The caller must close it,
        preferably with a ``with`` statement.
    """
    # Imported locally: the module's import section does not bring these
    # compression modules into scope, so the compressed branches would
    # otherwise fail with NameError.
    import bz2
    import gzip
    import lzma
    import os

    name = os.fspath(filepath)
    openers = (('.gz', gzip.open), ('.bz2', bz2.open), ('.xz', lzma.open))
    for suffix, opener in openers:
        if name.endswith(suffix):
            return opener(name, 'rt', encoding='utf8')
    return open(name, 'r', encoding='utf8')
def getLines(fin):
    '''
    Retrieve the non-empty lines of a file (or, later, an sqlite3 database).

    The file is opened with file_open(), so the compressed formats it handles
    are accepted. Every line is stripped of surrounding whitespace and lines
    that are empty after stripping are dropped.

    Returns a list of str.
    '''
    # The context manager closes the handle; the list is built before exit.
    with file_open(fin) as fh:
        stripped = (line.strip() for line in fh)
        return [line for line in stripped if line != '']
def getSentIndex(lines):
    """
    Build a dictionary look-up from sentence text to position:
    keys = stripped sentence (or overlapped sentences)
    value = index of that entry in ``lines``

    Raises Exception when two entries are identical after stripping.
    """
    index_of = {}
    for position, raw in enumerate(lines):
        key = raw.strip()
        if key in index_of:
            raise Exception('got multiple embeddings for the same line')
        index_of[key] = position
    return index_of
def getOverlaps(lines, num_overlaps):
    """Return the distinct strings produced by ``yield_overlaps``, sorted.

    Duplicates are removed with a set; the result is sorted so that the order
    is reproducible between runs.
    """
    unique = set(yield_overlaps(lines, num_overlaps))
    return sorted(unique)
def normalizeText(text):
    """Drop soft hyphens (U+00AD) from ``text`` and return its NFKC form."""
    without_soft_hyphen = text.replace("\xad", '')
    # NFKC folds compatibility characters (e.g. no-break space, full-width
    # forms) into their standard equivalents and recomposes the result.
    return unicodedata.normalize("NFKC", without_soft_hyphen)
# Sentence tokenizer
# Pattern for Chinese sentence boundaries. Group 1 captures one terminator
# followed by up to three closing quotes/brackets.
# Earlier variants, kept for reference:
#regex_zh_sent_delim = re.compile(r"([。!?;][」』”〕》〗】))\]]?)")
#regex_zh_sent_delim = re.compile(r"([。?;][」』”〕》〗】))\]]?)")
#regex_zh_sent_delim = re.compile(r'(?P<quotation_mark>([。?!…]{1,2})[」』〕》〗】\])”’"\')])')
#regex_zh_sent_delim = re.compile(r"[。!?]")
regex_zh_sent_delim = re.compile(
    r"("
    r"[。?!…]"                        # sentence terminator
    r"[」』”’\'\"〕》〗】))\]]{0,3}"   # optional closing marks (0 to 3)
    r")"
)
def normalizeTextZh(text):
    """Drop soft hyphens (U+00AD) from ``text`` and return its NFKD form."""
    without_soft_hyphen = text.replace("\xad", '')
    #text = text.replace("!", "!").replace(";", ";")
    # NFKD applies compatibility decomposition and leaves the result
    # decomposed (unlike the NFKC form used by normalizeText).
    return unicodedata.normalize("NFKD", without_soft_hyphen)
def sentencizeZh(s):
    '''
    Split the long string s into a list of sentences.

    The text is normalized with normalizeTextZh(), a newline is inserted after
    every match of regex_zh_sent_delim, and the pieces are returned stripped,
    with empty pieces dropped.
    '''
    text = normalizeTextZh(s)
    # NOTE(review): as written, every pair below replaces a character with
    # itself; presumably the originals mapped full-width punctuation to ASCII
    # -- confirm against the upstream file.
    for old, new in ((',', ','), (';', ';'), ("!", "!"), (":", ":"), ("?", "?")):
        text = text.replace(old, new)
    marked = regex_zh_sent_delim.sub(r"\1\n", text)
    pieces = (piece.strip() for piece in marked.split("\n"))
    return [piece for piece in pieces if piece != '']
# One SentenceSplitter per language code, created on first use so its language
# resources are not reloaded for every paragraph.
_SENTENCE_SPLITTERS = {}


def sentencize(s, lang='en'):
    """Split the text ``s`` into a list of sentences.

    Args:
        s: text to split (typically one paragraph).
        lang: language code. 'zh' and 'ja' are handled by sentencizeZh();
            any other code is passed to ``SentenceSplitter``.

    Returns:
        A list of stripped, non-empty sentence strings.
    """
    if lang in ['zh', 'ja']:
        return sentencizeZh(s)
    # lang in ['en', 'es', 'fr', 'de', 'it', etc. ]
    # No pysbd.Segmenter is built here any more: its result was never used,
    # and pysbd rejects language codes it does not know, which made otherwise
    # supported languages fail.
    splitter = _SENTENCE_SPLITTERS.get(lang)
    if splitter is None:
        splitter = SentenceSplitter(language=lang)
        _SENTENCE_SPLITTERS[lang] = splitter
    pieces = (piece.strip() for piece in splitter.split(text=normalizeText(s)))
    return [piece for piece in pieces if piece != '']
def convertChinesePunctuations(txt):
    '''
    Convert “”‘’ to, respectively, 「」『』
    '''
    # A single translation table maps each curly quote to its corner bracket.
    quote_table = str.maketrans('“”‘’', '「」『』')
    return txt.translate(quote_table)
def align(sS, sT, alignment_max_size=4, langS=None, langT=None):
    """Align source sentences ``sS`` with target sentences ``sT`` using vecalign.

    Args:
        sS: list of source sentences.
        sT: list of target sentences.
        alignment_max_size: passed to getOverlaps(), make_doc_embedding() and
            make_alignment_types(); also determines ``width_over2``.
        langS: source language code. When omitted, the module-level ``langS``
            (set by the script section) is used if it exists.
        langT: target language code, resolved the same way as ``langS``.

    Returns:
        A list of ``[score, idxS, source_text, idxT, target_text]`` rows, one
        per alignment. ``idxS``/``idxT`` are the sentence indices of the block
        and the texts are the joined sentences: no separator for 'zh'/'ja',
        a single space otherwise. An empty list is returned when vecalign
        reports no scores.
    """
    # make runs consistent
    seed(42)
    np.random.seed(42)

    # Fall back to the script-level globals so existing callers that do not
    # pass the languages keep working; without either, a space is used.
    if langS is None:
        langS = globals().get('langS')
    if langT is None:
        langT = globals().get('langT')
    sepS = '' if langS in ['zh', 'ja'] else ' '
    sepT = '' if langT in ['zh', 'ja'] else ' '

    # source
    overlapsS = getOverlaps(sS, alignment_max_size)  # create "overlapped" sentences
    s2idxS = getSentIndex(overlapsS)  # create "sentence-to-index" lookup table
    embedS = encodeVectors(overlapsS)  # one embedding per overlapped sentence
    src_line_embeddings = torch.vstack(embedS).cpu().numpy()  # stack into one array

    # target
    overlapsT = getOverlaps(sT, alignment_max_size)
    s2idxT = getSentIndex(overlapsT)
    embedT = encodeVectors(overlapsT)
    tgt_line_embeddings = torch.vstack(embedT).cpu().numpy()

    width_over2 = ceil(alignment_max_size / 2.0) + 5
    vecs0 = make_doc_embedding(s2idxS, src_line_embeddings, sS, alignment_max_size)
    vecs1 = make_doc_embedding(s2idxT, tgt_line_embeddings, sT, alignment_max_size)
    final_alignment_types = make_alignment_types(alignment_max_size)
    stack = vecalign(vecs0=vecs0,
                     vecs1=vecs1,
                     final_alignment_types=final_alignment_types,
                     del_percentile_frac=0.2,
                     width_over2=width_over2,
                     max_size_full_dp=300,
                     costs_sample_size=20000,
                     num_samps_for_norm=100)

    alignments = stack[0]['final_alignments']
    scores = stack[0]['alignment_scores']
    if scores is None:
        return []

    aligned_sentences = []
    for (idxS, idxT), score in zip(alignments, scores):
        blockS = sepS.join(sS[i] for i in idxS)  # sentence block - source
        blockT = sepT.join(sT[i] for i in idxT)  # sentence block - target
        aligned_sentences.append([score, idxS, blockS, idxT, blockT])
    return aligned_sentences
| #%% | |
def createExcel(fin, langS=None, langT=None):
    """Convert a tab-separated alignment file into an .xlsx workbook.

    Args:
        fin: path of the plain-text aligned file; each line holds five
            tab-separated fields.
        langS: column title for the source text. When omitted, the
            module-level ``langS`` is used if it exists, else 'source'.
        langT: column title for the target text, resolved the same way
            (fallback 'target').

    Returns:
        The path of the workbook, written next to ``fin`` with the same stem
        and an ``.xlsx`` suffix.
    """
    if langS is None:
        langS = globals().get('langS', 'source')
    if langT is None:
        langT = globals().get('langT', 'target')

    # Create a new workbook and select the active sheet
    wb = openpyxl.Workbook()
    ws = wb.active
    # Set column widths (column A receives the DataFrame index)
    widths = (('A', 10), ('B', 10), ('C', 10), ('D', 50), ('E', 10), ('F', 65))
    for column, width in widths:
        ws.column_dimensions[column].width = width

    # Read with a context manager so the handle is closed, and drop the line
    # terminator so it does not end up inside the last column's cells.
    with open(fin, 'r', encoding='utf-8') as fh:
        data = [line.rstrip('\n') for line in fh]
    df = pd.DataFrame([x.split('\t') for x in data],
                      columns=['cosdist', 'cols_s', langS, 'cols_t', langT])
    for r in dataframe_to_rows(df, index=True, header=True):
        ws.append(r)

    # Set cell alignment
    alignment = Alignment(horizontal='general',
                          vertical='top',
                          wrap_text=True)
    cnt = len(data)
    for row in ws[f'A1:F{cnt+10}']:
        for cell in row:
            cell.alignment = alignment

    # Save the workbook
    base = Path(fin).stem
    fon_xlsx = Path(fin).parent / f'{base}.xlsx'
    wb.save(fon_xlsx)
    return fon_xlsx
| #%% | |
if __name__ == '__main__':
    # Script entry point: reads <base>.<langS>.txt and <base>.<langT>.txt,
    # splits both into sentences, aligns them chunk by chunk and appends the
    # result as tab-separated rows to the output file.
    print(sys.argv)
    # The base file name is the only command-line argument; fail with a usage
    # message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} BASE_NAME  (reads BASE_NAME.<lang>.txt for both languages)")
    alignment_max_size = 7
    print(f"alignment_max_size = {alignment_max_size}")
    ###########################################################
    # Step 1 Use chapter separator?
    # Step 2 Convert to Traditional Chinese?
    ###########################################################
    USE_REGEX_CHAPTER_SEPARATOR = False # True # False
    ###########################################################
    # Step 3 Choose language pair (translation direction)
    ###########################################################
    # langS / langT stay module-level names: align() and createExcel() read them.
    langS = 'zh'
    langT = 'en'
    out_langS, out_langT = langS, langT
    ###########################################################
    # Step 5 Choose input file folder
    ###########################################################
    base_folder = '.'
    in_folder = "."
    out_folder = "."
    base_fn = sys.argv[1]
    ######################################################################
    # regex for dividing text into chunks (chapter, book, section, etc.)
    # NOTE(review): the '' entry of DregexT has an unbalanced "(" and would
    # not compile if it were ever selected -- confirm the intended pattern.
    DregexS = {'': r"",
               'ghosttown': r"[0123456789]{1,3}.*",
               }
    DregexT = {'': r"\n(",
               'ghosttown': r"\d{1,3}\..*",
               }

    def _chunk_to_sentences(chunk, lang):
        # Split a chunk into paragraphs, then each paragraph into sentences.
        sentences = []
        for paragraph in chunk.strip().split("\n"):
            paragraph = paragraph.strip()
            if paragraph != '':
                sentences.extend(sentencize(paragraph, lang=lang))
        return [s.strip() for s in sentences if s.strip() != '']

    finS = f"{base_fn}.{langS}.txt"
    finT = f"{base_fn}.{langT}.txt"
    fon = f"{out_folder}/{base_fn}.vecalign.n{alignment_max_size}.{model_name_short}.{dev}.{out_langS}-{out_langT}.txt"
    print(f"processing [{finS}] and [{finT}] to create [{fon}]...")

    with open(finS, "r", encoding="utf-8") as fi:
        txtS = fi.read()
    if USE_REGEX_CHAPTER_SEPARATOR:
        regexS = f"\n({DregexS[base_fn]})\n"
        chS = re.split(regexS, txtS)
    else:
        chS = [txtS]
    print(f"chS has {len(chS)} elements", flush=True)

    with open(finT, "r", encoding="utf-8") as fi:
        txtT = fi.read()
    if USE_REGEX_CHAPTER_SEPARATOR:
        regexT = f"\n({DregexT[base_fn]})\n"
        chT = re.split(regexT, txtT)
    else:
        chT = [txtT]
    print(f"chT has {len(chT)} elements", flush=True)

    if len(chS) == len(chT):
        print("Both have the same number of elements!")
    else:
        # The split keeps the captured headings at the odd positions. Write
        # them side by side so the mismatch can be inspected, then stop.
        hS = chS[1::2]
        hT = chT[1::2]
        sizeS, sizeT = len(hS), len(hT)
        # Pad the shorter list; a negative count multiplies to an empty list.
        hT.extend([''] * (sizeS - sizeT))
        hS.extend([''] * (sizeT - sizeS))
        with open(f'{out_folder}/{base_fn}.vecalign.n{alignment_max_size}.{out_langS}-{out_langT}.ChapterMathchings.txt', 'w', encoding='utf-8') as fo:
            for s, t in zip(hS, hT):
                fo.write(f"{s}\t{t}\n")
        sys.exit(0)

    ch_cnt = 0
    for cS, cT in zip(chS, chT):
        #if cT[:2] not in ['天戰']: continue
        ch_cnt += 1
        print(f"processing segment [{ch_cnt}]...", flush=True)
        # Source
        sS = _chunk_to_sentences(cS, langS)
        ## convert source from simplified Chinese to traditional Chinese
        # Target
        sT = _chunk_to_sentences(cT, langT)
        ## convert target from simplified Chinese to traditional Chinese
        with open(fon, "a", encoding="utf-8", newline="\n") as fo:
            # headers (one per segment; language columns follow the configured pair)
            fo.write(f"cosdist\tsrcidx\t{out_langS}\ttgtidx\t{out_langT}\n")
            for score, idxS, ss, idxT, tt in align(sS, sT, alignment_max_size=alignment_max_size):
                fo.write(f"{score:.4f}\t{idxS}\t{ss}\t{idxT}\t{tt}\n")
        print('-'*25)
    fon_xlsx = fon
    #print("Creating Excel file...")
    #createExcel(fon_xlsx)
    print('='*25)
| #%% | |