# BitextAlign / src / alignGenericGGUF.py
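"""
Align a bilingual text pair sentence-by-sentence with vecalign, using a GGUF
embedding model served through llama-cpp-python.

Usage sketch (a minimal illustration inferred from __main__ below; the two
input files must already exist):

    python alignGenericGGUF.py mybook

This reads mybook.zh.txt and mybook.en.txt from the current folder and writes
a tab-separated alignment file (cosine distance, source sentence indices,
source text, target sentence indices, target text).
"""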
import sys, os
import gzip, bz2, lzma  # needed by file_open() below
import regex as re
from datetime import datetime, timedelta
from pathlib import Path, PurePath
from math import ceil
from random import seed
import numpy as np
#import sqlite3
#from tqdm import tqdm
import torch
from sentence_splitter import SentenceSplitter, split_text_into_sentences
#from nltk import word_tokenize
import unicodedata
import pysbd
#import opencc
import pandas as pd
import openpyxl
from openpyxl.styles import PatternFill, Border, Side, Alignment, Protection, Font
from openpyxl.utils.dataframe import dataframe_to_rows
from dp_utils import make_alignment_types, read_alignments, \
    read_in_embeddings, make_doc_embedding, vecalign, yield_overlaps
from score import score_multiple, log_final_scores
#from sentence_transformers import SentenceTransformer, models, util
#s2tw = opencc.OpenCC('s2tw.json')
# Support for llama-cpp-python
from llama_cpp import Llama
#%%
start_time = datetime.now()
dev = ['cuda', 'mps', 'cpu'][2] # cpu only
if dev in ['cuda', 'mps']:
n_gpu_layers = -1
else:
n_gpu_layers = 0
m = 2
model_name = ['Alibaba-NLP/gte-multilingual-base', 'ibm-granite/granite-embedding-278m-multilingual', 'LaBSEq80', 'LaBSEfp16', 'google/embeddinggemma-300m', 'paraphrase-multilingual-MiniLM-L12-v2'][m]
model_path = ['', '', 'src/labse.Q8_0.gguf', '', '', ''][m]
model_name_short = ['alibaba-gte-multilingual', 'ibm-granite', 'LaBSE-gguf-q80', 'LaBSE-gguf-fp16', 'embeddinggemma-300m','paraphrase'][m]
#%%
print(f"Now running bitext mining with transformer model [{model_path}] on device [{dev}]...", flush=True)
llm = Llama(
model_path=model_path,
embedding=True,
    n_gpu_layers=n_gpu_layers,  # 0 = CPU only; -1 offloads all layers (set from dev above)
# seed=1337, # Uncomment to set a specific seed
# n_ctx=2048, # Uncomment to increase the context window
#pooling_type=1,
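    # (assumption: in llama.cpp's pooling enum, 1 = mean pooling; this stays
    #  disabled so any pooling recorded in the GGUF metadata is used instead)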
verbose=False,
)
print(f"Finished loading model: {model_name}.", flush=True)
elapsed = datetime.now() - start_time
print(f"Model-loading time: {elapsed.seconds} secs", flush=True)
#%%
def encodeVectors(ss):
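    '''
    Embed each sentence in ss with the GGUF model; returns one tensor per sentence.
    '''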
#return torch.as_tensor([llm.embed(s, normalize=True) for s in ss])
#return torch.as_tensor(llm.embed(ss, normalize=True))
return [torch.as_tensor(llm.embed(s, normalize=True)) for s in ss]
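# A minimal sketch of the expected shapes (illustrative; the vector width
# depends on the GGUF model, e.g. 768 for LaBSE, and assumes the model pools
# each input to a single vector):
#   vecs = encodeVectors(["Hello world", "你好"])
#   vecs[0].shape  ->  torch.Size([768])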
#%%
def print_alignments(alignments, scores=None, file=sys.stdout):
if scores is not None:
for (x, y), s in zip(alignments, scores):
print('%s:%s:%.6f' % (x, y, s), file=file)
else:
for x, y in alignments:
print('%s:%s' % (x, y), file=file)
def file_open(filepath):
    # Open a (possibly compressed) file based on its extension
if filepath.endswith('.gz'):
return gzip.open(filepath, 'rt', encoding='utf8')
elif filepath.endswith('.bz2'):
return bz2.open(filepath, 'rt', encoding='utf8')
elif filepath.endswith('.xz'):
return lzma.open(filepath, 'rt', encoding='utf8')
else:
return open(filepath, 'r', encoding='utf8')
def getLines(fin):
'''
    Retrieve lines from a file or (later) an sqlite3 database
'''
lines = file_open(fin).readlines()
return [s.strip() for s in lines if s.strip() != '']
def getSentIndex(lines):
"""
    dictionary look-up:
      keys   = sentences (single or overlapped)
      values = line indices
"""
sent2line = dict()
for ii, line in enumerate(lines):
if line.strip() in sent2line:
raise Exception('got multiple embeddings for the same line')
sent2line[line.strip()] = ii
return sent2line
def getOverlaps(lines, num_overlaps):
output = set()
for out_line in yield_overlaps(lines, num_overlaps):
output.add(out_line)
# for reproducibility
output = list(output)
output.sort()
return output
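# Illustrative example (vecalign's yield_overlaps joins consecutive lines with
# a space and deduplicates, so for num_overlaps=2):
#   getOverlaps(["a", "b", "c"], 2)  ->  ["a", "a b", "b", "b c", "c"]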
def normalizeText(text):
    text = text.replace("\xad", '')  # remove Unicode soft hyphen
    return unicodedata.normalize("NFKC", text)  # fold compatibility characters (ligatures, full-width forms, etc.)
# Sentence tokenizer
# regex to identify Chinese sentence boundaries
#regex_zh_sent_delim = re.compile(r"([。!?;][」』”〕》〗】))\]]?)")
#regex_zh_sent_delim = re.compile(r"([。?;][」』”〕》〗】))\]]?)")
#regex_zh_sent_delim = re.compile(r'(?P<quotation_mark>([。?!…]{1,2})[」』〕》〗】\])”’"\')])')
#regex_zh_sent_delim = re.compile(r"[。!?]")
regex_zh_sent_delim = re.compile(r"([。?!…][」』”’\'\"〕》〗】))\]]{0,3})")
def normalizeTextZh(text):
    text = text.replace("\xad", '')  # remove Unicode soft hyphen
    #text = text.replace("!", "!").replace(";", ";")
    return unicodedata.normalize("NFKD", text)  # fold compatibility characters (full-width forms, etc.)
def sentencizeZh(s):
'''
turn long string s into a list of sentences
'''
s = normalizeTextZh(s)
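    # restore the full-width CJK punctuation that NFKD folded to ASCII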
s = s.replace(',',',').replace(';',';').replace("!", "!").replace(":", ":").replace("?", "?")
ss = regex_zh_sent_delim.sub(r"\1\n", s).split("\n")
return [s.strip() for s in ss if s.strip() != '']
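# Illustrative example (the full-width "!" survives normalization because
# sentencizeZh converts it back from ASCII before splitting):
#   sentencizeZh("你好。你好!")  ->  ["你好。", "你好!"]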
def sentencize(s, lang='en'):
if lang in ['zh', 'ja']:
return sentencizeZh(s)
else: # lang in ['en', 'es', 'fr', 'de', 'it', etc. ]
splitter = SentenceSplitter(language=lang)
sentseg = pysbd.Segmenter(language=lang, clean=False)
s = normalizeText(s)
ss = splitter.split(text=s)
#ss = sentseg.segment(s)
return [s.strip() for s in ss if s.strip() != '']
def convertChinesePunctuations(txt):
    '''
    Convert “”‘’ to 「」『』, respectively
    '''
    punctHans2Hant = {'“”‘’': '「」『』'}
for k in punctHans2Hant:
v = punctHans2Hant[k]
for ps, pt in zip(k, v):
txt = txt.replace(ps, pt)
return txt
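# Illustrative example:
#   convertChinesePunctuations('他说:“你好”')  ->  '他说:「你好」'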
def align(sS, sT, alignment_max_size=4):
    # make runs consistent
    seed(42)
    np.random.seed(42)
    # source
    overlapsS = getOverlaps(sS, alignment_max_size)  # create "overlapped" sentences
    s2idxS = getSentIndex(overlapsS)  # create "sentence-to-index" lookup table
    embedS = encodeVectors(overlapsS)  # encode the list of overlapped sentences
    src_line_embeddings = torch.vstack(embedS).cpu().numpy()  # stack the per-sentence tensors into one matrix
    # target
    overlapsT = getOverlaps(sT, alignment_max_size)
    s2idxT = getSentIndex(overlapsT)
    embedT = encodeVectors(overlapsT)
    tgt_line_embeddings = torch.vstack(embedT).cpu().numpy()
#print(f"src_line_embeddings has shape: [{src_line_embeddings.shape}]")
#print(f"tgt_line_embeddings has shape: [{tgt_line_embeddings.shape}]")
#sys.exit(0)
width_over2 = ceil(alignment_max_size / 2.0) + 5
test_alignments = []
stack_list = []
#src_lines = open(finS, 'rt', encoding="utf-8").readlines()
vecs0 = make_doc_embedding(s2idxS, src_line_embeddings, sS, alignment_max_size)
#tgt_lines = open(finT, 'rt', encoding="utf-8").readlines()
vecs1 = make_doc_embedding(s2idxT, tgt_line_embeddings, sT, alignment_max_size)
final_alignment_types = make_alignment_types(alignment_max_size)
stack = vecalign(vecs0=vecs0,
vecs1=vecs1,
final_alignment_types=final_alignment_types,
del_percentile_frac=0.2,
width_over2=width_over2,
max_size_full_dp=300,
costs_sample_size=20000,
num_samps_for_norm=100)
    # write final alignments to file
#print_alignments(stack[0]['final_alignments'], stack[0]['alignment_scores'])
#test_alignments.append(stack[0]['final_alignments'])
#stack_list.append(stack)
alignments = stack[0]['final_alignments']
scores = stack[0]['alignment_scores']
aligned_sentences = []
    if scores is not None:
        # join source/target blocks with a space, except for languages
        # written without word spacing
        sepS = '' if langS in ['zh', 'ja'] else ' '
        sepT = '' if langT in ['zh', 'ja'] else ' '
        for (idxS, idxT), score in zip(alignments, scores):
            sbS = []  # sentence block - source
            for i in idxS:
                sbS.append(sS[i])
            sbT = []  # sentence block - target
            for i in idxT:
                sbT.append(sT[i])
            #aligned_sentences.append(f"{score:.5f}\t{idxS}\t{' '.join(sbS)}\t{idxT}\t{' '.join(sbT)}")
            #aligned_sentences.append([score, idxS, ' '.join(sbS), idxT, ' '.join(sbT)])
            aligned_sentences.append([score, idxS, sepS.join(sbS), idxT, sepT.join(sbT)])
return aligned_sentences
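# A minimal usage sketch (illustrative; langS/langT must already be defined at
# module scope, as they are in __main__ below):
#   rows = align(["你好。", "再见。"], ["Hello.", "Goodbye."], alignment_max_size=2)
#   each row is [score, (source indices), source_text, (target indices), target_text]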
#%%
def createExcel(fin):
    """ fin = path to the plain-text (tab-separated) alignment file
    """
# Create a new workbook
wb = openpyxl.Workbook()
# Select the active sheet
ws = wb.active
# Set column widths
ws.column_dimensions['A'].width = 10
ws.column_dimensions['B'].width = 10
ws.column_dimensions['C'].width = 10
ws.column_dimensions['D'].width = 50
ws.column_dimensions['E'].width = 10
ws.column_dimensions['F'].width = 65
data = open(fin, 'r', encoding='utf-8').readlines()
df = pd.DataFrame([x.split('\t') for x in data], columns=['cosdist', 'cols_s', langS, 'cols_t', langT])
for r in dataframe_to_rows(df, index=True, header=True):
ws.append(r)
# Set cell alignment
alignment = Alignment(horizontal='general',
vertical='top',
wrap_text=True)
cnt = len(data)
for row in ws[f'A1:F{cnt+10}']:
for cell in row:
cell.alignment = alignment
# Save the workbook
base = Path(fin).stem
fon_xlsx = Path(fin).parent / f'{base}.xlsx'
wb.save(fon_xlsx)
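# Usage note: createExcel(fon) renders the tab-separated alignment file as a
# formatted .xlsx alongside it; the call near the end of __main__ is currently
# commented out.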
#%%
if __name__ == '__main__':
print(sys.argv)
#sys.exit(0)
alignment_max_size = 7
print(f"alignment_max_size = {alignment_max_size}")
###########################################################
# Step 1 Use chapter separator?
# Step 2 Convert to Traditional Chinese?
###########################################################
USE_REGEX_CHAPTER_SEPARATOR = False # True # False
###########################################################
# Step 3 Choose language pair (translation direction)
###########################################################
langS = 'zh'
langT = 'en'
out_langS, out_langT = langS, langT
###########################################################
    # Step 4 Choose input file folder
###########################################################
base_folder = '.'
in_folder = "."
out_folder = "."
base_fn = sys.argv[1]
######################################################################
# regex for dividing text into chunks (chapter, book, section, etc.)
DregexS = {'': r"",
'ghosttown': r"[0123456789]{1,3}.*",
}
    DregexT = {'': r"",
               'ghosttown': r"\d{1,3}\..*",
              }
if True:
finS = f"{base_fn}.{langS}.txt"
finT = f"{base_fn}.{langT}.txt"
fon = f"{out_folder}/{base_fn}.vecalign.n{alignment_max_size}.{model_name_short}.{dev}.{out_langS}-{out_langT}.txt"
print(f"processing [{finS}] and [{finT}] to create [{fon}]...")
txtS = open(finS, "r", encoding="utf-8").read()
if USE_REGEX_CHAPTER_SEPARATOR:
regexS = f"\n({DregexS[base_fn]})\n"
chS = re.split(regexS, txtS)
else:
chS = [txtS]
print(f"chS has {len(chS)} elements", flush=True)
txtT = open(finT, "r", encoding="utf-8").read()
if USE_REGEX_CHAPTER_SEPARATOR:
regexT = f"\n({DregexT[base_fn]})\n"
chT = re.split(regexT, txtT)
else:
chT = [txtT]
print(f"chT has {len(chT)} elements", flush=True)
if len(chS) == len(chT):
print("Both have the same number of elements!")
else:
            # keep the chapter headings captured by the split regex (odd indices)
            hS = chS[1::2]
            sizeS = len(hS)
            hT = chT[1::2]
            sizeT = len(hT)
            # pad the shorter heading list so zip() covers every heading
            if sizeS > sizeT:
                hT.extend([''] * (sizeS - sizeT))
            elif sizeS < sizeT:
                hS.extend([''] * (sizeT - sizeS))
            with open(f'{out_folder}/{base_fn}.vecalign.n{alignment_max_size}.{out_langS}-{out_langT}.ChapterMatchings.txt', 'w', encoding='utf-8') as fo:
                for s, t in zip(hS, hT):
                    fo.write(f"{s}\t{t}\n")
            sys.exit(0)
#sys.exit(0)
ch_cnt = 0
for cS, cT in zip(chS, chT):
#if cT[:2] not in ['天戰']: continue
ch_cnt += 1
print(f"processing segment [{ch_cnt}]...", flush=True)
# Source
pS = cS.strip().split("\n")
pS = [s.strip() for s in pS if s.strip()!='']
sS = []
for p in pS:
sS.extend(sentencize(p, lang=langS))
sS = [s.strip() for s in sS if s.strip()!='']
## convert source from simplified Chinese to traditional Chinese
# Target
pT = cT.strip().split("\n")
pT = [s.strip() for s in pT if s.strip()!='']
sT = []
for p in pT:
sT.extend(sentencize(p, lang=langT))
sT = [s.strip() for s in sT if s.strip()!='']
## convert target from simplified Chinese to traditional Chinese
            with open(fon, "a", encoding="utf-8", newline="\n") as fo:
                #for score, idxE, e, idxZ, z in align(sE, sZ, alignment_max_size=alignment_max_size):
                # header row (write once: the file is opened in append mode, so
                # writing it on every segment would repeat it)
                if ch_cnt == 1:
                    fo.write(f"cosdist\tsrcidx\t{langS}\ttgtidx\t{langT}\n")
                for score, idxS, ss, idxT, tt in align(sS, sT, alignment_max_size=alignment_max_size):
                    #fo.write(f"{base}\t{score:.4f}\t{idxS}\t{ss}\t{idxT}\t{tt}\n")
                    fo.write(f"{score:.4f}\t{idxS}\t{ss}\t{idxT}\t{tt}\n")
                    fo.flush()
print('-'*25)
fon_xlsx = fon
#print("Creating Excel file...")
#createExcel(fon_xlsx)
print('='*25)
#%%