hoanguyenthanh07's picture
Upload folder using huggingface_hub
248a67b verified
from src.backend.lm_module import BaseModule
from typing import List
import numpy as np
class ChunkerModule(BaseModule):
    """Split long texts into model-predicted chunks and merge chunks under a token budget.

    Relies on attributes provided by ``BaseModule`` (not visible here — assumed):
      - ``self._tokenizer``: HuggingFace-style tokenizer (callable, supports
        ``return_offsets_mapping`` and ``decode``).
      - ``self._model``: inference session exposing ``.inputs``, ``.outputs``
        and ``.run(data=...)`` (ONNX/OpenVINO-style — TODO confirm).
    """

    def chunking(self, text: str) -> List:
        """Split ``text`` into chunk strings at boundaries predicted by the model.

        The token sequence is scanned in 512-token windows; every position whose
        model score is > 0 is treated as a chunk boundary (the boundary index is
        exclusive for the left chunk). Returns the decoded, non-empty chunks in
        order.

        Args:
            text: raw input text.

        Returns:
            List of decoded chunk strings.
        """
        # Tokenize once, without special tokens, so token indices map directly
        # onto the raw text.
        tokens = self._tokenizer(
            text,
            return_offsets_mapping=True,
            verbose=False,
            add_special_tokens=False,
            padding=False,
            truncation=False,
        )
        num_tokens = len(tokens['input_ids'])

        def _predict_boundaries(start: int, stop) -> List:
            """Run the model on tokens[start:stop]; return GLOBAL boundary indices."""
            feed = [
                np.array([tokens[inp.name][start:stop]])
                for inp in self._model.inputs
            ]
            # 1 x batch x length -> batch x length, then per-position scalar score.
            scores = self._model.run(data=feed)[self._model.outputs[0].name][0]
            scores = [value[0] for value in scores]
            # A positive score marks a split after that token (+1 makes the
            # index an exclusive slice end). Scores are window-relative, so
            # shift by `start` to get global token indices.
            # BUG FIX: the original shifted by 512 * window_count, which is
            # wrong once a window resumes from a previous boundary instead of a
            # multiple of 512, and could even move `start` backwards.
            return [start + index + 1 for index, value in enumerate(scores) if value > 0]

        boundaries = []
        start = 0
        while start + 512 < num_tokens:
            window_splits = _predict_boundaries(start, start + 512)
            if window_splits:
                boundaries.extend(window_splits)
                # Resume from the last confirmed boundary so no chunk straddles
                # a window edge.
                start = boundaries[-1]
            else:
                start += 512
        # Final (possibly shorter) window.
        # BUG FIX: the original reused the feed list left over from the last
        # loop iteration, doubling the model inputs for the final run.
        boundaries.extend(_predict_boundaries(start, None))

        # No boundary found: the whole text is a single chunk.
        if not boundaries:
            return [self._tokenizer.decode(tokens['input_ids'])]

        # Decode each [start, boundary) span.
        chunks = []
        start = 0
        for boundary in boundaries:
            chunks.append(self._tokenizer.decode(tokens['input_ids'][start:boundary]))
            start = boundary
        # BUG FIX: the original dropped the text after the last boundary.
        if start < num_tokens:
            chunks.append(self._tokenizer.decode(tokens['input_ids'][start:]))
        return [chunk for chunk in chunks if chunk != ""]

    def merge_subtexts_fix(self, list_sub_text, max_tokens: int = None):
        """Greedily merge consecutive sub-texts into chunks of <= ``max_tokens`` tokens.

        Token counts come from ``self._tokenizer``. A single sub-text that alone
        exceeds ``max_tokens`` is kept as its own (over-budget) chunk.

        Args:
            list_sub_text: iterable of sub-text strings, in order.
            max_tokens: token budget per merged chunk; must be provided.

        Returns:
            List of merged, stripped chunk strings.

        Raises:
            ValueError: if ``max_tokens`` is None.
        """
        if max_tokens is None:
            raise ValueError("max_tokens must be set")

        merged_texts = []
        current_merge = ""
        current_num_token = 0
        for subtext in list_sub_text:
            # Token length of this sub-text (last dim of the tokenized ids).
            encoded = self._tokenizer(subtext)
            num_subtext_tokens = np.shape(encoded['input_ids'])[-1]
            if current_num_token + num_subtext_tokens > max_tokens:
                # Budget exceeded: flush the current chunk (if any) and start anew.
                if current_merge:
                    merged_texts.append(current_merge.strip())
                current_num_token = num_subtext_tokens
                current_merge = subtext
            else:
                current_num_token += num_subtext_tokens
                current_merge += " " + subtext
        # Flush the trailing chunk.
        if current_merge:
            merged_texts.append(current_merge.strip())
        return merged_texts