from typing import List, Optional

import numpy as np

from src.backend.lm_module import BaseModule


class ChunkerModule(BaseModule):

    def chunking(self, text: str) -> List[str]:
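        """Split ``text`` into chunks at boundary positions predicted by the model.

        The text is tokenized once, scored in 512-token windows by the
        boundary-prediction model, and decoded back into substrings at
        every predicted boundary.
        """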
        # Tokenize the whole text once, without truncation, so that fixed-size
        # windows can slide over the full token sequence.
        texts_token = self._tokenizer(
            text,
            return_offsets_mapping=True,
            verbose=False,
            add_special_tokens=False,
            padding=False,
            truncation=False,
        )

        texts_responses = []  # global token indices of predicted chunk boundaries
        start = 0             # index of the first token in the current window

        # Slide a 512-token window over the tokens. After each pass, resume
        # from the last boundary found, or skip a full window if none was found.
        while start + 512 < len(texts_token['input_ids']):
            input_model = []
            for inp in self._model.inputs:
                input_model.append(np.array([texts_token[inp.name][start:start + 512]]))
            outputs = self._model.run(data=input_model)[self._model.outputs[0].name][0]
            # Keep the first score per token; a positive score marks a chunk boundary.
            outputs = [value_outputs[0] for value_outputs in outputs]

            # Convert window-local boundary positions to global token indices.
            # The offset must be `start`, not a multiple of 512: `start` jumps
            # to the last boundary whenever one is found.
            all_index = [start + index + 1 for index, value in enumerate(outputs) if value > 0]
            if all_index:
                texts_responses.extend(all_index)
                start = texts_responses[-1]
            else:
                start += 512

        # Score the final, shorter window that the loop above did not cover.
        # `input_model` must be rebuilt here; otherwise stale arrays from the
        # last loop iteration would be fed to the model a second time.
        input_model = []
        for inp in self._model.inputs:
            input_model.append(np.array([texts_token[inp.name][start:]]))
        outputs = self._model.run(data=input_model)[self._model.outputs[0].name][0]
        outputs = [value_outputs[0] for value_outputs in outputs]
        all_index = [start + index + 1 for index, value in enumerate(outputs) if value > 0]
        texts_responses.extend(all_index)

        # Decode the token spans between consecutive boundaries back into text.
        start = 0
        return_value = []
        if not texts_responses:
            return [self._tokenizer.decode(texts_token['input_ids'])]

        for index in texts_responses:
            return_value.append(self._tokenizer.decode(texts_token['input_ids'][start:index]))
            start = index
        # Keep any remaining tail after the last boundary so no text is dropped.
        if start < len(texts_token['input_ids']):
            return_value.append(self._tokenizer.decode(texts_token['input_ids'][start:]))
        return [value for value in return_value if value != ""]

    def merge_subtexts_fix(self, list_sub_text: List[str], max_tokens: Optional[int] = None) -> List[str]:
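        """Greedily merge consecutive subtexts into chunks of at most
        ``max_tokens`` tokens, as counted by the module's tokenizer.
        """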
        if max_tokens is None:
            raise ValueError("max_tokens must be set")

        merged_texts = []
        current_num_token = 0
        current_merge = ""

        for subtext in list_sub_text:
            # Count the tokens in this subtext (not characters, as the old
            # `num_chars` name suggested).
            text_responses = self._tokenizer(subtext)
            num_tokens = np.shape(text_responses['input_ids'])[-1]

            if current_num_token + num_tokens > max_tokens:
                # Adding this subtext would overflow the budget: flush the
                # current chunk and start a new one with this subtext.
                if current_merge:
                    merged_texts.append(current_merge.strip())
                current_num_token = num_tokens
                current_merge = subtext
            else:
                current_num_token += num_tokens
                current_merge += " " + subtext

        # Flush whatever is left in the final chunk.
        if current_merge:
            merged_texts.append(current_merge.strip())
        return merged_texts
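

# A minimal usage sketch, kept as a comment because the BaseModule constructor
# is not shown in this module; the instantiation below is a hypothetical
# placeholder, not the real API. Only `chunking` and `merge_subtexts_fix`
# come from this file.
#
#     chunker = ChunkerModule(...)  # hypothetical: however BaseModule is built
#     sub_texts = chunker.chunking(long_document)
#     chunks = chunker.merge_subtexts_fix(sub_texts, max_tokens=512)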