File size: 3,325 Bytes
e489264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from threading import Thread
import constants
from pathlib import Path
import random
from typing import Union, Any, List
from interfaces import IProcess, IProcessor
from processes import (
    RandomCharRemover,
    RandomCharsInjector,
    RandomCharsSwapper,
    RandomNeighborReplacer,
    RandomWordsCollapsor,
    PunctuationRemover,
    SentencePermutation,
)


class FilesProcessor(IProcessor):
    """Run a fixed chain of processes over a collection of files.

    Each file is passed to the first process' ``execute``; the value it
    returns is passed to the next process, and so on.  ``run`` does this
    sequentially, ``dist_run`` spreads the files over several threads.
    """

    def __init__(
            self, processes: List[IProcess],
            n_dist: int = 32
            ) -> None:
        """Store the process chain and the thread fan-out.

        Args:
            processes: Objects exposing ``execute(value)``; applied in the
                given order.
            n_dist: Maximum number of divisions (and therefore threads)
                used by ``dist_run``.
        """
        super().__init__()
        self.processes = processes
        self.n_dist = n_dist

    def file_run(self, file: Union[str, Path]) -> Any:
        """Feed ``file`` through every process and return the final value."""
        result = file
        for process in self.processes:
            result = process.execute(result)
        return result

    def run(
            self,
            files: List[Union[str, Path]]
            ) -> Any:
        """Process ``files`` one after another.

        Returns:
            A list holding the ``file_run`` result of each file, in the
            same order as ``files``.
        """
        return [self.file_run(file) for file in files]

    def _divde(self, data: List[Any]):
        """Split ``data`` into at most ``n_dist`` contiguous chunks.

        Chunks are non-empty, keep the original order and differ in size
        by at most one item.  Empty input yields no chunks.
        """
        total = len(data)
        if total == 0:
            return []
        # Never create more chunks than items, and always at least one.
        n_divs = max(1, min(self.n_dist, total))
        base, extra = divmod(total, n_divs)
        divs = []
        start = 0
        for i in range(n_divs):
            # The first ``extra`` chunks absorb the remainder, one item each.
            end = start + base + (1 if i < extra else 0)
            divs.append(data[start:end])
            start = end
        return divs

    def dist_run(
            self,
            files: List[Union[str, Path]]
            ) -> Any:
        """Process ``files`` using one thread per division.

        Returns:
            A flat list of per-file results in the same order as ``files``.

        Raises:
            Exception: The first error captured from a worker thread is
                re-raised here after all threads have finished, instead
                of silently dropping that division's results.
        """
        divs = self._divde(files)
        # One slot per division so results keep input order regardless of
        # which thread finishes first.
        slots: List[Any] = [None] * len(divs)
        errors: List[Exception] = []

        def worker(index: int, div: List[Union[str, Path]]) -> None:
            try:
                slots[index] = self.run(div)
            except Exception as err:  # surfaced to the caller after join
                errors.append(err)

        threads = [
            Thread(target=worker, args=(index, div))
            for index, div in enumerate(divs)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        if errors:
            raise errors[0]
        results = []
        for part in slots:
            results.extend(part)
        return results


class TextDistorter(IProcessor):
    """Distort a line by repeatedly applying randomly picked processes."""

    def __init__(
            self, ratio: float, processes: List[IProcess]
            ) -> None:
        """Keep the distortion ratio and the pool of candidate processes.

        Args:
            ratio: Multiplied by the line length to get the number of
                distortion rounds.
            processes: Objects exposing ``execute(line)``; one is drawn
                at random for every round.
        """
        super().__init__()
        self.processes = processes
        self.ratio = ratio

    def run(self, line: str) -> str:
        """Return ``line`` after ``int(ratio * len(line))`` random rounds.

        The round count is fixed from the length of the incoming line,
        before any distortion is applied.
        """
        rounds = int(self.ratio * len(line))
        distorted = line
        while rounds > 0:
            chosen = random.choice(self.processes)
            distorted = chosen.execute(distorted)
            rounds -= 1
        return distorted

    def dist_run(self):
        """Placeholder; currently does nothing and returns ``None``."""
        # TODO
        return None


class TextProcessor(IProcessor):
    """Apply an ordered chain of processes to a single sentence."""

    def __init__(self, processes: List[IProcess]) -> None:
        """Remember the processes; they run in the order given."""
        super().__init__()
        self.processes = processes

    def run(self, sentence: str):
        """Pass ``sentence`` through each process' ``execute`` in turn.

        Returns:
            Whatever the last process returns, or ``sentence`` unchanged
            when there are no processes.
        """
        current = sentence
        for step in self.processes:
            current = step.execute(current)
        return current

    def dist_run(self, sentence: str) -> str:
        """Delegate to ``run``; no distribution is performed."""
        outcome = self.run(sentence)
        return outcome


def get_text_distorter(ratio, sentences: List[str]):
    """Build a ``TextDistorter`` wired with the standard distortion set.

    Args:
        ratio: Forwarded to ``TextDistorter`` as its ``ratio``.
        sentences: Forwarded to ``SentencePermutation``.

    Returns:
        A ``TextDistorter`` whose processes are, in order: sentence
        permutation, char injection, char swapping, char removal, word
        collapsing and neighbor replacement.
    """
    # Constructed in the same order as they are listed.
    distortions = [
        SentencePermutation(sentences),
        RandomCharsInjector(constants.KURDISH_CHARS),
        RandomCharsSwapper(),
        RandomCharRemover(),
        RandomWordsCollapsor(),
        RandomNeighborReplacer(
            constants.KEYBOARD_KEYS, constants.KEYBOARD_BLANK
        ),
    ]
    return TextDistorter(ratio=ratio, processes=distortions)