laurievb committed on
Commit
9d18279
·
verified ·
1 Parent(s): e807787

Delete scripts

Browse files
scripts/openlid.py DELETED
@@ -1,87 +0,0 @@
1
- import unicodedata
2
- import emoji
3
- import sys
4
-
5
class Demojizer:
    """Replaces emoji sequences in a string with a caller-supplied string.

    A trie ("search tree") maps each successive codepoint of every known
    emoji sequence to a nested dict; a node carrying a ``"data"`` key marks
    the end of a complete emoji.

    based on:
    https://github.com/carpedm20/emoji/blob/d8bbfe455c6fcd12b96ed1dce6e0978fe7a47431/emoji/core.py#L141
    """

    def _get_search_tree(self):
        """Build the codepoint trie over ``emoji.unicode_codes.EMOJI_DATA``."""
        _SEARCH_TREE = {}
        for emj in emoji.unicode_codes.EMOJI_DATA:
            sub_tree = _SEARCH_TREE
            lastidx = len(emj) - 1
            for i, char in enumerate(emj):
                if char not in sub_tree:
                    sub_tree[char] = {}
                sub_tree = sub_tree[char]
                if i == lastidx:
                    # terminal node: attach the emoji's metadata
                    sub_tree["data"] = emoji.unicode_codes.EMOJI_DATA[emj]
        return _SEARCH_TREE

    def __init__(self) -> None:
        self.search_tree = self._get_search_tree()

    def __call__(self, string: str, replace_str: str) -> str:
        """Return *string* with every emoji replaced by *replace_str*.

        Whitespace directly following a replaced emoji is dropped, and stray
        variation selectors (U+FE0E / U+FE0F) are always removed.
        """
        result = []
        i = 0
        length = len(string)
        state = 0  # 1 => previous output was a replaced emoji
        while i < length:
            consumed = False
            char = string[i]
            if char in self.search_tree:
                sub_tree = self.search_tree[char]
                j = i + 1
                # BUG FIX: remember the end of the *longest* prefix that is a
                # complete emoji instead of only testing the node where the
                # walk stops.  Previously an emoji followed by a character
                # that continues a longer emoji sequence (but does not
                # complete it) was not matched at all.
                match_end = j if "data" in sub_tree else -1
                while j < length and string[j] in sub_tree:
                    sub_tree = sub_tree[string[j]]
                    j += 1
                    if "data" in sub_tree:
                        match_end = j
                if match_end != -1:
                    state = 1
                    consumed = True
                    result.append(replace_str)
                    i = match_end - 1  # i += 1 below lands just past the match
                else:
                    state = 0
            elif state == 1:
                if char.isspace():
                    consumed = True  # swallow whitespace right after an emoji
                else:
                    state = 0

            if not consumed and char != "\ufe0e" and char != "\ufe0f":
                result.append(char)
            i += 1

        return "".join(result)
59
-
60
-
61
- def _get_replacer(replace_by: str = " ") -> str:
62
- non_printable_map = {
63
- ord(c): replace_by
64
- for c in (chr(i) for i in range(sys.maxunicode + 1))
65
- # same as \p{C} in perl
66
- # see https://www.unicode.org/reports/tr44/#General_Category_Values
67
- if unicodedata.category(c) in {"C", "Cc", "Cf", "Cs", "Co", "Cn"}
68
- }
69
-
70
- def replace_non_printing_char(line) -> str:
71
- return line.translate(non_printable_map)
72
-
73
- return replace_non_printing_char
74
-
75
-
76
def clean_text(input_text: str) -> str:
    """Cleans input text prior to LID.

    Pipeline: replace non-printing characters with spaces, apply NFKC
    normalisation, then strip emoji.

    The non-printing replacer and the Demojizer are expensive to construct
    (full codepoint scan / emoji trie build), so they are created once and
    memoised on the function object instead of being rebuilt on every call.
    """
    helpers = clean_text.__dict__
    if "_replace_nonprint" not in helpers:
        helpers["_replace_nonprint"] = _get_replacer(" ")
        helpers["_demoji"] = Demojizer()

    clean = helpers["_replace_nonprint"](input_text)
    clean = unicodedata.normalize("NFKC", clean)
    clean = helpers["_demoji"](clean, "")

    return clean
86
-
87
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/prepare_openlid_v2_for_model_training.sh DELETED
@@ -1,17 +0,0 @@
1
#!/bin/bash
# author: laurie
# script to sample OpenLID-v2 prior to training
# usage: bash prepare_openlid_v2_for_model_training.sh PATH_TO_OPENLID-V2
set -euo pipefail

# fail fast with a usage message instead of an obscure unbound-variable error
if [ $# -lt 1 ]; then
    echo "usage: $0 PATH_TO_OPENLID-V2" >&2
    exit 1
fi

START_DIR=${PWD}
echo "starting dir is ${START_DIR}"
INPUT_DATA=$1 # should be openlid-v2 dataset (TSV, grouped/sorted by language)
echo "using openlid-v2 data from ${INPUT_DATA}"

echo "generating counts in stats/"
mkdir -p stats
# uniq -c counts consecutive runs only, so the corpus must be grouped by language
cut -f2 -d$'\t' "$INPUT_DATA" | uniq -c > stats/openlid-v2-unsampled.counts

echo "applying temperature sampling..."
python scripts/sample_with_temperature.py "$INPUT_DATA" stats/openlid-v2-unsampled.counts > openlid-v2-sampled.tsv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/sample_with_temperature.py DELETED
@@ -1,98 +0,0 @@
1
- """samples with temperature, grouping by language code. assumes input file is sorted by language group"""
2
-
3
- import argparse
4
- import logging
5
- import random
6
- import sys
7
-
8
def parse_args():
    """Collect the two positional command-line arguments for this script."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "corpus_filepath",
        type=str,
        help="path to input corpus to sample",
    )
    cli.add_argument(
        "linecounts_filepath",
        type=str,
        help="path to file containing line counts of input corpus (from 'uniq -c')",
    )
    return cli.parse_args()
13
-
14
- # def count_lines(file):
15
- # def blocks(files, size=65536):
16
- # while True:
17
- # b = files.read(size)
18
- # if not b: break
19
- # yield b
20
- # with open(file, "r",encoding="utf-8",errors='ignore') as f:
21
- # return (sum(bl.count("\n") for bl in blocks(f)))
22
-
23
def main():
    """Temperature-sample a language-grouped corpus, writing lines to stdout.

    Expects the corpus (TSV, language code in column 2) to be grouped by
    language, with line counts produced by ``uniq -c`` over that column.
    Each language is sampled proportionally to ``share ** 0.3`` of the
    corpus, so small languages are upsampled (with replacement) and large
    languages are downsampled.
    """
    logging.basicConfig(
        level=logging.INFO,
        filename='sampling.log',
        filemode='w',
        format='%(asctime)s %(levelname)s: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S %p')
    logger = logging.getLogger(__name__)

    args = parse_args()

    # --- build the per-language raw line counts from the `uniq -c` output ---
    logger.info(f"creating counts lookup dict from {args.linecounts_filepath}")
    with open(args.linecounts_filepath) as f:
        total_raw_lines = 0
        lc_lookup = dict()
        for line in f:
            count, lang = line.strip().split(' ')
            count = int(count)
            lc_lookup[lang] = {"raw_lines": count}
            total_raw_lines += count

    logger.info(f"lookup dict finished ({len(lc_lookup)} entries)")
    logger.info(f"dataset contains {total_raw_lines} lines")

    # lines to keep = (((raw_lines_in_lang / total) ** 0.3) / sum_of_factors) * total
    logger.info("calculating sampling factors")
    total_sampling_factors = 0
    for lang in lc_lookup:
        # sample proportional to share**0.3 so smaller langs are upsampled
        # and larger langs are downsampled
        sampling_factor = (lc_lookup[lang]['raw_lines'] / total_raw_lines) ** 0.3
        lc_lookup[lang]["sampling_factor"] = sampling_factor
        total_sampling_factors += sampling_factor

    logger.info(f"sampling factor total is {total_sampling_factors}")
    logger.info("calculating number of lines to sample")
    total_lines_to_sample = 0
    for lang in lc_lookup:
        lines_to_sample = round(
            lc_lookup[lang]["sampling_factor"] / total_sampling_factors * total_raw_lines)
        lc_lookup[lang]['lines_to_sample'] = lines_to_sample
        total_lines_to_sample += lines_to_sample
    prop_size_difference = abs((total_raw_lines - total_lines_to_sample) / total_lines_to_sample)
    assert prop_size_difference < 0.01  # sense check that sampled corpus is right size
    logger.info(
        f"total raw lines is {total_raw_lines}, total sampled lines is "
        f"{total_lines_to_sample} ({prop_size_difference:.3%} difference)")

    def _sample_and_write(langcode, lines):
        """Sample one finished language group to its target size, write to stdout."""
        raw_lines_in_lang = len(lines)
        assert raw_lines_in_lang == lc_lookup[langcode]["raw_lines"]  # sanity check it's same data
        num_lines_to_keep = lc_lookup[langcode]["lines_to_sample"]
        logger.info(f"finished reading {langcode}: read in {raw_lines_in_lang}, writing {num_lines_to_keep}")
        if raw_lines_in_lang > num_lines_to_keep:
            sampled = random.sample(lines, num_lines_to_keep)
        else:  # need to oversample, so now use sampling with replacement
            sampled = random.choices(lines, k=num_lines_to_keep)
        for out in sampled:
            sys.stdout.write(f"{out}\n")

    # assume input file is sorted/grouped by language
    logger.info(f"sampling from {args.corpus_filepath}")
    with open(args.corpus_filepath, "r") as f:
        single_lang_line_store = []
        langcode = ""
        while line := f.readline():
            line = line.strip()
            _, nextlang, _ = line.split('\t')
            if langcode == nextlang or langcode == "":  # same language
                single_lang_line_store.append(line)
            else:  # language change, time to sample and write out
                _sample_and_write(langcode, single_lang_line_store)
                logger.info(f"finished writing {langcode} to stdout, now collecting lines for {nextlang}")
                single_lang_line_store = [line]
            langcode = nextlang
        # BUG FIX: the final language group was never flushed after the loop,
        # silently dropping the last language from the sampled corpus.
        if single_lang_line_store and langcode:
            _sample_and_write(langcode, single_lang_line_store)
    logger.info("sampling complete!")
95
-
96
-
97
# script entry point: run only when executed directly, not on import
if __name__ == "__main__":
    main()