File size: 2,628 Bytes
6f31f53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from .imports import *


def get_strategy():
    """Detect an attached TPU and return the matching tf.distribute strategy.

    Returns:
        A ``tf.distribute.TPUStrategy`` when a TPU cluster is reachable,
        otherwise the default strategy (CPU/GPU).

    Bug fixed: the original computed ``strategy`` but never returned it,
    so callers always received ``None``; it also set an unused local
    ``BATCH_SIZE``.
    """
    try:
        # TPUClusterResolver raises ValueError when no TPU is configured.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None

    # If TPU is not available, use default strategy (CPU/GPU)
    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.TPUStrategy(tpu)
    else:
        strategy = tf.distribute.get_strategy()
        print('Running on CPU/GPU')
    return strategy


def get_data_json() -> dict[str, list[dict[str, str | int]]]:
    fname = './02_NWPU_caption/dataset_nwpu.json'
    with open(fname) as f:
        data = json.load(f)
    return data

def extractor():
    """Load the NWPU caption JSON and split every caption into word lists.

    Returns a dict mapping image paths
    (``02_NWPU_RESISC45/<category>/<filename>``) to a list of five
    tokenized captions, each caption being a list of words.
    """
    def split_caption(caption: str):
        # Replace non-ASCII runs with a space, split on spaces/commas,
        # and drop empty fragments.
        cleaned = re.sub(r'[^\x00-\x7F]+', ' ', caption)
        return [token for token in re.split('[ ,]', cleaned) if token]

    raw_data = get_data_json()
    caption_fields = ['raw', 'raw_1', 'raw_2', 'raw_3', 'raw_4']

    captions_by_image = {}
    for category, entries in raw_data.items():
        for entry in entries:
            image_path = f'02_NWPU_RESISC45/{category}/{entry["filename"]}'
            captions_by_image[image_path] = [entry[field] for field in caption_fields]

    return {
        image_path: [split_caption(caption) for caption in captions]
        for image_path, captions in captions_by_image.items()
    }


def store_tokenizer_and_data():
    """Build a word->id tokenizer over all captions and persist both the
    tokenizer (tokenizer.json) and the tokenized captions (data.json).

    Index 0 is the empty string (padding); '[begin]' and '[end]' get the
    fixed ids 1 and 2; the remaining vocabulary follows in sorted order.
    Every caption is wrapped in '[begin]' / '[end]' markers before being
    mapped to ids.
    """
    processed_data = extractor()

    # Collect the lower-cased vocabulary across every caption.
    vocabulary = set()
    for sentences in processed_data.values():
        for sentence in sentences:
            for word in sentence:
                vocabulary.add(word.lower())

    ordered_words = ['', '[begin]', '[end]'] + sorted(vocabulary)
    tokenizer = {word: index for index, word in enumerate(ordered_words)}

    tokenized_data = {}
    for key, sentences in processed_data.items():
        tokenized_data[key] = [
            [tokenizer[word.lower()] for word in ['[begin]'] + sentence + ['[end]']]
            for sentence in sentences
        ]

    with open('tokenizer.json', 'w') as f:
        json.dump(tokenizer, f)
    with open('data.json', 'w') as f:
        json.dump(tokenized_data, f)


def load_tokenizer():
    """Load the previously stored word->id tokenizer from tokenizer.json."""
    with open('tokenizer.json') as source:
        tokenizer = json.load(source)
    return tokenizer


def split_data():
    """Split the tokenized captions in data.json into train/test splits.

    Iterating keys in insertion order, every 5th entry (i % 5 == 4) goes
    to the test split (~20%); the rest go to train. Each split is written
    as a JSON list of (key, captions) pairs, to train.json / test.json.
    """
    with open('data.json') as f:
        data = json.load(f)

    train_lst, test_lst = [], []
    for index, key in enumerate(data):
        bucket = test_lst if index % 5 == 4 else train_lst
        bucket.append((key, data[key]))

    with open('train.json', 'w') as f:
        json.dump(train_lst, f)
    with open('test.json', 'w') as f:
        json.dump(test_lst, f)