| | from .imports import *
|
| |
|
| |
|
def get_strategy():
    """Detect a TPU cluster and return the appropriate distribution strategy.

    Returns:
        tf.distribute.TPUStrategy when a TPU cluster is reachable, otherwise
        the default strategy from ``tf.distribute.get_strategy()`` (CPU/GPU).

    Note:
        The original version never returned the strategy it built (callers
        always received ``None``) and left an unused local ``BATCH_SIZE``;
        both are fixed here.
    """
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        # TPUClusterResolver raises ValueError when no TPU is available.
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.TPUStrategy(tpu)
    else:
        strategy = tf.distribute.get_strategy()
        print('Running on CPU/GPU')
    return strategy
|
| |
|
| |
|
| | def get_data_json() -> dict[str, list[dict[str, str | int]]]:
|
| | fname = './02_NWPU_caption/dataset_nwpu.json'
|
| | with open(fname) as f:
|
| | data = json.load(f)
|
| | return data
|
| |
|
def extractor():
    """Build ``{image_path: [word_list, ...]}`` from the raw NWPU captions.

    Each image contributes its five caption variants ('raw' .. 'raw_4'),
    each split into a list of ASCII words.
    """

    def split_words(caption: str):
        # Replace any non-ASCII run with a space, then split on spaces and
        # commas, discarding empty fragments.
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', caption)
        return [word for word in re.split('[ ,]', ascii_only) if word]

    raw_data = get_data_json()
    captions_by_path = {}
    for category, entries in raw_data.items():
        for entry in entries:
            path = f'02_NWPU_RESISC45/{category}/{entry["filename"]}'
            captions_by_path[path] = [
                entry['raw'],
                entry['raw_1'],
                entry['raw_2'],
                entry['raw_3'],
                entry['raw_4'],
            ]
    return {
        path: [split_words(caption) for caption in captions]
        for path, captions in captions_by_path.items()
    }
|
| |
|
| |
|
def store_tokenizer_and_data():
    """Build a word->id tokenizer over every caption word and persist both
    the tokenizer ('tokenizer.json') and the tokenized captions ('data.json').

    Id 0 is the empty-string padding token; '[begin]' and '[end]' get the
    next two fixed ids, followed by the sorted lowercase vocabulary.
    """
    captions = extractor()

    vocabulary = set()
    for sentences in captions.values():
        for sentence in sentences:
            for word in sentence:
                vocabulary.add(word.lower())

    ordered_words = ['', '[begin]', '[end]'] + sorted(vocabulary)
    tokenizer = {word: idx for idx, word in enumerate(ordered_words)}

    def encode(sentence):
        # Wrap each sentence with the begin/end markers before mapping to ids.
        return [tokenizer[word.lower()] for word in ['[begin]'] + sentence + ['[end]']]

    tokenized = {
        name: [encode(sentence) for sentence in sentences]
        for name, sentences in captions.items()
    }

    with open('tokenizer.json', 'w') as f:
        json.dump(tokenizer, f)
    with open('data.json', 'w') as f:
        json.dump(tokenized, f)
|
| |
|
| |
|
def load_tokenizer():
    """Read the persisted word->id mapping back from 'tokenizer.json'."""
    with open('tokenizer.json') as fh:
        tokenizer = json.load(fh)
    return tokenizer
|
| |
|
| |
|
def split_data():
    """Split 'data.json' into 'train.json' and 'test.json' (roughly 80/20).

    Entries are taken in dict order; every fifth entry (index % 5 == 4) goes
    to the test set, the rest to the train set. Each output element is a
    ``(key, value)`` pair.
    """
    with open('data.json') as f:
        data = json.load(f)

    train_lst, test_lst = [], []
    for idx, key in enumerate(data):
        bucket = test_lst if idx % 5 == 4 else train_lst
        bucket.append((key, data[key]))

    with open('train.json', 'w') as f:
        json.dump(train_lst, f)
    with open('test.json', 'w') as f:
        json.dump(test_lst, f)