After saving processed_data.json, the following script tokenizes the processed entries, builds a vocabulary from them, and writes the tokenized data to disk:

```python
import json

from utils import build_vocab, load_data, save_vocab, tokenize


def prepare_training_data(processed_data, vocab_path='vocab.json'):
    """Tokenize each entry, then build and save a vocabulary."""
    tokenized_texts = []
    for entry in processed_data:
        if isinstance(entry, str):
            tokenized_texts.append(tokenize(entry))
        elif isinstance(entry, list):
            # Entries may be lists of strings; tokenize each string inside.
            for item in entry:
                if isinstance(item, str):
                    tokenized_texts.append(tokenize(item))
    vocab = build_vocab(tokenized_texts)
    save_vocab(vocab, vocab_path)
    return tokenized_texts, vocab


if __name__ == "__main__":
    data = load_data()
    tokenized_texts, vocab = prepare_training_data(data)
    # Save the tokenized data next to the other processed artifacts.
    with open('data/processed/tokenized_data.json', 'w', encoding='utf-8') as f:
        json.dump(tokenized_texts, f, ensure_ascii=False, indent=4)
    print("Data processing complete. Tokenized data saved to data/processed/tokenized_data.json")
```