|
|
import re |
|
|
from collections import Counter |
|
|
|
|
|
def read_sentences(file_path):
    """Return the lines of a UTF-8 text file, one sentence per line.

    Args:
        file_path: path to the text file to read.

    Returns:
        A list of raw lines; trailing newlines are preserved (callers
        strip them during preprocessing).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.readlines()
|
|
|
|
|
def preprocess_sentences(sentences):
    """Strip, wrap with <start>/<end> markers, and tokenize each sentence.

    Args:
        sentences: iterable of raw sentence strings (e.g. from readlines()).

    Returns:
        A list of token lists. The '<start>'/'<end>' markers survive as
        single tokens so they line up with the special tokens reserved by
        create_vocab.
    """
    # Compile once instead of re-parsing the pattern per sentence.
    # Bug fix: the original pattern r"[\w']+|[.,!?;]" tokenized '<start>' as
    # 'start' (the angle brackets were dropped), so the emitted tokens never
    # matched the '<start>'/'<end>' vocabulary entries. Matching marker
    # tokens (<\w+>) first in the alternation keeps them intact.
    token_pattern = re.compile(r"<\w+>|[\w']+|[.,!?;]")

    sentences = [sentence.strip() for sentence in sentences]
    sentences = ['<start> ' + sentence + ' <end>' for sentence in sentences]

    return [token_pattern.findall(sentence) for sentence in sentences]
|
|
|
|
|
def create_vocab(tokenized_sentences):
    """Build word-to-id and id-to-word mappings from tokenized sentences.

    Ids 0-3 are reserved for '<pad>', '<start>', '<end>', '<unk>' in that
    order; remaining words are assigned ids by descending corpus frequency
    (ties keep first-seen order, since sorted() is stable).

    Args:
        tokenized_sentences: list of token lists (output of
            preprocess_sentences).

    Returns:
        (word_to_id, id_to_word): a dict mapping each word to a unique int
        id, and its exact inverse.
    """
    word_freq = Counter(word for sentence in tokenized_sentences for word in sentence)

    # Most frequent first; sorted() is stable, so equal-frequency words keep
    # their first-seen order.
    sorted_words = sorted(word_freq, key=word_freq.get, reverse=True)

    special_tokens = ['<pad>', '<start>', '<end>', '<unk>']
    # Bug fix: special tokens also occur in the corpus (the '<start>'/'<end>'
    # markers added during preprocessing). The original code prepended them
    # unconditionally, creating duplicates whose later corpus indices
    # clobbered the reserved ids 1 and 2 in word_to_id and left holes at
    # those ids in id_to_word. Filter duplicates before prepending.
    sorted_words = special_tokens + [w for w in sorted_words if w not in special_tokens]

    word_to_id = {word: idx for idx, word in enumerate(sorted_words)}
    id_to_word = {idx: word for word, idx in word_to_id.items()}

    return word_to_id, id_to_word
|
|
|
|
|
|
|
|
def main():
    """Script entry point: build a vocabulary from the merged corpus file.

    Returns:
        (word_to_id, id_to_word) mappings built from Files/merged.txt.
    """
    file_path = "Files/merged.txt"
    sentences = read_sentences(file_path)
    tokenized_sentences = preprocess_sentences(sentences)
    word_to_id, id_to_word = create_vocab(tokenized_sentences)
    return word_to_id, id_to_word


# Guarded so importing this module for its functions does not trigger
# file I/O on the hard-coded corpus path.
if __name__ == "__main__":
    main()
|
|
|