| import pandas as pd |
| from collections import defaultdict, Counter |
|
|
| def save_model(models, model_file, word_to_id, id_to_word): |
| print(f"Saving model to {model_file}...") |
| model_data = [] |
| |
| vocab_data = [] |
| for word, word_id in word_to_id.items(): |
| vocab_data.append({'word': word, 'id': word_id}) |
| |
| if 1 in models: |
| for word_id, count in models[1].items(): |
| model_data.append({'n': 1, 'prefix': '_UNIGRAM_', 'suffix': word_id, 'count': count}) |
| |
| for n, prefixes in models.items(): |
| if n > 1: |
| for prefix, counter in prefixes.items(): |
| prefix_str = ' '.join(map(str, prefix)) |
| for suffix, count in counter.items(): |
| model_data.append({ |
| 'n': n, 'prefix': prefix_str, 'suffix': suffix, 'count': count |
| }) |
| |
| df_model = pd.DataFrame(model_data) |
| df_vocab = pd.DataFrame(vocab_data) |
| |
| combined_df = pd.concat([ |
| df_model.assign(data_type='model'), |
| df_vocab.assign(data_type='vocab') |
| ], ignore_index=True) |
| |
| combined_df.to_feather(model_file) |
| print("Model saved successfully.") |
|
|
| def load_model(model_file): |
| print(f"Loading model from {model_file}...") |
| df = pd.read_feather(model_file) |
| |
| models = defaultdict(lambda: defaultdict(Counter)) |
| word_to_id = {} |
| id_to_word = {} |
| |
| if 'data_type' in df.columns: |
| vocab_df = df[df['data_type'] == 'vocab'] |
| for _, row in vocab_df.iterrows(): |
| word = row['word'] |
| word_id = row['id'] |
| word_to_id[word] = word_id |
| id_to_word[word_id] = word |
| |
| model_df = df[df['data_type'] == 'model'] |
| else: |
| model_df = df |
| |
| unigram_df = model_df[model_df['n'] == 1] |
| for _, row in unigram_df.iterrows(): |
| models[1][row['suffix']] = row['count'] |
| |
| ngram_df = model_df[model_df['n'] > 1] |
| for _, row in ngram_df.iterrows(): |
| n, prefix_str, suffix, count = row['n'], row['prefix'], row['suffix'], row['count'] |
| prefix = tuple(map(int, prefix_str.split())) |
| models[n][prefix][suffix] += count |
| |
| print("Model loaded successfully.") |
| |
| if word_to_id and id_to_word: |
| return models, word_to_id, id_to_word |
| else: |
| return models |
|
|