AgGPT-16 / AgGPT_Feather.py
AGofficial's picture
Upload 7 files
cb29702 verified
import pandas as pd
from collections import defaultdict, Counter
def save_model(models, model_file, word_to_id, id_to_word):
print(f"Saving model to {model_file}...")
model_data = []
vocab_data = []
for word, word_id in word_to_id.items():
vocab_data.append({'word': word, 'id': word_id})
if 1 in models:
for word_id, count in models[1].items():
model_data.append({'n': 1, 'prefix': '_UNIGRAM_', 'suffix': word_id, 'count': count})
for n, prefixes in models.items():
if n > 1:
for prefix, counter in prefixes.items():
prefix_str = ' '.join(map(str, prefix))
for suffix, count in counter.items():
model_data.append({
'n': n, 'prefix': prefix_str, 'suffix': suffix, 'count': count
})
df_model = pd.DataFrame(model_data)
df_vocab = pd.DataFrame(vocab_data)
combined_df = pd.concat([
df_model.assign(data_type='model'),
df_vocab.assign(data_type='vocab')
], ignore_index=True)
combined_df.to_feather(model_file)
print("Model saved successfully.")
def load_model(model_file):
print(f"Loading model from {model_file}...")
df = pd.read_feather(model_file)
models = defaultdict(lambda: defaultdict(Counter))
word_to_id = {}
id_to_word = {}
if 'data_type' in df.columns:
vocab_df = df[df['data_type'] == 'vocab']
for _, row in vocab_df.iterrows():
word = row['word']
word_id = row['id']
word_to_id[word] = word_id
id_to_word[word_id] = word
model_df = df[df['data_type'] == 'model']
else:
model_df = df
unigram_df = model_df[model_df['n'] == 1]
for _, row in unigram_df.iterrows():
models[1][row['suffix']] = row['count']
ngram_df = model_df[model_df['n'] > 1]
for _, row in ngram_df.iterrows():
n, prefix_str, suffix, count = row['n'], row['prefix'], row['suffix'], row['count']
prefix = tuple(map(int, prefix_str.split()))
models[n][prefix][suffix] += count
print("Model loaded successfully.")
if word_to_id and id_to_word:
return models, word_to_id, id_to_word
else:
return models