import argparse
import os
import random

import numpy as np
import torch

from utils import *

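# NOTE: `utils` is assumed (it is not shown here) to provide the helpers used
# below: load_json, clean_text, set_device, and load_plm; load_plm is expected
# to return a (tokenizer, model) pair for the given checkpoint.
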
def load_data(args):
    # Load the per-user metadata file `<root>/<dataset>.user.json`.
    item2feature_path = os.path.join(args.root, f'{args.dataset}.user.json')
    item2feature = load_json(item2feature_path)

    return item2feature

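# Illustration only: given how the file is consumed below, the JSON is expected
# to look roughly like
#   {"user_explicit_preference": {"0": ["pref sentence 1", ...], "1": [...]}}
# i.e. a mapping from user id to a list of free-text preference strings.
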
def generate_text(item2feature):
    item_text_list = []

    for item in item2feature:
        data = item2feature[item]
        text = []

        # Clean every raw metadata string and keep the stripped result.
        for i in range(len(data)):
            meta_value = clean_text(data[i])
            text.append(meta_value.strip())

        item_text_list.append([int(item), text])

    return item_text_list

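# The resulting list has entries of the form [item_id, [sentence, ...]], e.g.
#   [[0, ["clean pref 1", "clean pref 2"]], [1, [...]], ...]
# (values are illustrative).
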
def preprocess_text(args):
    print('Process text data ......')
    print('Dataset:', args.dataset)

    item2feature = load_data(args)
    # Each "item" here is really a user: the file maps user ids to lists of
    # explicit preference sentences.
    item2feature = item2feature['user_explicit_preference']

    item_text_list = generate_text(item2feature)

    return item_text_list

def generate_item_embedding(args, item_text_list, tokenizer, model, word_drop_ratio=-1):
    print('Generate Text Embedding ......')
    print('Dataset:', args.dataset)

    # Re-order the texts so that index i holds the text of item i
    # (assumes ids are contiguous and start at 0).
    items, texts = zip(*item_text_list)
    order_texts = [[0]] * len(items)
    for item, text in zip(items, texts):
        order_texts[item] = text
    for text in order_texts:
        assert text != [0]

    embeddings = []
    start, batch_size = 0, 1
    with torch.no_grad():
        while start < len(order_texts):
            if (start + 1) % 100 == 0:
                print("==>", start + 1)
            field_texts = order_texts[start: start + batch_size]
            # Transpose so that each element groups one field across the batch.
            field_texts = zip(*field_texts)

            field_embeddings = []
            for sentences in field_texts:
                sentences = list(sentences)

                # Optionally drop words at random (augmentation; off by default).
                if word_drop_ratio > 0:
                    print(f'Word drop with p={word_drop_ratio}')
                    new_sentences = []
                    for sent in sentences:
                        new_sent = []
                        sent = sent.split(' ')
                        for wd in sent:
                            rd = random.random()
                            if rd > word_drop_ratio:
                                new_sent.append(wd)
                        new_sent = ' '.join(new_sent)
                        new_sentences.append(new_sent)
                    sentences = new_sentences

                encoded_sentences = tokenizer(sentences, max_length=args.max_sent_len,
                                              truncation=True, return_tensors='pt',
                                              padding="longest").to(args.device)
                outputs = model(input_ids=encoded_sentences.input_ids,
                                attention_mask=encoded_sentences.attention_mask)

                # Mean-pool the last hidden states over non-padding positions.
                masked_output = outputs.last_hidden_state * encoded_sentences['attention_mask'].unsqueeze(-1)
                mean_output = masked_output.sum(dim=1) / encoded_sentences['attention_mask'].sum(dim=-1, keepdim=True)
                mean_output = mean_output.detach().cpu()
                field_embeddings.append(mean_output)

            # Average the field-level embeddings into one vector per item.
            field_mean_embedding = torch.stack(field_embeddings, dim=0).mean(dim=0)
            embeddings.append(field_mean_embedding)
            start += batch_size

    embeddings = torch.cat(embeddings, dim=0).numpy()
    print('Embeddings shape: ', embeddings.shape)

    np.save(args.save_path, embeddings)

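# A minimal sketch of the masked mean pooling used above, on toy tensors
# (illustrative only; not executed by this script):
#
#   hidden = torch.randn(1, 4, 8)         # (batch, seq_len, hidden_size)
#   mask = torch.tensor([[1, 1, 1, 0]])   # last position is padding
#   pooled = (hidden * mask.unsqueeze(-1)).sum(dim=1) / mask.sum(dim=-1, keepdim=True)
#
# Zeroing the padded positions before summing keeps padding tokens from
# diluting the averaged embedding.
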
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='Arts', help='Instruments / Arts / Games')
    parser.add_argument('--root', type=str, default="")
    parser.add_argument('--gpu_id', type=int, default=2, help='ID of running GPU')
    parser.add_argument('--plm_name', type=str, default='llama')
    parser.add_argument('--plm_checkpoint', type=str, default='')
    parser.add_argument('--max_sent_len', type=int, default=2048)
    parser.add_argument('--word_drop_ratio', type=float, default=-1, help='word drop ratio, do not drop by default')
    parser.add_argument('--save_path', type=str, default="")
    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args()

    args.root = os.path.join(args.root, args.dataset)

    device = set_device(args.gpu_id)
    args.device = device

    item_text_list = preprocess_text(args)

    plm_tokenizer, plm_model = load_plm(args.plm_checkpoint)
    # LLaMA tokenizers ship without a pad token; fall back to id 0 so that
    # batched padding works.
    if plm_tokenizer.pad_token_id is None:
        plm_tokenizer.pad_token_id = 0
    plm_model = plm_model.to(device)

    generate_item_embedding(args, item_text_list, plm_tokenizer,
                            plm_model, word_drop_ratio=args.word_drop_ratio)
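
# Example invocation (script name and paths are placeholders):
#   python emb_user.py --dataset Arts --root data \
#       --plm_checkpoint /path/to/llama --save_path Arts.user_emb.npy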