Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| import os | |
| import os.path as osp | |
| import gzip | |
| import pickle | |
| import json | |
| import torch | |
| import pandas as pd | |
| import numpy as np | |
| from collections import Counter | |
| from tqdm import tqdm | |
| from huggingface_hub import hf_hub_download | |
| import zipfile | |
| from ogb.utils.url import download_url | |
| from src.benchmarks.semistruct.knowledge_base import SemiStructureKB | |
| from src.tools.process_text import clean_data, compact_text | |
| from src.tools.node import df_row_to_dict, Node, register_node | |
| from src.tools.io import save_files, load_files | |
# Hugging Face Hub coordinates (dataset repo + file inside it) of the
# pre-processed archive fetched by AmazonSemiStruct when no local cache exists.
PROCESSED_DATASET = dict(
    repo="snap-stanford/stark",
    file="skb/amazon/processed.zip",
)
class AmazonSemiStruct(SemiStructureKB):
    '''
    Semi-structured knowledge base built from Amazon product data.

    Product nodes carry metadata, reviews and Q&A entries and are connected by
    ``also_buy`` / ``also_view`` edges. Optional meta nodes (brand, category,
    color) are attached to products through ``has_<type>`` edges.
    '''

    # Categories for which review + meta files are downloaded.
    REVIEW_CATEGORIES = {
        'Amazon_Fashion', 'All_Beauty', 'Appliances',
        'Arts_Crafts_and_Sewing', 'Automotive', 'Books',
        'CDs_and_Vinyl', 'Cell_Phones_and_Accessories',
        'Clothing_Shoes_and_Jewelry', 'Digital_Music',
        'Electronics', 'Gift_Cards', 'Grocery_and_Gourmet_Food',
        'Home_and_Kitchen', 'Industrial_and_Scientific', 'Kindle_Store',
        'Luxury_Beauty', 'Magazine_Subscriptions', 'Movies_and_TV',
        'Musical_Instruments', 'Office_Products', 'Patio_Lawn_and_Garden',
        'Pet_Supplies', 'Prime_Pantry', 'Software', 'Sports_and_Outdoors',
        'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games',
    }

    # single answers
    QA_CATEGORIES = {
        'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive',
        'Baby', 'Beauty', 'Cell_Phones_and_Accessories',
        'Clothing_Shoes_and_Jewelry', 'Electronics',
        'Grocery_and_Gourmet_Food', 'Health_and_Personal_Care',
        'Home_and_Kitchen', 'Musical_Instruments', 'Office_Products',
        'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Sports_and_Outdoors',
        'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games',
    }

    # Categories accepted when the caller passes an explicit category list.
    COMMON = {
        'Appliances', 'Arts_Crafts_and_Sewing', 'Automotive',
        'Cell_Phones_and_Accessories', 'Clothing_Shoes_and_Jewelry', 'Electronics',
        'Grocery_and_Gourmet_Food', 'Home_and_Kitchen', 'Musical_Instruments',
        'Office_Products', 'Patio_Lawn_and_Garden', 'Pet_Supplies', 'Sports_and_Outdoors',
        'Tools_and_Home_Improvement', 'Toys_and_Games', 'Video_Games',
    }

    # JSON list of sub-category names that are allowed to become category nodes.
    # Loaded at class-definition time; the file must exist relative to the CWD.
    sub_category = 'data/amazon/category_list.json'
    with open(sub_category, 'r') as _category_file:
        SUB_CATEGORIES = set(json.load(_category_file))
    del _category_file  # keep the temporary handle out of the class namespace

    link_columns = ['also_buy', 'also_view']
    review_columns = ['reviewerID', 'summary', 'style', 'reviewText', 'vote', 'overall', 'verified', 'reviewTime']
    qa_columns = ['questionType', 'answerType', 'question', 'answer', 'answerTime']
    meta_columns = ['asin', 'title', 'global_category', 'category', 'price', 'brand', 'feature',
                    'rank', 'details', 'description']
    candidate_types = ['product']
    node_attr_dict = {'product': ['title', 'dimensions', 'weight', 'description', 'features', 'reviews', 'Q&A'],
                      'brand': ['brand_name'],
                      'category': ['category_name'],
                      'color': ['color_name']}

    def __init__(self,
                 root,
                 categories: list,
                 meta_link_types=['brand', 'category', 'color'],
                 max_entries=25,
                 download_processed=True,
                 **kwargs):
        '''
        Args:
            root (str): root directory to store the data
            categories (list): product categories; if it contains 'all', the
                full REVIEW_CATEGORIES / QA_CATEGORIES sets are used
            meta_link_types (list): a list which may contain entries in node info
                that are used to construct meta links, e.g. ['category', 'brand']
                will construct entity nodes of category and brand which link
                to the corresponding nodes. None disables meta links and the
                meta-link cache.
            max_entries (int): cut-off used when listing review & qa entries
                in the generated text
            download_processed (bool): download the processed archive from the
                Hugging Face Hub when no local processed data is found
            **kwargs: forwarded to the SemiStructureKB constructor
        '''
        self.root = root
        self.max_entries = max_entries
        self.raw_data_dir = osp.join(root, 'raw')
        self.processed_data_dir = osp.join(root, 'processed')
        os.makedirs(self.raw_data_dir, exist_ok=True)
        os.makedirs(self.processed_data_dir, exist_ok=True)

        # construct the graph based on link info in the raw data
        if meta_link_types is None:
            cache_path = None
        else:
            # Private copy: the default argument is a shared list object.
            meta_link_types = list(meta_link_types)
            cache_path = osp.join(self.processed_data_dir, 'cache', '-'.join(meta_link_types))

        # Without meta links there is no cache directory, so look for the plain
        # processed data instead (joining a path onto None would raise TypeError).
        marker_dir = self.processed_data_dir if cache_path is None else cache_path
        if not osp.exists(osp.join(marker_dir, 'node_info.pkl')) and download_processed:
            print('Downloading processed data...')
            processed_path = hf_hub_download(
                PROCESSED_DATASET["repo"],
                PROCESSED_DATASET["file"],
                repo_type="dataset"
            )
            with zipfile.ZipFile(processed_path, 'r') as zip_ref:
                zip_ref.extractall(self.root)
            # The archive is no longer needed once extracted.
            os.remove(processed_path)
            print('Downloaded processed data!')

        if cache_path is not None and osp.exists(cache_path):
            print(f'Load cached graph with meta link types {meta_link_types}')
            processed_data = load_files(cache_path)
        else:
            print(f'Start processing raw data...')
            print(f'{meta_link_types=}')
            processed_data = self._process_raw(categories)
            if meta_link_types:
                # customize the graph by adding meta links
                processed_data = self.post_process(processed_data,
                                                   meta_link_types=meta_link_types,
                                                   cache_path=cache_path)
        super().__init__(**processed_data, **kwargs)

    def __getitem__(self, idx):
        '''Return a Node populated from ``self.node_info[int(idx)]``.'''
        node = Node()
        register_node(node, self.node_info[int(idx)])
        return node

    @staticmethod
    def _rank_reviews_by_vote(reviews):
        '''Return review positions ordered by descending vote count (missing votes count as 0).'''
        scores = [0 if pd.isnull(review['vote']) else int(review['vote'].replace(",", ""))
                  for review in reviews]
        return np.argsort(-np.array(scores))

    def get_chunk_info(self, idx, attribute):
        '''
        Return the text chunk for one attribute of node ``idx``.

        Args:
            idx (int): node id
            attribute (str): attribute name; names containing 'feature',
                'review', 'qa' or 'description' get dedicated formatting
        Returns:
            '' if the node lacks the attribute, the formatted string for the
            special attributes, otherwise the raw attribute value.
        '''
        node = self[idx]
        if not hasattr(node, attribute):
            return ''
        node_attr = getattr(node, attribute)

        if 'feature' in attribute:
            # Skip blank entries and entries that mention an ASIN.
            return ' '.join(feature for feature in node_attr
                            if feature != '' and 'asin' not in feature.lower())

        if 'review' in attribute:
            parts = []
            if len(node_attr):
                for position, review_idx in enumerate(self._rank_reviews_by_vote(node_attr)):
                    review = node_attr[review_idx]
                    parts.append('The review "' + str(review['summary']) + '"')
                    parts.append('states that "' + str(review['reviewText']) + '". ')
                    # NOTE(review): checked after appending and with `>`, so up to
                    # max_entries + 2 reviews are emitted. Left unchanged because it
                    # alters the generated text - confirm the intended limit.
                    if position > self.max_entries:
                        break
            return ''.join(parts)

        if 'qa' in attribute:
            parts = []
            if len(node_attr):
                for position, question in enumerate(node_attr):
                    parts.append('The question is "' + str(question['question']) + '", ')
                    parts.append('and the answer is "' + str(question['answer']) + '". ')
                    # NOTE(review): same max_entries + 2 cut-off as for reviews.
                    if position > self.max_entries:
                        break
            return ''.join(parts)

        if 'description' in attribute and len(node_attr):
            return " ".join(node_attr)

        return node_attr

    def get_doc_info(self, idx,
                     add_rel=True,
                     compact=False):
        '''
        Build the textual document describing node ``idx``.

        Args:
            idx (int): node id
            add_rel (bool): append the relation section from get_rel_info
            compact (bool): pass the result through compact_text
        Returns:
            str: a one-line description for brand/category/color nodes, or a
            multi-section document for product nodes.
        '''
        node_type = self.node_type_dict[int(self.node_types[idx])]
        if node_type == 'brand':
            return f'brand name: {self[idx].brand_name}'
        if node_type == 'category':
            return f'category name: {self[idx].category_name}'
        if node_type == 'color':
            return f'color name: {self[idx].color_name}'

        node = self[idx]
        doc = f'- product: {node.title}\n'
        if hasattr(node, 'brand'):
            doc += f'- brand: {node.brand}\n'

        # Best effort: many products have no (or differently shaped) dimension
        # details, in which case the two lines are simply omitted.
        try:
            dimensions, weight = node.details.dictionary.product_dimensions.split(' ; ')
        except Exception:
            pass
        else:
            doc += (f'- dimensions: {dimensions}\n'
                    f'- weight: {weight}\n')

        if len(node.description):
            description = " ".join(node.description).strip(" ")
            if len(description) > 0:
                doc += f'- description: {description}\n'

        feature_text = ''
        if len(node.feature):
            feature_text = f'- features: \n'
            for feature_idx, feature in enumerate(node.feature):
                if feature == '' or 'asin' in feature.lower():
                    continue
                # Numbering keeps the position in the original feature list.
                feature_text += f'#{feature_idx + 1}: {feature}\n'

        review_text = ''
        if len(node.review):
            review_text = f'- reviews: \n'
            for i, review_idx in enumerate(self._rank_reviews_by_vote(node.review)):
                review = node.review[review_idx]
                # The label is the review's original position, not its rank.
                review_text += (f'#{review_idx + 1}:\n'
                                f'summary: {review["summary"]}\n'
                                f'text: "{review["reviewText"]}"\n')
                # NOTE(review): emits up to max_entries + 2 reviews; see get_chunk_info.
                if i > self.max_entries:
                    break

        qa_text = ''
        if len(node.qa):
            qa_text = f'- Q&A: \n'
            for qa_idx, qa in enumerate(node.qa):
                qa_text += (f'#{qa_idx + 1}:\n'
                            f'question: "{qa["question"]}"\n'
                            f'answer: "{qa["answer"]}"\n')
                # NOTE(review): emits up to max_entries + 2 entries; see get_chunk_info.
                if qa_idx > self.max_entries:
                    break

        doc += feature_text + review_text + qa_text

        if add_rel:
            doc += self.get_rel_info(idx)
        if compact:
            doc = compact_text(doc)
        return doc

    def get_rel_info(self, idx, rel_types=None, n_rel=-1):
        '''
        Describe the also_buy / also_view / has_brand neighbours of node ``idx``.

        ``rel_types`` and ``n_rel`` are accepted for interface compatibility
        but are not used by this implementation.

        Returns:
            str: '' when the node has no such neighbours, otherwise a
            '- relations:' section.
        '''
        n_also_buy = self.get_neighbor_nodes(idx, 'also_buy')
        n_also_view = self.get_neighbor_nodes(idx, 'also_view')
        n_has_brand = self.get_neighbor_nodes(idx, 'has_brand')

        str_also_buy = ''.join(f"#{rank + 1}: " + self[i].title + '\n'
                               for rank, i in enumerate(n_also_buy))
        str_also_view = ''.join(f"#{rank + 1}: " + self[i].title + '\n'
                                for rank, i in enumerate(n_also_view))

        doc = ''
        if len(str_also_buy):
            doc += f' products also purchased: \n{str_also_buy}'
        if len(str_also_view):
            doc += f' products also viewed: \n{str_also_view}'
        if len(n_has_brand):
            # Only the first brand neighbour is reported.
            doc += f' brand: {self[n_has_brand[0]].brand_name}\n'

        if len(doc):
            doc = '- relations:\n' + doc
        return doc

    @staticmethod
    def _load_or_build_pickle(path, build):
        '''Return the DataFrame pickled at ``path``; if absent, call ``build()``, pickle and return it.'''
        if os.path.exists(path):
            return pd.read_pickle(path)
        frame = build()
        frame.to_pickle(path)
        return frame

    def _process_raw(self, categories):
        '''
        Load the processed product graph, building it from raw files if needed.

        Args:
            categories (list): category names, or a list containing 'all'
        Returns:
            dict with node_info, edge_index, edge_types, edge_type_dict,
            node_types and node_type_dict (all nodes are of type 'product').
        Raises:
            AssertionError: if an explicit category is not in COMMON.
        '''
        if 'all' in categories:
            review_categories = self.REVIEW_CATEGORIES
            qa_categories = self.QA_CATEGORIES
        else:
            qa_categories = review_categories = categories
            # Explicit raise (instead of `assert`) so the check survives `python -O`.
            if len(set(categories) - self.COMMON) != 0:
                raise AssertionError(f'invalid categories exist')

        if osp.exists(osp.join(self.processed_data_dir, 'node_info.pkl')):
            print(f'Load processed data from {self.processed_data_dir}')
            loaded_files = load_files(self.processed_data_dir)
            loaded_files.update(
                {'node_types': torch.zeros(len(loaded_files['node_info'])),
                 'node_type_dict': {0: 'product'}})
            return loaded_files

        print(f'Check data downloading...')
        review_header = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2'
        qa_header = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon/qa'
        for category in review_categories:
            # Review and meta files are checked separately so that a missing
            # meta file is fetched even when the review file already exists.
            for url in (f'{review_header}/categoryFiles/{category}.json.gz',
                        f'{review_header}/metaFiles2/meta_{category}.json.gz'):
                if not os.path.exists(osp.join(self.raw_data_dir, url.rpartition('/')[2])):
                    print(f'Downloading {category} data...')
                    download_url(url, self.raw_data_dir)
        for category in qa_categories:
            if not os.path.exists(osp.join(self.raw_data_dir, f'qa_{category}.json.gz')):
                print(f'Downloading {category} QA data...')
                download_url(f'{qa_header}/qa_{category}.json.gz', self.raw_data_dir)

        # Intermediate pickles are keyed by file name only: they are reused
        # as-is even if `categories` differs from the run that created them.
        ckt_path = 'data/amazon/intermediate'
        os.makedirs(ckt_path, exist_ok=True)
        print('Loading data... It might take a while')

        # read amazon QA data
        df_qa = self._load_or_build_pickle(
            os.path.join(ckt_path, 'df_qa.pkl'),
            lambda: pd.concat([read_qa(osp.join(self.raw_data_dir, f'qa_{category}.json.gz'))
                               for category in qa_categories])[['asin'] + self.qa_columns])
        print('df_qa loaded')

        # read amazon review data
        df_review = self._load_or_build_pickle(
            os.path.join(ckt_path, 'df_review.pkl'),
            lambda: pd.concat([read_review(osp.join(self.raw_data_dir, f'{category}.json.gz'))
                               for category in review_categories])[['asin'] + self.review_columns])
        print('df_review loaded')

        # read amazon meta data from amazon review & amazon kdd
        def build_meta():
            frames = []
            for category in review_categories:
                frame = read_review(osp.join(self.raw_data_dir, f'meta_{category}.json.gz'))
                frame.insert(0, 'global_category', category.replace('_', ' '))
                frames.append(frame)
            return pd.concat(frames)

        df_ucsd_meta = self._load_or_build_pickle(
            os.path.join(ckt_path, 'df_ucsd_meta.pkl'), build_meta)
        print('df_ucsd_meta loaded')

        print('Preprocessing data...')
        df_ucsd_meta = df_ucsd_meta.drop_duplicates(subset='asin', keep='first')
        df_meta = df_ucsd_meta[self.meta_columns + self.link_columns]

        # Keep only items with both meta and review data. The ASIN intersection
        # is computed directly instead of materialising a full review x meta merge.
        review_asins = df_review['asin']
        unique_asin = review_asins[review_asins.isin(df_meta['asin'])].unique()

        df_qa_reduced = df_qa[df_qa['asin'].isin(unique_asin)]
        df_review_reduced = df_review[df_review['asin'].isin(unique_asin)]
        df_meta_reduced = df_meta[df_meta['asin'].isin(unique_asin)].reset_index()

        print('Construct node info and graph...')
        # get mapping from asin to node id and its reversed mapping
        # (node id == row position in df_meta_reduced)
        asins = df_meta_reduced['asin'].tolist()
        self.asin2id = {asin: node_id for node_id, asin in enumerate(asins)}
        self.id2asin = dict(enumerate(asins))

        node_info = self.construct_raw_node_info(df_meta_reduced, df_review_reduced, df_qa_reduced)
        edge_index, edge_types = self.create_raw_product_graph(df_meta_reduced,
                                                               columns=self.link_columns)
        edge_type_dict = {0: 'also_buy', 1: 'also_view'}
        processed_data = {
            'node_info': node_info,
            'edge_index': edge_index,
            'edge_types': edge_types,
            'edge_type_dict': edge_type_dict}

        print(f'Saving to {self.processed_data_dir}...')
        save_files(save_path=self.processed_data_dir, **processed_data)

        processed_data.update({'node_types': torch.zeros(len(processed_data['node_info'])),
                               'node_type_dict': {0: 'product'}})
        return processed_data

    def post_process(self, raw_info, meta_link_types, cache_path=None):
        '''
        Add meta nodes (brand / category / color) and their has_<type> edges.

        ``raw_info['node_info']``, ``['edge_type_dict']`` and
        ``['node_type_dict']`` are extended in place.

        Args:
            raw_info (dict): output of _process_raw
            meta_link_types (list): subset of ['brand', 'category', 'color']
            cache_path (str): if given, the result is saved there via save_files
        Returns:
            dict with node_info, edge_index, edge_types, edge_type_dict,
            node_type_dict and node_types.
        Raises:
            ValueError: for an unknown meta link type.
        '''
        print(f'Adding meta link types {meta_link_types}')
        node_info = raw_info['node_info']
        edge_type_dict = raw_info['edge_type_dict']
        node_type_dict = raw_info['node_type_dict']
        node_types = raw_info['node_types'].tolist()
        edge_index = raw_info['edge_index'].tolist()
        edge_types = raw_info['edge_types'].tolist()

        n_e_types, n_n_types = len(edge_type_dict), len(node_type_dict)
        for i, link_type in enumerate(meta_link_types):
            # Collect (value, node position) pairs. The enumeration position is
            # used as node id - assumes node_info keys are 0..n-1 in insertion order.
            value_list, index_list = [], []
            if link_type == 'brand':
                # single value per node
                for idx, node_info_i in enumerate(node_info.values()):
                    if link_type in node_info_i:
                        value_list.append(node_info_i[link_type])
                        index_list.append(idx)
            elif link_type in ['category', 'color']:
                # list of values per node
                for idx, node_info_i in enumerate(node_info.values()):
                    if link_type in node_info_i:
                        value_list.extend(node_info_i[link_type])
                        index_list.extend([idx] * len(node_info_i[link_type]))
            else:
                raise ValueError(f'Invalid meta link type {link_type}')
            values = np.array(value_list)
            indices = np.array(index_list)

            cur_n_nodes = len(node_info)
            node_type_dict[n_n_types + i] = link_type
            edge_type_dict[n_e_types + i] = "has_" + link_type

            # Group node ids by value in one stable pass instead of building a
            # boolean mask per unique value (which is O(#unique * #values)).
            unique, inverse = np.unique(values, return_inverse=True)
            inverse = inverse.reshape(-1)
            order = np.argsort(inverse, kind='stable')  # keeps ids in original order per group
            group_sizes = np.bincount(inverse, minlength=len(unique))
            grouped_ids = np.split(indices[order], np.cumsum(group_sizes)[:-1])

            for j, (unique_j, ids) in tqdm(enumerate(zip(unique, grouped_ids))):
                node_info[cur_n_nodes + j] = {link_type + '_name': unique_j}
                edge_index[0].extend(ids.tolist())
                edge_index[1].extend([cur_n_nodes + j] * len(ids))
                edge_types.extend([i + n_e_types] * len(ids))
            node_types.extend([n_n_types + i] * len(unique))
            print(f'finished adding {link_type}')

        edge_index = torch.LongTensor(edge_index)
        edge_types = torch.LongTensor(edge_types)
        node_types = torch.LongTensor(node_types)
        files = {'node_info': node_info,
                 'edge_index': edge_index,
                 'edge_types': edge_types,
                 'edge_type_dict': edge_type_dict,
                 'node_type_dict': node_type_dict,
                 'node_types': node_types
                 }
        if cache_path is not None:
            save_files(cache_path, **files)
        return files

    def _process_brand(self, brand):
        '''
        Normalise a raw brand string.

        Strips surrounding punctuation/whitespace, then removes a leading
        'by ', a trailing '.com' and a leading 'www.'; strings longer than 100
        characters are reduced to their first space-separated token.
        '''
        # Same character set as before; the invalid "\/" escape is spelled "/"
        # because the backslash is already covered by the trailing "\\".
        brand = brand.strip(" \".*+,-_!@#$%^&*();/|<>'\t\n\r\\")
        if len(brand) > 3 and brand.startswith('by '):
            brand = brand[3:]
        if len(brand) > 4 and brand.endswith('.com'):
            brand = brand[:-4]
        if len(brand) > 4 and brand.startswith('www.'):
            brand = brand[4:]
        if len(brand) > 100:
            brand = brand.split(' ')[0]
        return brand

    @staticmethod
    def _assign_colors(df_review, lower_limit=20):
        '''
        Map each asin to the frequent color names found in its review styles.

        A color is kept when more than ``lower_limit`` products use it, it has
        more than 2 characters, fewer than 5 space-separated words and is not
        purely numeric.
        '''
        df_review = df_review[['asin', 'style']].dropna(subset=['style'])
        raw_color_dict = {}
        for asin, style in tqdm(zip(df_review['asin'], df_review['style'])):
            for key in style.keys():
                if 'color' in key.lower():
                    raw = style[key]
                    # NOTE(review): the non-string branch is not lowercased,
                    # unlike the string branch - kept as in the original.
                    color = raw.strip().lower() if isinstance(raw, str) else raw[0].strip()
                    raw_color_dict.setdefault(asin, []).append(color)

        all_color_values = []
        for asin in raw_color_dict:
            raw_color_dict[asin] = list(set(raw_color_dict[asin]))
            all_color_values.extend(raw_color_dict[asin])
        print('number of all colors', len(all_color_values))
        color_counter = Counter(all_color_values)
        print('number of unique colors', len(color_counter))

        # A set gives O(1) membership tests in the filtering loop below.
        selected_colors = {
            color for color, number in color_counter.items()
            if number > lower_limit and len(color) > 2
            and len(color.split(' ')) < 5 and color.isnumeric() is False
        }
        print('number of selected colors', len(selected_colors))

        filtered_color_dict = {}
        total_color_connections = 0
        for asin, colors in raw_color_dict.items():
            filtered_color_dict[asin] = [value for value in colors if value in selected_colors]
            total_color_connections += len(filtered_color_dict[asin])
        print('number of linked products', len(filtered_color_dict))
        print('number of total connections', total_color_connections)
        return filtered_color_dict

    def construct_raw_node_info(self, df_meta, df_review, df_qa):
        '''
        Build the per-node info dict for product nodes.

        Requires ``self.asin2id`` / ``self.id2asin`` to be set beforehand.

        Args:
            df_meta: one row per product with meta_columns
            df_review: review rows ('asin' + review_columns)
            df_qa: Q&A rows ('asin' + qa_columns)
        Returns:
            dict: node id -> dict with 'review' and 'qa' lists, optional
            'color', 'brand', 'category', and the remaining meta columns.
        '''
        node_info = {idx: {'review': [], 'qa': []} for idx in self.id2asin}

        ###################### Assign color ########################
        color_cache_dir = 'data/amazon/intermediate'
        os.makedirs(color_cache_dir, exist_ok=True)
        filtered_color_dict_path = os.path.join(color_cache_dir, 'filtered_color_dict.pkl')
        if os.path.exists(filtered_color_dict_path):
            # Locally produced cache file written by the branch below.
            with open(filtered_color_dict_path, 'rb') as f:
                filtered_color_dict = pickle.load(f)
        else:
            filtered_color_dict = self._assign_colors(df_review)
            with open(filtered_color_dict_path, 'wb') as f:
                pickle.dump(filtered_color_dict, f)

        for asin in tqdm(df_meta['asin']):
            colors = filtered_color_dict.get(asin)
            if colors:
                node_info[self.asin2id[asin]]['color'] = colors
        print('loaded color')
        ####################################################################

        for _, row in tqdm(df_meta.iterrows(), total=len(df_meta)):
            idx = self.asin2id[row['asin']]
            for column in self.meta_columns:
                if column == 'brand':
                    brand = self._process_brand(clean_data(row[column]))
                    if len(brand) > 1:
                        node_info[idx]['brand'] = brand
                elif column == 'category':
                    # Keep only known sub-categories (compared in lower case).
                    category_list = [category.lower() for category in row[column]
                                     if category.lower() in self.SUB_CATEGORIES]
                    if len(category_list) > 0:
                        node_info[idx]['category'] = category_list
                else:
                    node_info[idx][column] = clean_data(row[column])

        # 'style' was only needed for color assignment. Build a filtered copy
        # rather than removing it from the shared class-level list.
        review_columns = [column for column in self.review_columns if column != 'style']
        for name, df, column_names in (('review', df_review, review_columns),
                                       ('qa', df_qa, self.qa_columns)):
            for _, row in tqdm(df.iterrows(), total=len(df)):
                idx = self.asin2id[row['asin']]
                node_info[idx][name].append(
                    df_row_to_dict(row, colunm_names=column_names))
        return node_info

    def create_raw_product_graph(self, df, columns):
        '''
        Build directed product-product edges from the link columns.

        Args:
            df: meta DataFrame with an 'asin' column and the link columns
            columns (list): link column names; position in the list is the edge type id
        Returns:
            (edge_index, edge_types): LongTensors; edge_index[0] holds source
            nodes, edge_index[1] targets. Targets unknown to asin2id are dropped.
        '''
        edge_types = []
        edge_index = [[], []]
        link_values = [df[edge_type].tolist() for edge_type in columns]
        for row_pos, asin in enumerate(df['asin'].tolist()):
            out_node = self.asin2id[asin]
            for edge_type_id, column_values in enumerate(link_values):
                targets = column_values[row_pos]
                if not isinstance(targets, list):
                    # missing value (e.g. NaN): no links of this type
                    continue
                in_nodes = [self.asin2id[target] for target in targets if target in self.asin2id]
                edge_types.extend([edge_type_id] * len(in_nodes))
                edge_index[0].extend([out_node] * len(in_nodes))
                edge_index[1].extend(in_nodes)
        return torch.LongTensor(edge_index), torch.LongTensor(edge_types)

    def has_brand(self, idx, brand):
        '''Return True if node ``idx`` has brand ``brand`` (case-insensitive, ignoring '.com' and quotes); False on any failure.'''
        try:
            b = self[idx].brand
            if len(b) > 4 and b.endswith('.com'):
                b = b[:-4]
            if len(brand) > 4 and brand.endswith('.com'):
                brand = brand[:-4]
            return b.lower().strip("\"") == brand.lower().strip("\"")
        except Exception:
            return False

    def has_also_buy(self, idx, also_buy_item):
        '''Return True if ``also_buy_item`` is an also_buy neighbour of node ``idx``; False on any failure.'''
        try:
            return also_buy_item in self.get_neighbor_nodes(idx, 'also_buy')
        except Exception:
            return False

    def has_also_view(self, idx, also_view_item):
        '''Return True if ``also_view_item`` is an also_view neighbour of node ``idx``; False on any failure.'''
        try:
            return also_view_item in self.get_neighbor_nodes(idx, 'also_view')
        except Exception:
            return False
| # read review files | |
def read_review(path):
    '''
    Read a gzip-compressed JSON-lines file into a DataFrame.

    Args:
        path (str): path to a ``.json.gz`` file with one JSON object per line
    Returns:
        pd.DataFrame: one row per line, indexed 0..n-1; columns are the union
        of the keys seen (missing values become NaN).
    '''
    # The context manager guarantees the gzip handle is closed, also on errors.
    with gzip.open(path, 'rb') as handle:
        records = {row_id: json.loads(line) for row_id, line in enumerate(handle)}
    return pd.DataFrame.from_dict(records, orient='index')
| # read qa files | |
def read_qa(path):
    '''
    Read a gzip-compressed file of Python-literal dicts (one per line) into a DataFrame.

    Args:
        path (str): path to a ``qa_*.json.gz`` file whose lines are Python
            dict literals (not strict JSON)
    Returns:
        pd.DataFrame: one row per line, indexed 0..n-1.
    Raises:
        ValueError / SyntaxError: if a line is not a plain Python literal.
    '''
    import ast

    # SECURITY: the files are downloaded from the network, so lines are parsed
    # with ast.literal_eval (literals only) instead of eval, which would
    # execute arbitrary expressions embedded in the data.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        records = {row_id: ast.literal_eval(line.strip())
                   for row_id, line in enumerate(handle)}
    return pd.DataFrame.from_dict(records, orient='index')