Spaces:
Runtime error
Runtime error
| import click | |
| from qdrant_client import QdrantClient, models | |
| from openai import OpenAI | |
| from tqdm import tqdm | |
| import json | |
| import requests | |
| import os | |
| from prompts import REVIEWS_SYSTEM_PROMPT, REVIEWS_USER_PROMPT | |
# TripAdvisor API key, read from the environment once at import time.
# `os.environ.get` yields None when the variable is unset; the value is sent
# as the `key` query parameter by `_get_loc_id` and `get_hotel_info`.
TRIPADVISOR_API_KEY = os.environ.get('TRIPADVISOR_API_KEY')
def save_json(data, path):
    """Write ``data`` to ``path`` as JSON, replacing any existing file.

    Args:
        data: JSON-serializable object
        path (str): destination file path
    """
    with open(path, "w") as sink:
        json.dump(data, fp=sink)
def get_df(dataset_path, is_hf):
    """Load the hotels dataset into a pandas DataFrame.

    Args:
        dataset_path (str): argument for ``datasets.load_dataset`` when
            ``is_hf`` is truthy, otherwise a path for ``pandas.read_csv``
        is_hf (bool): whether to load the dataset through ``datasets``
    Returns:
        pd.DataFrame: the ``train`` split converted to pandas, or the parsed CSV
    """
    if not is_hf:
        import pandas as pd
        return pd.read_csv(dataset_path)

    # Imported inside the branch, so `datasets` is only needed on this path.
    from datasets import load_dataset
    train_split = load_dataset(dataset_path)['train']
    return train_split.to_pandas()
| def _concat_reviews(df): | |
| text = '' | |
| for _, row in df.iterrows(): | |
| text += '\n' | |
| if row.review_title: | |
| text += '\nTitle:\n' + row.review_title | |
| if row.review_text: | |
| text += '\nReview:\n' + row.review_text | |
| return text | |
def create_reviews_symmary(df, model, hotels, pos_rate=4.0, neg_rate=4.0, n_reviews=6):
    """Create a summary of reviews for each hotel, based on the most positive and most negative reviews.

    For every hotel the ``n_reviews`` longest reviews (by text length) are
    taken from the positive group and from the negative group. If both groups
    are empty, the ``n_reviews`` reviews with the longest titles are used
    instead. Hotels whose concatenated review text is empty get no entry.

    Args:
        df (pd.DataFrame): hotels dataset; it is not modified
        model (str): OpenAI model name
        hotels (list): list of hotels to create summaries for
        pos_rate (float): minimum positive rate, inclusive
        neg_rate (float): maximum negative rate, exclusive
        n_reviews (int): number of reviews to consider for each category
    Returns:
        dict: hotel name -> reviews summary
    """
    # Work on a derived frame so the helper length columns never leak into
    # the caller's DataFrame.
    df = df.assign(
        review_text_len=df.review_text.str.len().fillna(value=0),
        review_title_len=df.review_title.str.len().fillna(value=0),
    )
    client = OpenAI()
    hotels_reviews_summary = {}
    for hotel in tqdm(hotels):
        temp = df[df.hotel_name.eq(hotel)]
        temp_pos = temp[temp.rate >= pos_rate].nlargest(n_reviews, 'review_text_len')
        temp_neg = temp[temp.rate < neg_rate].nlargest(n_reviews, 'review_text_len')
        if temp_pos.empty and temp_neg.empty:
            # No review falls in either rate group (e.g. the rate is missing):
            # fall back to the reviews with the longest titles.
            temp_pos = temp.nlargest(n_reviews, 'review_title_len')
        text = _concat_reviews(temp_pos) + _concat_reviews(temp_neg)
        if not text:
            continue
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": REVIEWS_SYSTEM_PROMPT},
                {"role": "user", "content": REVIEWS_USER_PROMPT.format(text=text)},
            ]
        )
        hotels_reviews_summary[hotel] = response.choices[0].message.content
    return hotels_reviews_summary
def _get_loc_id(hotel):
    """Given a hotel name, receive location id.

    In order to get the hotel info, we need to get the location id first.
    The first result of the TripAdvisor location search is used.

    Args:
        hotel (str): hotel name
    Returns:
        str: location id, or None when the request fails or the response
            contains no usable search result
    """
    url = "https://api.content.tripadvisor.com/api/v1/location/search"
    # Passing the query as `params` lets requests URL-encode the hotel name;
    # a name containing `&` or `#` would otherwise corrupt the query string.
    params = {
        "key": TRIPADVISOR_API_KEY,
        "searchQuery": hotel,
        "category": "hotels",
        "language": "en",
    }
    headers = {"accept": "application/json"}
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, params=params, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f'Error: {e}')
        return None
    try:
        return response.json()['data'][0]['location_id']
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError covers a non-JSON body; the others cover an error payload
        # or an empty result list.
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        print(f'Error: {e}')
        return None
def get_hotel_info(hotel):
    """Get hotel info from TripAdvisor.

    The following information is retrieved using the TripAdvisor API:
    - rank
    - ratings distributions
    - subratings
    - amenities

    Args:
        hotel (str): hotel name
    Returns:
        dict: hotel info with keys ``rank``, ``reviews_ratings``,
            ``subratings`` and ``amenities``; None when the location id
            cannot be resolved or the details request fails
    """
    loc_id = _get_loc_id(hotel)
    if loc_id is None:
        return None

    url = f"https://api.content.tripadvisor.com/api/v1/location/{loc_id}/details"
    # The query used to be embedded in the URL, where `&currency` had been
    # garbled into `¤cy`; building it from a dict avoids that class of error.
    params = {"key": TRIPADVISOR_API_KEY, "language": "en", "currency": "USD"}
    headers = {"accept": "application/json"}
    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
    except requests.RequestException as e:
        print(f'Error: {e}')
        return None
    try:
        details = response.json()
    except ValueError as e:
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        print(f'Error: {e}')
        return None
    if not response.ok or not isinstance(details, dict):
        # An error payload parses as JSON but carries no hotel details.
        print(f'{response.status_code=}')
        print(f'{response.text=}')
        return None

    # Any of these sections may be absent, so fall back to empty containers
    # instead of raising KeyError.
    ranking_data = details.get('ranking_data') or {}
    subratings = {
        entry['name']: entry['value']
        for entry in (details.get('subratings') or {}).values()
    }
    return dict(
        rank=ranking_data.get('ranking_string'),
        reviews_ratings=details.get('review_rating_count') or {},
        subratings=subratings,
        amenities=details.get('amenities') or [],
    )
def get_desc(hotel, data):
    """Create a text description of the hotel based on the retrieved data from TripAdvisor.

    The description is built from up to four sentences (rank, rating
    distribution, specific ratings, amenities). A sentence is left out when
    its data is missing or empty.

    Args:
        hotel (str): hotel name
        data (dict): hotel name -> hotel info (or None when retrieval failed)
    Returns:
        str: hotel text description; empty string when no info is available
    """
    info = data.get(hotel)
    if not info:
        return ''

    # Subrating keys in the order they appear in the text, with their labels.
    subrating_labels = (
        ('rate_location', 'Location'),
        ('rate_sleep', 'Sleep'),
        ('rate_room', 'Room'),
        ('rate_service', 'Service'),
        ('rate_cleanliness', 'Cleanliness'),
    )

    sections = []

    rank = info.get('rank')
    if rank is not None:
        sections.append(f"Rating: {rank}.")

    reviews_ratings = info.get('reviews_ratings') or {}
    if reviews_ratings:
        # Each count is formatted on its own, so numeric counts work too.
        counts = ", ".join(f"{stars}: {count}" for stars, count in reviews_ratings.items())
        sections.append(f"Rating distribution {counts}.")

    subratings = info.get('subratings') or {}
    specific = [
        f"{label} {subratings[key]}"
        for key, label in subrating_labels
        if key in subratings
    ]
    if specific:
        # Joining avoids a dangling ", " when the last subrating is missing.
        sections.append("Specific ratings: " + ", ".join(specific) + ".")

    amenities = info.get('amenities') or []
    if amenities:
        sections.append("Amenities available: " + ", ".join(str(item) for item in amenities) + ".")

    return " ".join(sections)
def _most_common(values):
    """Return the most frequent non-null value of a Series, or None if it has none."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else None


def get_payload(hotel, df):
    """Create a metadata which will be stored in the database.

    Each field is the most frequent non-null value among the hotel's rows.
    The price is the first space-separated token of ``price_range``. A field
    is None when the hotel has no non-null value for it.

    Args:
        hotel (str): hotel name
        df (pd.DataFrame): hotels dataset
    Returns:
        dict: metadata with keys ``hotel_name``, ``rating``, ``city``,
            ``country`` and ``price``
    """
    temp = df[df.hotel_name.eq(hotel)]
    return dict(
        hotel_name=hotel,
        rating=_most_common(temp.rating_value),
        city=_most_common(temp.locality),
        country=_most_common(temp.country),
        price=_most_common(temp.price_range.str.split(' ').str[0]),
    )
def create_vector_db(dataset_path, is_hf, db_path, collection_name, embeddings_model, embeddings_size, reviews_model):
    """Build or extend the hotels vector database.

    Steps:
    1. Load the dataset and determine which hotels are not yet in the collection.
    2. Summarize their reviews with OpenAI and save the summaries to JSON.
    3. Fetch their TripAdvisor info and save it to JSON.
    4. Combine both into one description per hotel, embed the descriptions
       and upsert them together with the hotel metadata.

    Args:
        dataset_path (str): dataset location, forwarded to ``get_df``
        is_hf (bool): whether the dataset is loaded through ``datasets``
        db_path (str): path of the local Qdrant storage
        collection_name (str): Qdrant collection to create or extend
        embeddings_model (str): OpenAI embeddings model name
        embeddings_size (int): vector size used when creating the collection
        reviews_model (str): OpenAI model name used for review summaries
    """
    REVIEW_SUMMARIES_PATH = 'reviews_summary.json'
    HOTELS_INFO_PATH = 'hotels_info.json'
    SCROLL_PAGE_SIZE = 1000

    df = get_df(dataset_path, is_hf)
    all_hotels = df.hotel_name.dropna().unique()

    # Create a collection if it does not exist and filter out hotels that are already in the collection
    qdrant_client = QdrantClient(path=db_path)
    next_id = 0
    if not qdrant_client.collection_exists(collection_name):
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=embeddings_size, distance=models.Distance.COSINE),
        )
        hotels = list(all_hotels)
    else:
        indexed_hotels = set()
        offset = None
        # Page through the collection with an integer limit rather than
        # requesting everything through a single float-valued `limit`.
        while True:
            docs, offset = qdrant_client.scroll(
                collection_name=collection_name,
                limit=SCROLL_PAGE_SIZE,
                offset=offset,
                with_payload=True,
                with_vectors=False,
            )
            for doc in docs:
                indexed_hotels.add((doc.payload or {}).get('hotel_name'))
                # Track the highest integer id so new points never overwrite
                # the ones stored by earlier runs.
                if isinstance(doc.id, int):
                    next_id = max(next_id, doc.id + 1)
            if offset is None:
                break
        hotels = [hotel for hotel in all_hotels if hotel not in indexed_hotels]
    if not hotels:
        return

    # Create reviews summary using OpenAI
    reviews_summary = create_reviews_symmary(df, reviews_model, hotels)
    save_json(reviews_summary, REVIEW_SUMMARIES_PATH)

    # Get hotel info from TripAdvisor
    hotels_info = {hotel: get_hotel_info(hotel) for hotel in tqdm(hotels)}
    save_json(hotels_info, HOTELS_INFO_PATH)

    # Create descriptions and payloads for each hotel
    texts = []
    payloads = []
    for hotel in hotels:
        # get_hotel_info returns None on failure; only describe hotels with info.
        trip_desc_hotel = get_desc(hotel, hotels_info) if hotels_info.get(hotel) else ''
        review_hotel = reviews_summary.get(hotel)
        # Join whichever parts exist. The previous chained conditional
        # expression dropped the review summary whenever a TripAdvisor
        # description was present.
        text = '\n'.join(part for part in (trip_desc_hotel, review_hotel) if part)
        if not text:
            # Nothing to embed; the hotel stays out of the collection and is
            # picked up again on the next run.
            print(f'Skipping {hotel}: no description available')
            continue
        payload = get_payload(hotel, df)
        payload['description'] = text
        payloads.append(payload)
        texts.append(text)
    if not texts:
        return

    # Create description embeddings and upsert them to the database
    openai_client = OpenAI()
    embeddings = openai_client.embeddings.create(input=texts, model=embeddings_model)
    points = [
        models.PointStruct(
            id=next_id + idx,
            vector=data.embedding,
            payload=payload,
        )
        for idx, (data, payload) in enumerate(zip(embeddings.data, payloads))
    ]
    qdrant_client.upsert(collection_name, points)
if __name__ == '__main__':
    # create_vector_db has seven required parameters, so a bare call raises
    # TypeError. Expose them as command-line options through click and
    # forward the parsed values.
    @click.command()
    @click.option('--dataset-path', required=True, help='Dataset location (CSV path or Hugging Face dataset).')
    @click.option('--is-hf', is_flag=True, help='Load the dataset through the `datasets` library.')
    @click.option('--db-path', required=True, help='Path of the local Qdrant storage.')
    @click.option('--collection-name', required=True, help='Qdrant collection to create or extend.')
    @click.option('--embeddings-model', required=True, help='OpenAI embeddings model name.')
    @click.option('--embeddings-size', required=True, type=int, help='Size of the embedding vectors.')
    @click.option('--reviews-model', required=True, help='OpenAI model name used for review summaries.')
    def _cli(**options):
        create_vector_db(**options)

    _cli()