Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| import itertools | |
| import re | |
| import sys | |
| import requests | |
| import json | |
| from bs4 import BeautifulSoup | |
| from tqdm import tqdm | |
class CaesarHotelBooking:
    """Scrape hotel listings from Booking.com search-result pages.

    Typical use: call :meth:`create_url` to build a search URL (this also
    records the city and the stay dates on the instance), then pass that URL
    to :meth:`caesar_get_hotel_info` to download and parse the page.
    """

    # The ``offset`` query parameter advances in steps of 25 per result page.
    RESULTS_PER_PAGE = 25

    # Flat VAT rate added on top of the scraped price. This is an assumption
    # made by this script, not a value read from the page.
    ASSUMED_VAT_PERCENTAGE = 0.2

    # Seconds to wait for the HTTP response; without a timeout ``requests``
    # may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Matches text that is a bare number with an optional "." or "," separator.
    _RATING_REGEX = re.compile(r"^(?=.*?\d)\d*[.,]?\d*$")

    def __init__(self) -> None:
        # Populated by create_url(); caesar_get_hotel_info() copies them into
        # every record it returns.
        self.city = None
        self.checkin_date = None
        self.checkout_date = None

    def create_url(self, city: str, num_of_adults: int, num_of_rooms: int,
                   num_of_children: int, checkin_date: str, checkout_date: str,
                   purpose: str, page_num: int = 1) -> str:
        """Build the search-results URL for one page of listings.

        Side effect: stores ``city``, ``checkin_date`` and ``checkout_date``
        on the instance for later use by :meth:`caesar_get_hotel_info`.

        Args:
            city: Search term; URL-encoded before being placed in the query.
            num_of_adults: Value of the ``group_adults`` parameter.
            num_of_rooms: Value of the ``no_rooms`` parameter.
            num_of_children: Value of the ``group_children`` parameter.
            checkin_date: Value of the ``checkin`` parameter.
            checkout_date: Value of the ``checkout`` parameter.
            purpose: Value of the ``sb_travel_purpose`` parameter.
            page_num: 1-based result page; converted to a result offset.

        Returns:
            The complete URL as a string.
        """
        # Imported here so this block does not depend on a file-level import.
        from urllib.parse import quote_plus

        self.checkin_date = checkin_date
        self.checkout_date = checkout_date
        self.city = city
        page_num_offset = (page_num - 1) * self.RESULTS_PER_PAGE
        return (
            "https://www.booking.com/searchresults.en-gb.html"
            # Encode the search term so spaces, "&" or non-ASCII characters
            # cannot corrupt the rest of the query string.
            f"?ss={quote_plus(str(city))}"
            "&label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4AQfIAQzYAQHoAQH4AQKIAgGoAgO4AvTIm_IFwAIB"
            "&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=searchresults"
            f"&checkin={checkin_date}&checkout={checkout_date}"
            f"&group_adults={num_of_adults}&no_rooms={num_of_rooms}"
            f"&group_children={num_of_children}"
            f"&sb_travel_purpose={purpose}&offset={page_num_offset}"
        )

    @staticmethod
    def find_indices(list_to_check, item_to_find):
        """Return every index at which ``item_to_find`` occurs in ``list_to_check``.

        Declared static because it takes no ``self``; it can be called on the
        class or on an instance.
        """
        return [idx for idx, value in enumerate(list_to_check) if value == item_to_find]

    @staticmethod
    def _parse_price(text: str) -> float:
        """Convert a displayed price such as ``"£1,234"`` or ``"US$1,234"`` to a float.

        Raises:
            ValueError: If the text still is not a number after the currency
                markers and thousands separators are removed.
        """
        for token in ("£", ",", "US$"):
            text = text.replace(token, "")
        return float(text)

    def caesar_get_hotel_info(self, url: str) -> list:
        """Download one search-results page and return one dict per listing.

        Each dict may contain the keys ``city``, ``title``, ``checkin_date``,
        ``checkout_date``, ``address``, ``price``, ``assumed_vat``,
        ``assumed_final_price``, ``booking``, ``distance``, ``reviews``,
        ``room``, ``rating`` and ``location``. A key is omitted when the page
        yielded fewer values for that field than it yielded titles.

        NOTE(review): every field is collected into its own list and the
        lists are matched up purely by position. If a listing on the page
        lacks one of the fields, later values will be attached to the wrong
        hotel. Parsing each listing's container separately would avoid this,
        but needs to be checked against the live markup.

        Args:
            url: Search-results URL, normally produced by :meth:`create_url`.

        Returns:
            A list of dicts, one per title found on the page.
        """
        headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'lxml')

        def texts(tag, testid):
            # Text of every <tag data-testid="testid"> element, in page order.
            return [node.text for node in soup.find_all(tag, attrs={'data-testid': testid})]

        title = texts('div', 'title')
        count = len(title)
        price = [
            self._parse_price(node.text)
            for node in soup.find_all('span', attrs={'data-testid': 'price-and-discounted-price'})
        ]
        vat_label = f"{self.ASSUMED_VAT_PERCENTAGE * 100}%"
        city_label = self.city.capitalize() if self.city is not None else None

        # A missing inner element yields None rather than raising, so one odd
        # listing does not abort the whole page and positions stay aligned.
        room = []
        for unit in soup.find_all('div', attrs={'data-testid': 'recommended-units'}):
            inner = unit.find("div", attrs={'class': 'd8eab2cf7f'})
            room.append(inner.text if inner is not None else None)
        location = []
        for holder in soup.find_all('div', attrs={'data-testid': 'location'}):
            link = holder.find("a")
            location.append(link.get("href") if link is not None else None)

        reviews = [
            node.text
            for node in soup.find_all('div', attrs={'class': 'd8eab2cf7f c90c0a70d3 db63693c62'})
        ]
        # Ratings: elements carrying an aria-label whose text is a bare
        # number containing "." (so float() can parse it).
        rating = [
            float(node.text)
            for node in soup.select("[aria-label]")
            if self._RATING_REGEX.match(node.text) and "." in node.text
        ]

        # (key, values) pairs in the order the keys appear in each record.
        columns = (
            ('city', [city_label] * count),
            ('title', title),
            ('checkin_date', [self.checkin_date] * count),
            ('checkout_date', [self.checkout_date] * count),
            ('address', texts('span', 'address')),
            ('price', price),
            ('assumed_vat', [vat_label] * len(price)),
            ('assumed_final_price', [pr * (1 + self.ASSUMED_VAT_PERCENTAGE) for pr in price]),
            ('booking', texts('div', 'price-for-x-nights')),
            ('distance', texts('span', 'distance')),
            ('reviews', reviews),
            ('room', room),
            ('rating', rating),
            ('location', location),
        )
        return [
            {key: values[idx] for key, values in columns if idx < len(values)}
            for idx in range(count)
        ]
def store_lower_than_3000(city, range):
    """Write the bookings for ``city`` whose final price is at most ``range``.

    Reads ``<city>_bookings.json`` (city lower-cased) from the current
    directory, keeps every record whose ``assumed_final_price`` is less than
    or equal to ``range``, and writes the result to
    ``<city>_smaller_than_<range>.json`` under the same top-level key.

    Records with no price, or with a price that is not a number (for example
    JSON ``null``), are skipped rather than aborting the run.

    Args:
        city: City name; lower-cased to build the file names and JSON key.
        range: Inclusive upper price limit. The name shadows the builtin and
            is kept only for compatibility with existing callers.
    """
    max_price = range  # local alias so the body does not lean on the shadowed builtin name
    bookings_key = f"{city.lower()}_bookings"

    def is_within_budget(record):
        price = record.get('assumed_final_price')
        if not isinstance(price, (int, float)):
            return False
        return price <= max_price

    with open(f"{city.lower()}_bookings.json", "r", encoding="utf-8") as f:
        bookings = json.load(f)[bookings_key]
    filtered = [record for record in bookings if is_within_budget(record)]
    with open(f"{city.lower()}_smaller_than_{range}.json", "w", encoding="utf-8") as f:
        json.dump({bookings_key: filtered}, f)
    print(f"less than {range} stored")
def store_whole_booking(city, num_of_pages, checkin_date="2023-8-01",
                        checkout_date="2023-8-08", purpose="work",
                        num_of_adults=10, num_of_rooms=5, num_of_children=0):
    """Scrape ``num_of_pages`` result pages for ``city`` and save them as JSON.

    The combined records are written to ``<city>_bookings.json`` (city
    lower-cased) in the current directory under the key ``<city>_bookings``,
    then printed together with their count.

    Args:
        city: City to search for.
        num_of_pages: Number of result pages to fetch, starting at page 1.
        checkin_date: Check-in date passed to the search URL.
        checkout_date: Check-out date passed to the search URL.
        purpose: Travel purpose passed to the search URL.
        num_of_adults: Number of adults passed to the search URL.
        num_of_rooms: Number of rooms passed to the search URL.
        num_of_children: Number of children passed to the search URL.
    """
    # create_url() and caesar_get_hotel_info() are instance methods that
    # share state through ``self``, so they need a real instance; calling
    # them on the class itself raises TypeError.
    scraper = CaesarHotelBooking()
    full_bookings = []
    print(f"Extracting hotel data for {city}...")
    for page_num in tqdm(range(1, num_of_pages + 1)):
        url = scraper.create_url(
            city=city,
            checkin_date=checkin_date,
            checkout_date=checkout_date,
            purpose=purpose,
            num_of_adults=num_of_adults,
            num_of_rooms=num_of_rooms,
            num_of_children=num_of_children,
            page_num=page_num,
        )
        full_bookings.extend(scraper.caesar_get_hotel_info(url))
    with open(f"{city.lower()}_bookings.json", "w", encoding="utf-8") as f:
        json.dump({f"{city.lower()}_bookings": full_bookings}, f)
    print(full_bookings)
    print(len(full_bookings))
def main(argv=None):
    """Command-line entry point: scrape a city, then filter by maximum price.

    Expects two arguments after the program name: the city to search and the
    maximum acceptable final price.

    Args:
        argv: Full argument list including the program name; defaults to
            ``sys.argv`` when omitted.

    Returns:
        ``0`` on success, ``2`` when the arguments are missing or the price
        is not a number (the usage line is printed in that case).
    """
    # TODO Check out Expedia...
    args = sys.argv if argv is None else argv
    try:
        city = args[1]
        max_amount = float(args[2])  # e.g. 3000
    except (IndexError, ValueError):
        print("python caesarhotelbooking.py <city_to_book> <max_price>")
        # Stop here: continuing would hit a NameError on the unset variables.
        return 2
    num_of_pages = 10
    store_whole_booking(city, num_of_pages)
    store_lower_than_3000(city, max_amount)
    return 0


if __name__ == "__main__":
    sys.exit(main())