# NOTE: the three lines below were Hugging Face Spaces page residue
# ("Spaces: Sleeping Sleeping") captured when this file was scraped;
# converted to a comment so the module parses.
import os
import subprocess
import time

import chromedriver_autoinstaller
import pandas as pd
import streamlit as st
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
def install_chrome():
    """Install Chromium via apt-get if it is not already present.

    Intended for container-style hosts (e.g. a Hugging Face Space) where
    the process runs with enough privilege for apt-get; with check=True a
    failed install raises subprocess.CalledProcessError instead of being
    silently ignored.

    Raises:
        subprocess.CalledProcessError: if apt-get update/install fails.
    """
    if not os.path.exists("/usr/bin/chromium-browser"):
        subprocess.run(["apt-get", "update"], check=True)
        subprocess.run(["apt-get", "install", "-y", "chromium-browser"], check=True)
    # Fix: the original appended "/usr/bin/" to PATH unconditionally, so every
    # call (Streamlit reruns this on each button press) grew PATH with a
    # duplicate entry. Append only when /usr/bin is not already on PATH.
    path_entries = os.environ["PATH"].split(os.pathsep)
    if "/usr/bin" not in path_entries and "/usr/bin/" not in path_entries:
        os.environ["PATH"] += os.pathsep + "/usr/bin/"
def _scroll_to_bottom(driver, pause=5):
    """Scroll the page in half-screen steps until its height stops growing.

    Redfin lazy-loads listings, so we keep scrolling (sleeping *pause*
    seconds between steps) until document.body.scrollHeight is stable.
    """
    screen_height = driver.execute_script("return window.innerHeight;")
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollBy(0, arguments[0]);", screen_height // 2)
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _element_text(parent, xpath):
    """Return the text of a child element located by *xpath*, or "N/A" if absent."""
    try:
        return parent.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return "N/A"


def scrape_redfin(zipcode):
    """Scrape Redfin house listings for a ZIP code into a DataFrame.

    Args:
        zipcode: ZIP code string interpolated into the Redfin search URL.

    Returns:
        pandas.DataFrame with columns Price, Address, Size, Link; empty if
        the listings container never appeared (an st.error is shown then).
    """
    install_chrome()  # Ensure Chrome/Chromium is installed
    # Fix: the original called chromedriver_autoinstaller.install() twice
    # (once here and once inside Service(...)); install once and reuse the
    # returned driver path.
    driver_path = chromedriver_autoinstaller.install()

    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--incognito")
    # Reduce the chance Redfin detects automation and blocks the session.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("start-maximized")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    options.binary_location = "/usr/bin/chromium-browser"  # Use Chromium

    driver = webdriver.Chrome(service=Service(driver_path), options=options)
    # Fix: the original leaked the browser process if anything raised after
    # the initial wait (scrolling, element lookup); try/finally guarantees
    # driver.quit() on every path.
    try:
        driver.get(f"https://www.redfin.com/zipcode/{zipcode}")
        try:
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located(
                    (By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div")
                )
            )
        except Exception:
            # Best-effort: any load failure (usually a timeout) yields an
            # empty result rather than crashing the Streamlit app.
            st.error("Error: Listings did not load properly")
            return pd.DataFrame()

        _scroll_to_bottom(driver)

        # NOTE(review): absolute XPaths are brittle against Redfin layout
        # changes — confirm they still match before relying on this.
        listings = driver.find_elements(
            By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div/div"
        )
        houses = []
        for listing in listings:
            try:
                link = listing.find_element(By.TAG_NAME, "a").get_attribute("href")
            except NoSuchElementException:
                link = "N/A"
            houses.append({
                "Price": _element_text(listing, ".//div/div/div[2]/div[1]/div[1]/span"),
                "Address": _element_text(listing, ".//div/div/div[2]/div[3]"),
                "Size": _element_text(listing, ".//div/div/div[2]/div[4]/div"),
                "Link": link,
            })
        return pd.DataFrame(houses)
    finally:
        driver.quit()
# Streamlit UI: collect a ZIP code and display scraped listings on demand.
st.title("Redfin House Listings Scraper")
zipcode = st.text_input("Enter ZIP code:")

if st.button("Scrape Data"):
    # Guard clause: reject an empty input before launching the browser.
    if not zipcode:
        st.error("Please enter a valid ZIP code.")
    else:
        with st.spinner("Scraping data, please wait..."):
            results = scrape_redfin(zipcode)
        if results.empty:
            st.warning("No houses found for the given ZIP code.")
        else:
            st.success("Scraping complete! Here are the available houses:")
            st.dataframe(results)
## Previous working version (uses webdriver_manager instead of
## chromedriver_autoinstaller); kept commented out below for reference.
| # import streamlit as st | |
| # import pandas as pd | |
| # import time | |
| # from selenium import webdriver | |
| # from selenium.webdriver.common.by import By | |
| # from selenium.webdriver.chrome.service import Service | |
| # from selenium.webdriver.chrome.options import Options | |
| # from selenium.webdriver.support.ui import WebDriverWait | |
| # from selenium.webdriver.support import expected_conditions as EC | |
| # from webdriver_manager.chrome import ChromeDriverManager | |
| # def scrape_redfin(zipcode): | |
| # options = Options() | |
| # options.add_argument("--headless") | |
| # options.add_argument("--incognito") | |
| # options.add_argument("--disable-blink-features=AutomationControlled") | |
| # options.add_argument("start-maximized") | |
| # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") | |
| # driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options) | |
| # url = f"https://www.redfin.com/zipcode/{zipcode}" | |
| # driver.get(url) | |
| # try: | |
| # listings_container = WebDriverWait(driver, 60).until( | |
| # EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div")) | |
| # ) | |
| # except Exception as e: | |
| # st.error("Error: Listings did not load properly") | |
| # driver.quit() | |
| # return pd.DataFrame() | |
| # scroll_pause_time = 5 | |
| # screen_height = driver.execute_script("return window.innerHeight;") | |
| # last_height = driver.execute_script("return document.body.scrollHeight") | |
| # while True: | |
| # driver.execute_script("window.scrollBy(0, arguments[0]);", screen_height // 2) | |
| # time.sleep(scroll_pause_time) | |
| # new_height = driver.execute_script("return document.body.scrollHeight") | |
| # if new_height == last_height: | |
| # break | |
| # last_height = new_height | |
| # houses = [] | |
| # listings = driver.find_elements(By.XPATH, "/html/body/div[1]/div[6]/div[1]/div[3]/div[1]/div[4]/div/div[1]/div/div") | |
| # for listing in listings: | |
| # try: | |
| # price = listing.find_element(By.XPATH, ".//div/div/div[2]/div[1]/div[1]/span").text | |
| # except: | |
| # price = "N/A" | |
| # try: | |
| # address = listing.find_element(By.XPATH, ".//div/div/div[2]/div[3]").text | |
| # except: | |
| # address = "N/A" | |
| # try: | |
| # size = listing.find_element(By.XPATH, ".//div/div/div[2]/div[4]/div").text | |
| # except: | |
| # size = "N/A" | |
| # try: | |
| # link = listing.find_element(By.TAG_NAME, "a").get_attribute("href") | |
| # except: | |
| # link = "N/A" | |
| # houses.append({"Price": price, "Address": address, "Size": size, "Link": link}) | |
| # driver.quit() | |
| # return pd.DataFrame(houses) | |
| # st.title("Redfin House Listings Scraper") | |
| # zipcode = st.text_input("Enter ZIP code:") | |
| # if st.button("Scrape Data"): | |
| # if zipcode: | |
| # with st.spinner("Scraping data, please wait..."): | |
| # df = scrape_redfin(zipcode) | |
| # if not df.empty: | |
| # st.success("Scraping complete! Here are the available houses:") | |
| # st.dataframe(df) | |
| # else: | |
| # st.warning("No houses found for the given ZIP code.") | |
| # else: | |
| # st.error("Please enter a valid ZIP code.") | |