# -*- coding: utf-8 -*- """ proxy_sqft_app_2.py – Streamlit UI for Address Square Footage Scraper (v2) Upload a CSV/Excel with address columns, scrape StartPage via proxies, extract sqft/beds/baths/year built, display results with clickable URLs, and download as CSV. """ import re import time import random import logging from concurrent.futures import ThreadPoolExecutor, as_completed from urllib.parse import urljoin, unquote import pandas as pd import requests import streamlit as st from bs4 import BeautifulSoup from joblib import Parallel, delayed import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) logging.basicConfig( level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%H:%M:%S', ) log = logging.getLogger(__name__) # ============================================================================= # CONSTANTS # ============================================================================= USER_AGENTS = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36", "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0", "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.124 Safari/537.36", "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Mobile Safari/537.36", ] FREE_PROXY_SOURCES = [ "https://www.proxy-list.download/api/v1/get?type=http", "https://www.proxy-list.download/api/v1/get?type=https", "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt", "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt", "https://openproxylist.xyz/http.txt", ] PASS1_JOBS = 512 PASS1_TIMEOUT = 10 PASS2_WORKERS = 10 PASS2_RETRIES = 3 # ============================================================================= # PROXY HELPERS # ============================================================================= @st.cache_data(ttl=300, show_spinner=False) def fetch_free_proxies(): proxies = set() for url in FREE_PROXY_SOURCES: try: text = requests.get(url, timeout=10, verify=False).text for line in text.strip().splitlines(): line = line.strip() if ':' in line and not line.startswith('#'): proxies.add("http://" + line) except Exception: continue return list(proxies) def random_proxy(pool): return random.choice(pool) # ============================================================================= # SCRAPER FUNCTIONS # ============================================================================= def _parse_startpage_html(html: str, query: str) -> pd.DataFrame: soup = BeautifulSoup(html, "html.parser") results = [] for item in soup.select("div.result"): a_tag = ( item.select_one("a.result-title") or item.select_one("h2 a") or item.select_one("a[class*='result']") ) desc_tag = ( item.select_one("p[class*='ogs6b8']") or item.select_one("p.description") or item.select_one("p[class*='snippet']") ) if not a_tag or not a_tag.get("href"): continue raw_url = a_tag["href"] if raw_url.startswith("/cgi-bin/jump"): real_url = ( unquote(raw_url.split("url=")[-1].split("&")[0]) if "url=" in raw_url else raw_url ) elif a_tag.get("data-clickurl"): real_url = unquote(a_tag["data-clickurl"]) else: real_url = urljoin("https://www.startpage.com", raw_url) if not real_url.startswith("http"): real_url = urljoin("https://www.startpage.com", real_url) title = a_tag.get_text(strip=True) description = desc_tag.get_text(strip=True) if desc_tag else "" results.append({"url": real_url, "description": description or title}) df = pd.DataFrame(results, columns=["url", "description"]) df["search_result_number"] = range(1, len(df) + 1) df["query"] = query return df def scrape_pass1(query: str, proxy_pool, timeout=10): try: url = "https://www.startpage.com/do/search" data = { "query": query, "cat": "web", "cmd": "process_search", "language": "english", "engine0": "v1all", "pg": "0", } proxy = random_proxy(proxy_pool) headers = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9"} response = requests.post( url, data=data, headers=headers, timeout=timeout, verify=False, proxies={"http": proxy}, ) response.raise_for_status() return _parse_startpage_html(response.text, query) except Exception: return query def scrape_pass2(query: str, proxy_pool, timeout=15, retries=3, backoff=5): url = "https://www.startpage.com/do/search" post_data = { "query": query, "cat": "web", "cmd": "process_search", "language": "english", "engine0": "v1all", "pg": "0", } last_error = None for attempt in range(1, retries + 1): proxy = random_proxy(proxy_pool) headers = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9"} try: response = requests.post( url, data=post_data, headers=headers, timeout=timeout, verify=False, proxies={"http": proxy, "https": proxy}, ) response.raise_for_status() return _parse_startpage_html(response.text, query) except Exception as e: last_error = e if attempt < retries: time.sleep(backoff * (3 ** (attempt - 1))) raise RuntimeError(f"Failed after {retries} attempts: {last_error}") # ============================================================================= # FEATURE EXTRACTION & RANKING # ============================================================================= def extract_features(df_final): desc = df_final['description'].astype(str).str.lower() sqft_raw = desc.str.extract( r'(\d{1,3}(?:,\d{3})*|\d+)' r'\s*[-–]?\s*' r'(?:sq\.?\s*(?:ft\.?|feet)|square\s*(?:ft\.?|feet|foot)|sqft|sf\b)', flags=re.IGNORECASE, )[0] sqft_clean = ( sqft_raw.str.replace(',', '', regex=False) .pipe(pd.to_numeric, errors='coerce').fillna(0).astype(int) ) sqft_clean = sqft_clean.where((sqft_clean >= 100) & (sqft_clean <= 500000), 0) df_final['Square Footage'] = sqft_clean df_final['Beds'] = desc.str.replace(r'[-–]', ' ', regex=True).str.extract(r'(\d+)\s*bed')[0] df_final['Baths'] = ( desc.str.replace(r'[-–]', ' ', regex=True) .str.extract(r'(\d+(?:\.\d+)?)\s*bath')[0].astype(float) ) df_final['Year Built'] = desc.str.extract(r'built\s+in\s+(\d{4})')[0] return df_final def rank_best_per_query(df_2): df_2['Square Footage'] = pd.to_numeric(df_2['Square Footage'], errors='coerce').fillna(0).astype(int) df_2['search_result_number'] = pd.to_numeric(df_2['search_result_number'], errors='coerce').fillna(999999).astype(int) df_2['has_sqft'] = (df_2['Square Footage'] > 0).astype(int) df_2['sqft_reasonable'] = ((df_2['Square Footage'] >= 100) & (df_2['Square Footage'] <= 50000)).astype(int) re_sites = r'zillow|redfin|realtor\.com|trulia|homes\.com|remax|century21|coldwellbanker|movoto|homesnap' df_2['from_re_site'] = df_2['url'].astype(str).str.lower().str.contains(re_sites, regex=True).astype(int) df_sorted = df_2.reset_index().sort_values( by=['query', 'has_sqft', 'sqft_reasonable', 'from_re_site', 'search_result_number', 'index'], ascending=[True, False, False, False, True, True], ) df_best = df_sorted.drop_duplicates(subset='query', keep='first').drop( columns=['has_sqft', 'sqft_reasonable', 'from_re_site', 'index'], ) return df_best def clean_and_combine(df_best, df_input): df_merged = pd.merge(df_best, df_input, on='query', how='inner') addr_candidates = ["Address", "Asset Address Line1", "Asset Address", "Street"] city_candidates = ["City", "Asset City"] state_candidates = ["State", "Asset State"] zip_candidates = ["Zip", "ZIP", "Asset Zip"] def _find_col(df, candidates): for c in candidates: if c in df.columns: return c return None addr_col = _find_col(df_merged, addr_candidates) city_col = _find_col(df_merged, city_candidates) state_col = _find_col(df_merged, state_candidates) zip_col = _find_col(df_merged, zip_candidates) if all([addr_col, city_col, state_col, zip_col]): df_merged = df_merged.rename(columns={ addr_col: "Address", city_col: "City", state_col: "State", zip_col: "Zip", }) df_merged["Full Address"] = ( df_merged["Address"].astype(str).str.strip() + " " + df_merged["City"].astype(str).str.strip() + " " + df_merged["State"].astype(str).str.strip() + " " + df_merged["Zip"].astype(str).str.strip() ).str.replace(r"\s+", " ", regex=True).str.strip() final_cols = ["Full Address", "Address", "City", "State", "Zip", "Square Footage", "Beds", "Baths", "Year Built", "url"] else: df_merged["Full Address"] = ( df_merged["query"] .str.replace(r"\s*square\s*footage\s*$", "", flags=re.IGNORECASE, regex=True) .str.strip() ) final_cols = ["Full Address", "Square Footage", "Beds", "Baths", "Year Built", "url"] for col in final_cols: if col not in df_merged.columns: df_merged[col] = pd.NA return df_merged[final_cols].copy() def _make_clickable(url): """Convert a URL to an HTML hyperlink for Streamlit display.""" if pd.isna(url) or not str(url).startswith("http"): return url return f'{url}' # ============================================================================= # STREAMLIT APP # ============================================================================= st.set_page_config(page_title="Address SqFt Scraper", layout="wide") st.title("Address Square Footage Scraper") st.caption("Upload a CSV or Excel file with address columns. The app searches StartPage via proxies and extracts property details.") # --- File upload --- uploaded = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"]) if uploaded is not None: if uploaded.name.endswith(".csv"): df_input = pd.read_csv(uploaded) else: df_input = pd.read_excel(uploaded) # --- Column mapping --- st.subheader("Map Columns") all_cols = ["(auto-detect)"] + list(df_input.columns) col1, col2, col3, col4 = st.columns(4) with col1: addr_col = st.selectbox("Address", all_cols, index=0) with col2: city_col = st.selectbox("City", all_cols, index=0) with col3: state_col = st.selectbox("State", all_cols, index=0) with col4: zip_col = st.selectbox("Zip", all_cols, index=0) if any(c != "(auto-detect)" for c in [addr_col, city_col, state_col, zip_col]): rename_map = {} if addr_col != "(auto-detect)": rename_map[addr_col] = "Address" if city_col != "(auto-detect)": rename_map[city_col] = "City" if state_col != "(auto-detect)": rename_map[state_col] = "State" if zip_col != "(auto-detect)": rename_map[zip_col] = "Zip" df_input = df_input.rename(columns=rename_map) st.subheader("Uploaded Data Preview") st.dataframe(df_input.head(20), use_container_width=True) st.info(f"{len(df_input)} rows loaded") # --- Run button --- if st.button("Run Scraper", type="primary", use_container_width=True): df_work = df_input.copy() df_work['query'] = df_work.iloc[:, :4].astype(str).apply(' '.join, axis=1) df_work['query'] = df_work['query'] + ' square footage' all_queries = df_work['query'].tolist() st.info(f"Scraping {len(all_queries)} addresses...") # --- Load proxies --- status = st.empty() status.text("Loading proxy pool...") proxy_pool = fetch_free_proxies() status.text(f"Proxy pool: {len(proxy_pool)} proxies") if not proxy_pool: st.error("No proxies found. Check network connectivity.") st.stop() # ============================================= # PASS 1 # ============================================= progress = st.progress(0, text="Pass 1: Searching...") p1_raw = Parallel(n_jobs=PASS1_JOBS, prefer="threads")( delayed(scrape_pass1)(q, proxy_pool, PASS1_TIMEOUT) for q in all_queries ) progress.progress(70, text="Pass 1 complete") p1_successes = [r for r in p1_raw if isinstance(r, pd.DataFrame)] p1_failed_queries = [r for r in p1_raw if isinstance(r, str)] status.text( f"Pass 1: {len(p1_successes)}/{len(all_queries)} succeeded, " f"{len(p1_failed_queries)} failed" ) # ============================================= # PASS 2 # ============================================= p2_successes = [] final_failed = [] if p1_failed_queries: progress.progress(75, text=f"Pass 2: Retrying {len(p1_failed_queries)} failures...") fetch_free_proxies.clear() proxy_pool = fetch_free_proxies() def _retry_one(query): time.sleep(random.uniform(1.0, 3.0)) return scrape_pass2(query, proxy_pool, retries=PASS2_RETRIES) with ThreadPoolExecutor(max_workers=PASS2_WORKERS) as executor: future_to_q = {executor.submit(_retry_one, q): q for q in p1_failed_queries} for future in as_completed(future_to_q): query = future_to_q[future] try: df = future.result() p2_successes.append(df) except Exception as e: final_failed.append({"query": query, "error": str(e)}) status.text( f"Pass 2: recovered {len(p2_successes)}/{len(p1_failed_queries)}" ) # ============================================= # COMBINE & PROCESS # ============================================= progress.progress(90, text="Processing results...") all_dfs = p1_successes + p2_successes if not all_dfs: st.error("No results collected. Check network/proxies.") st.stop() df_raw = pd.concat(all_dfs, ignore_index=True) df_raw = extract_features(df_raw) df_best = rank_best_per_query(df_raw) df_clean = clean_and_combine(df_best, df_work) progress.progress(100, text="Done!") # ============================================= # RESULTS # ============================================= total = len(df_clean) sqft_vals = pd.to_numeric(df_clean['Square Footage'], errors='coerce').fillna(0) has_sqft = (sqft_vals > 0).sum() in_range = ((sqft_vals >= 100) & (sqft_vals <= 8000)).sum() st.subheader("Results") mc1, mc2, mc3, mc4 = st.columns(4) mc1.metric("Total Addresses", total) mc2.metric("With Sq Ft", f"{has_sqft} ({has_sqft/total*100:.0f}%)" if total else "0") mc3.metric("In 100-8000 Range", f"{in_range} ({in_range/total*100:.0f}%)" if total else "0") mc4.metric("Failed Queries", len(final_failed)) # Display with clickable URL column in scrollable container df_display = df_clean.copy() df_display['url'] = df_display['url'].apply(_make_clickable) table_html = df_display.to_html(escape=False, index=False, classes="result-table") st.markdown( f"""