Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """ | |
| proxy_sqft_app_2.py – Streamlit UI for Address Square Footage Scraper (v2) | |
| Upload a CSV/Excel with address columns, scrape StartPage via proxies, | |
| extract sqft/beds/baths/year built, display results with clickable URLs, | |
| and download as CSV. | |
| """ | |
| import re | |
| import time | |
| import random | |
| import logging | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from urllib.parse import urljoin, unquote | |
| import pandas as pd | |
| import requests | |
| import streamlit as st | |
| from bs4 import BeautifulSoup | |
| from joblib import Parallel, delayed | |
| import urllib3 | |
| urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) | |
| logging.basicConfig( | |
| level=logging.INFO, | |
| format='%(asctime)s [%(levelname)s] %(message)s', | |
| datefmt='%H:%M:%S', | |
| ) | |
| log = logging.getLogger(__name__) | |
| # ============================================================================= | |
| # CONSTANTS | |
| # ============================================================================= | |
| USER_AGENTS = [ | |
| "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", | |
| "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15", | |
| "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36", | |
| "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36", | |
| "Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1", | |
| "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0", | |
| "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.124 Safari/537.36", | |
| "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Mobile Safari/537.36", | |
| ] | |
| FREE_PROXY_SOURCES = [ | |
| "https://www.proxy-list.download/api/v1/get?type=http", | |
| "https://www.proxy-list.download/api/v1/get?type=https", | |
| "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt", | |
| "https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt", | |
| "https://openproxylist.xyz/http.txt", | |
| ] | |
| PASS1_JOBS = 512 | |
| PASS1_TIMEOUT = 10 | |
| PASS2_WORKERS = 10 | |
| PASS2_RETRIES = 3 | |
| # ============================================================================= | |
| # PROXY HELPERS | |
| # ============================================================================= | |
| def fetch_free_proxies(): | |
| proxies = set() | |
| for url in FREE_PROXY_SOURCES: | |
| try: | |
| text = requests.get(url, timeout=10, verify=False).text | |
| for line in text.strip().splitlines(): | |
| line = line.strip() | |
| if ':' in line and not line.startswith('#'): | |
| proxies.add("http://" + line) | |
| except Exception: | |
| continue | |
| return list(proxies) | |
| def random_proxy(pool): | |
| return random.choice(pool) | |
| # ============================================================================= | |
| # SCRAPER FUNCTIONS | |
| # ============================================================================= | |
| def _parse_startpage_html(html: str, query: str) -> pd.DataFrame: | |
| soup = BeautifulSoup(html, "html.parser") | |
| results = [] | |
| for item in soup.select("div.result"): | |
| a_tag = ( | |
| item.select_one("a.result-title") | |
| or item.select_one("h2 a") | |
| or item.select_one("a[class*='result']") | |
| ) | |
| desc_tag = ( | |
| item.select_one("p[class*='ogs6b8']") | |
| or item.select_one("p.description") | |
| or item.select_one("p[class*='snippet']") | |
| ) | |
| if not a_tag or not a_tag.get("href"): | |
| continue | |
| raw_url = a_tag["href"] | |
| if raw_url.startswith("/cgi-bin/jump"): | |
| real_url = ( | |
| unquote(raw_url.split("url=")[-1].split("&")[0]) | |
| if "url=" in raw_url else raw_url | |
| ) | |
| elif a_tag.get("data-clickurl"): | |
| real_url = unquote(a_tag["data-clickurl"]) | |
| else: | |
| real_url = urljoin("https://www.startpage.com", raw_url) | |
| if not real_url.startswith("http"): | |
| real_url = urljoin("https://www.startpage.com", real_url) | |
| title = a_tag.get_text(strip=True) | |
| description = desc_tag.get_text(strip=True) if desc_tag else "" | |
| results.append({"url": real_url, "description": description or title}) | |
| df = pd.DataFrame(results, columns=["url", "description"]) | |
| df["search_result_number"] = range(1, len(df) + 1) | |
| df["query"] = query | |
| return df | |
| def scrape_pass1(query: str, proxy_pool, timeout=10): | |
| try: | |
| url = "https://www.startpage.com/do/search" | |
| data = { | |
| "query": query, "cat": "web", "cmd": "process_search", | |
| "language": "english", "engine0": "v1all", "pg": "0", | |
| } | |
| proxy = random_proxy(proxy_pool) | |
| headers = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9"} | |
| response = requests.post( | |
| url, data=data, headers=headers, | |
| timeout=timeout, verify=False, proxies={"http": proxy}, | |
| ) | |
| response.raise_for_status() | |
| return _parse_startpage_html(response.text, query) | |
| except Exception: | |
| return query | |
| def scrape_pass2(query: str, proxy_pool, timeout=15, retries=3, backoff=5): | |
| url = "https://www.startpage.com/do/search" | |
| post_data = { | |
| "query": query, "cat": "web", "cmd": "process_search", | |
| "language": "english", "engine0": "v1all", "pg": "0", | |
| } | |
| last_error = None | |
| for attempt in range(1, retries + 1): | |
| proxy = random_proxy(proxy_pool) | |
| headers = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9"} | |
| try: | |
| response = requests.post( | |
| url, data=post_data, headers=headers, | |
| timeout=timeout, verify=False, | |
| proxies={"http": proxy, "https": proxy}, | |
| ) | |
| response.raise_for_status() | |
| return _parse_startpage_html(response.text, query) | |
| except Exception as e: | |
| last_error = e | |
| if attempt < retries: | |
| time.sleep(backoff * (3 ** (attempt - 1))) | |
| raise RuntimeError(f"Failed after {retries} attempts: {last_error}") | |
| # ============================================================================= | |
| # FEATURE EXTRACTION & RANKING | |
| # ============================================================================= | |
| def extract_features(df_final): | |
| desc = df_final['description'].astype(str).str.lower() | |
| sqft_raw = desc.str.extract( | |
| r'(\d{1,3}(?:,\d{3})*|\d+)' | |
| r'\s*[-–]?\s*' | |
| r'(?:sq\.?\s*(?:ft\.?|feet)|square\s*(?:ft\.?|feet|foot)|sqft|sf\b)', | |
| flags=re.IGNORECASE, | |
| )[0] | |
| sqft_clean = ( | |
| sqft_raw.str.replace(',', '', regex=False) | |
| .pipe(pd.to_numeric, errors='coerce').fillna(0).astype(int) | |
| ) | |
| sqft_clean = sqft_clean.where((sqft_clean >= 100) & (sqft_clean <= 500000), 0) | |
| df_final['Square Footage'] = sqft_clean | |
| df_final['Beds'] = desc.str.replace(r'[-–]', ' ', regex=True).str.extract(r'(\d+)\s*bed')[0] | |
| df_final['Baths'] = ( | |
| desc.str.replace(r'[-–]', ' ', regex=True) | |
| .str.extract(r'(\d+(?:\.\d+)?)\s*bath')[0].astype(float) | |
| ) | |
| df_final['Year Built'] = desc.str.extract(r'built\s+in\s+(\d{4})')[0] | |
| return df_final | |
| def rank_best_per_query(df_2): | |
| df_2['Square Footage'] = pd.to_numeric(df_2['Square Footage'], errors='coerce').fillna(0).astype(int) | |
| df_2['search_result_number'] = pd.to_numeric(df_2['search_result_number'], errors='coerce').fillna(999999).astype(int) | |
| df_2['has_sqft'] = (df_2['Square Footage'] > 0).astype(int) | |
| df_2['sqft_reasonable'] = ((df_2['Square Footage'] >= 100) & (df_2['Square Footage'] <= 50000)).astype(int) | |
| re_sites = r'zillow|redfin|realtor\.com|trulia|homes\.com|remax|century21|coldwellbanker|movoto|homesnap' | |
| df_2['from_re_site'] = df_2['url'].astype(str).str.lower().str.contains(re_sites, regex=True).astype(int) | |
| df_sorted = df_2.reset_index().sort_values( | |
| by=['query', 'has_sqft', 'sqft_reasonable', 'from_re_site', 'search_result_number', 'index'], | |
| ascending=[True, False, False, False, True, True], | |
| ) | |
| df_best = df_sorted.drop_duplicates(subset='query', keep='first').drop( | |
| columns=['has_sqft', 'sqft_reasonable', 'from_re_site', 'index'], | |
| ) | |
| return df_best | |
| def clean_and_combine(df_best, df_input): | |
| df_merged = pd.merge(df_best, df_input, on='query', how='inner') | |
| addr_candidates = ["Address", "Asset Address Line1", "Asset Address", "Street"] | |
| city_candidates = ["City", "Asset City"] | |
| state_candidates = ["State", "Asset State"] | |
| zip_candidates = ["Zip", "ZIP", "Asset Zip"] | |
| def _find_col(df, candidates): | |
| for c in candidates: | |
| if c in df.columns: | |
| return c | |
| return None | |
| addr_col = _find_col(df_merged, addr_candidates) | |
| city_col = _find_col(df_merged, city_candidates) | |
| state_col = _find_col(df_merged, state_candidates) | |
| zip_col = _find_col(df_merged, zip_candidates) | |
| if all([addr_col, city_col, state_col, zip_col]): | |
| df_merged = df_merged.rename(columns={ | |
| addr_col: "Address", city_col: "City", state_col: "State", zip_col: "Zip", | |
| }) | |
| df_merged["Full Address"] = ( | |
| df_merged["Address"].astype(str).str.strip() + " " + | |
| df_merged["City"].astype(str).str.strip() + " " + | |
| df_merged["State"].astype(str).str.strip() + " " + | |
| df_merged["Zip"].astype(str).str.strip() | |
| ).str.replace(r"\s+", " ", regex=True).str.strip() | |
| final_cols = ["Full Address", "Address", "City", "State", "Zip", | |
| "Square Footage", "Beds", "Baths", "Year Built", "url"] | |
| else: | |
| df_merged["Full Address"] = ( | |
| df_merged["query"] | |
| .str.replace(r"\s*square\s*footage\s*$", "", flags=re.IGNORECASE, regex=True) | |
| .str.strip() | |
| ) | |
| final_cols = ["Full Address", "Square Footage", "Beds", "Baths", "Year Built", "url"] | |
| for col in final_cols: | |
| if col not in df_merged.columns: | |
| df_merged[col] = pd.NA | |
| return df_merged[final_cols].copy() | |
| def _make_clickable(url): | |
| """Convert a URL to an HTML hyperlink for Streamlit display.""" | |
| if pd.isna(url) or not str(url).startswith("http"): | |
| return url | |
| return f'<a href="{url}" target="_blank">{url}</a>' | |
| # ============================================================================= | |
| # STREAMLIT APP | |
| # ============================================================================= | |
| st.set_page_config(page_title="Address SqFt Scraper", layout="wide") | |
| st.title("Address Square Footage Scraper") | |
| st.caption("Upload a CSV or Excel file with address columns. The app searches StartPage via proxies and extracts property details.") | |
| # --- File upload --- | |
| uploaded = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"]) | |
| if uploaded is not None: | |
| if uploaded.name.endswith(".csv"): | |
| df_input = pd.read_csv(uploaded) | |
| else: | |
| df_input = pd.read_excel(uploaded) | |
| # --- Column mapping --- | |
| st.subheader("Map Columns") | |
| all_cols = ["(auto-detect)"] + list(df_input.columns) | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| addr_col = st.selectbox("Address", all_cols, index=0) | |
| with col2: | |
| city_col = st.selectbox("City", all_cols, index=0) | |
| with col3: | |
| state_col = st.selectbox("State", all_cols, index=0) | |
| with col4: | |
| zip_col = st.selectbox("Zip", all_cols, index=0) | |
| if any(c != "(auto-detect)" for c in [addr_col, city_col, state_col, zip_col]): | |
| rename_map = {} | |
| if addr_col != "(auto-detect)": | |
| rename_map[addr_col] = "Address" | |
| if city_col != "(auto-detect)": | |
| rename_map[city_col] = "City" | |
| if state_col != "(auto-detect)": | |
| rename_map[state_col] = "State" | |
| if zip_col != "(auto-detect)": | |
| rename_map[zip_col] = "Zip" | |
| df_input = df_input.rename(columns=rename_map) | |
| st.subheader("Uploaded Data Preview") | |
| st.dataframe(df_input.head(20), use_container_width=True) | |
| st.info(f"{len(df_input)} rows loaded") | |
| # --- Run button --- | |
| if st.button("Run Scraper", type="primary", use_container_width=True): | |
| df_work = df_input.copy() | |
| df_work['query'] = df_work.iloc[:, :4].astype(str).apply(' '.join, axis=1) | |
| df_work['query'] = df_work['query'] + ' square footage' | |
| all_queries = df_work['query'].tolist() | |
| st.info(f"Scraping {len(all_queries)} addresses...") | |
| # --- Load proxies --- | |
| status = st.empty() | |
| status.text("Loading proxy pool...") | |
| proxy_pool = fetch_free_proxies() | |
| status.text(f"Proxy pool: {len(proxy_pool)} proxies") | |
| if not proxy_pool: | |
| st.error("No proxies found. Check network connectivity.") | |
| st.stop() | |
| # ============================================= | |
| # PASS 1 | |
| # ============================================= | |
| progress = st.progress(0, text="Pass 1: Searching...") | |
| p1_raw = Parallel(n_jobs=PASS1_JOBS, prefer="threads")( | |
| delayed(scrape_pass1)(q, proxy_pool, PASS1_TIMEOUT) for q in all_queries | |
| ) | |
| progress.progress(70, text="Pass 1 complete") | |
| p1_successes = [r for r in p1_raw if isinstance(r, pd.DataFrame)] | |
| p1_failed_queries = [r for r in p1_raw if isinstance(r, str)] | |
| status.text( | |
| f"Pass 1: {len(p1_successes)}/{len(all_queries)} succeeded, " | |
| f"{len(p1_failed_queries)} failed" | |
| ) | |
| # ============================================= | |
| # PASS 2 | |
| # ============================================= | |
| p2_successes = [] | |
| final_failed = [] | |
| if p1_failed_queries: | |
| progress.progress(75, text=f"Pass 2: Retrying {len(p1_failed_queries)} failures...") | |
| fetch_free_proxies.clear() | |
| proxy_pool = fetch_free_proxies() | |
| def _retry_one(query): | |
| time.sleep(random.uniform(1.0, 3.0)) | |
| return scrape_pass2(query, proxy_pool, retries=PASS2_RETRIES) | |
| with ThreadPoolExecutor(max_workers=PASS2_WORKERS) as executor: | |
| future_to_q = {executor.submit(_retry_one, q): q for q in p1_failed_queries} | |
| for future in as_completed(future_to_q): | |
| query = future_to_q[future] | |
| try: | |
| df = future.result() | |
| p2_successes.append(df) | |
| except Exception as e: | |
| final_failed.append({"query": query, "error": str(e)}) | |
| status.text( | |
| f"Pass 2: recovered {len(p2_successes)}/{len(p1_failed_queries)}" | |
| ) | |
| # ============================================= | |
| # COMBINE & PROCESS | |
| # ============================================= | |
| progress.progress(90, text="Processing results...") | |
| all_dfs = p1_successes + p2_successes | |
| if not all_dfs: | |
| st.error("No results collected. Check network/proxies.") | |
| st.stop() | |
| df_raw = pd.concat(all_dfs, ignore_index=True) | |
| df_raw = extract_features(df_raw) | |
| df_best = rank_best_per_query(df_raw) | |
| df_clean = clean_and_combine(df_best, df_work) | |
| progress.progress(100, text="Done!") | |
| # ============================================= | |
| # RESULTS | |
| # ============================================= | |
| total = len(df_clean) | |
| sqft_vals = pd.to_numeric(df_clean['Square Footage'], errors='coerce').fillna(0) | |
| has_sqft = (sqft_vals > 0).sum() | |
| in_range = ((sqft_vals >= 100) & (sqft_vals <= 8000)).sum() | |
| st.subheader("Results") | |
| mc1, mc2, mc3, mc4 = st.columns(4) | |
| mc1.metric("Total Addresses", total) | |
| mc2.metric("With Sq Ft", f"{has_sqft} ({has_sqft/total*100:.0f}%)" if total else "0") | |
| mc3.metric("In 100-8000 Range", f"{in_range} ({in_range/total*100:.0f}%)" if total else "0") | |
| mc4.metric("Failed Queries", len(final_failed)) | |
| # Display with clickable URL column in scrollable container | |
| df_display = df_clean.copy() | |
| df_display['url'] = df_display['url'].apply(_make_clickable) | |
| table_html = df_display.to_html(escape=False, index=False, classes="result-table") | |
| st.markdown( | |
| f""" | |
| <div style="max-height: 500px; overflow-y: auto; border: 1px solid #ddd; border-radius: 4px;"> | |
| {table_html} | |
| </div> | |
| <style> | |
| .result-table {{ | |
| width: 100%; | |
| border-collapse: collapse; | |
| }} | |
| .result-table th {{ | |
| position: sticky; | |
| top: 0; | |
| background: #f0f2f6; | |
| z-index: 1; | |
| padding: 8px; | |
| border-bottom: 2px solid #ddd; | |
| }} | |
| .result-table td {{ | |
| padding: 6px 8px; | |
| border-bottom: 1px solid #eee; | |
| }} | |
| </style> | |
| """, | |
| unsafe_allow_html=True, | |
| ) | |
| # --- Download button (include mapped input columns) --- | |
| input_cols = list(df_input.columns[:4]) | |
| df_download = df_clean.reset_index(drop=True) | |
| df_work_reset = df_work[input_cols].reset_index(drop=True) | |
| for col in input_cols: | |
| if col not in df_download.columns and col in df_work_reset.columns: | |
| df_download[col] = df_work_reset[col] | |
| csv_bytes = df_download.to_csv(index=False, encoding='utf-8-sig').encode('utf-8-sig') | |
| st.download_button( | |
| label="Download Results CSV", | |
| data=csv_bytes, | |
| file_name="address_sqft_results.csv", | |
| mime="text/csv", | |
| ) | |
| # Show failures if any | |
| if final_failed: | |
| with st.expander(f"Failed Queries ({len(final_failed)})"): | |
| st.dataframe(pd.DataFrame(final_failed), use_container_width=True) | |