address_web_scraper / src /streamlit_app.py
mattritchey's picture
Update src/streamlit_app.py
c5eb6ed verified
# -*- coding: utf-8 -*-
"""
proxy_sqft_app_2.py – Streamlit UI for Address Square Footage Scraper (v2)
Upload a CSV/Excel with address columns, scrape StartPage via proxies,
extract sqft/beds/baths/year built, display results with clickable URLs,
and download as CSV.
"""
import re
import time
import random
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, unquote
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from joblib import Parallel, delayed
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
datefmt='%H:%M:%S',
)
log = logging.getLogger(__name__)
# =============================================================================
# CONSTANTS
# =============================================================================
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.95 Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 17_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.124 Safari/537.36",
"Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Mobile Safari/537.36",
]
FREE_PROXY_SOURCES = [
"https://www.proxy-list.download/api/v1/get?type=http",
"https://www.proxy-list.download/api/v1/get?type=https",
"https://raw.githubusercontent.com/TheSpeedX/PROXY-List/master/http.txt",
"https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt",
"https://openproxylist.xyz/http.txt",
]
PASS1_JOBS = 512
PASS1_TIMEOUT = 10
PASS2_WORKERS = 10
PASS2_RETRIES = 3
# =============================================================================
# PROXY HELPERS
# =============================================================================
@st.cache_data(ttl=300, show_spinner=False)
def fetch_free_proxies():
proxies = set()
for url in FREE_PROXY_SOURCES:
try:
text = requests.get(url, timeout=10, verify=False).text
for line in text.strip().splitlines():
line = line.strip()
if ':' in line and not line.startswith('#'):
proxies.add("http://" + line)
except Exception:
continue
return list(proxies)
def random_proxy(pool):
return random.choice(pool)
# =============================================================================
# SCRAPER FUNCTIONS
# =============================================================================
def _parse_startpage_html(html: str, query: str) -> pd.DataFrame:
soup = BeautifulSoup(html, "html.parser")
results = []
for item in soup.select("div.result"):
a_tag = (
item.select_one("a.result-title")
or item.select_one("h2 a")
or item.select_one("a[class*='result']")
)
desc_tag = (
item.select_one("p[class*='ogs6b8']")
or item.select_one("p.description")
or item.select_one("p[class*='snippet']")
)
if not a_tag or not a_tag.get("href"):
continue
raw_url = a_tag["href"]
if raw_url.startswith("/cgi-bin/jump"):
real_url = (
unquote(raw_url.split("url=")[-1].split("&")[0])
if "url=" in raw_url else raw_url
)
elif a_tag.get("data-clickurl"):
real_url = unquote(a_tag["data-clickurl"])
else:
real_url = urljoin("https://www.startpage.com", raw_url)
if not real_url.startswith("http"):
real_url = urljoin("https://www.startpage.com", real_url)
title = a_tag.get_text(strip=True)
description = desc_tag.get_text(strip=True) if desc_tag else ""
results.append({"url": real_url, "description": description or title})
df = pd.DataFrame(results, columns=["url", "description"])
df["search_result_number"] = range(1, len(df) + 1)
df["query"] = query
return df
def scrape_pass1(query: str, proxy_pool, timeout=10):
try:
url = "https://www.startpage.com/do/search"
data = {
"query": query, "cat": "web", "cmd": "process_search",
"language": "english", "engine0": "v1all", "pg": "0",
}
proxy = random_proxy(proxy_pool)
headers = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9"}
response = requests.post(
url, data=data, headers=headers,
timeout=timeout, verify=False, proxies={"http": proxy},
)
response.raise_for_status()
return _parse_startpage_html(response.text, query)
except Exception:
return query
def scrape_pass2(query: str, proxy_pool, timeout=15, retries=3, backoff=5):
url = "https://www.startpage.com/do/search"
post_data = {
"query": query, "cat": "web", "cmd": "process_search",
"language": "english", "engine0": "v1all", "pg": "0",
}
last_error = None
for attempt in range(1, retries + 1):
proxy = random_proxy(proxy_pool)
headers = {"User-Agent": random.choice(USER_AGENTS), "Accept-Language": "en-US,en;q=0.9"}
try:
response = requests.post(
url, data=post_data, headers=headers,
timeout=timeout, verify=False,
proxies={"http": proxy, "https": proxy},
)
response.raise_for_status()
return _parse_startpage_html(response.text, query)
except Exception as e:
last_error = e
if attempt < retries:
time.sleep(backoff * (3 ** (attempt - 1)))
raise RuntimeError(f"Failed after {retries} attempts: {last_error}")
# =============================================================================
# FEATURE EXTRACTION & RANKING
# =============================================================================
def extract_features(df_final):
desc = df_final['description'].astype(str).str.lower()
sqft_raw = desc.str.extract(
r'(\d{1,3}(?:,\d{3})*|\d+)'
r'\s*[-–]?\s*'
r'(?:sq\.?\s*(?:ft\.?|feet)|square\s*(?:ft\.?|feet|foot)|sqft|sf\b)',
flags=re.IGNORECASE,
)[0]
sqft_clean = (
sqft_raw.str.replace(',', '', regex=False)
.pipe(pd.to_numeric, errors='coerce').fillna(0).astype(int)
)
sqft_clean = sqft_clean.where((sqft_clean >= 100) & (sqft_clean <= 500000), 0)
df_final['Square Footage'] = sqft_clean
df_final['Beds'] = desc.str.replace(r'[-–]', ' ', regex=True).str.extract(r'(\d+)\s*bed')[0]
df_final['Baths'] = (
desc.str.replace(r'[-–]', ' ', regex=True)
.str.extract(r'(\d+(?:\.\d+)?)\s*bath')[0].astype(float)
)
df_final['Year Built'] = desc.str.extract(r'built\s+in\s+(\d{4})')[0]
return df_final
def rank_best_per_query(df_2):
df_2['Square Footage'] = pd.to_numeric(df_2['Square Footage'], errors='coerce').fillna(0).astype(int)
df_2['search_result_number'] = pd.to_numeric(df_2['search_result_number'], errors='coerce').fillna(999999).astype(int)
df_2['has_sqft'] = (df_2['Square Footage'] > 0).astype(int)
df_2['sqft_reasonable'] = ((df_2['Square Footage'] >= 100) & (df_2['Square Footage'] <= 50000)).astype(int)
re_sites = r'zillow|redfin|realtor\.com|trulia|homes\.com|remax|century21|coldwellbanker|movoto|homesnap'
df_2['from_re_site'] = df_2['url'].astype(str).str.lower().str.contains(re_sites, regex=True).astype(int)
df_sorted = df_2.reset_index().sort_values(
by=['query', 'has_sqft', 'sqft_reasonable', 'from_re_site', 'search_result_number', 'index'],
ascending=[True, False, False, False, True, True],
)
df_best = df_sorted.drop_duplicates(subset='query', keep='first').drop(
columns=['has_sqft', 'sqft_reasonable', 'from_re_site', 'index'],
)
return df_best
def clean_and_combine(df_best, df_input):
df_merged = pd.merge(df_best, df_input, on='query', how='inner')
addr_candidates = ["Address", "Asset Address Line1", "Asset Address", "Street"]
city_candidates = ["City", "Asset City"]
state_candidates = ["State", "Asset State"]
zip_candidates = ["Zip", "ZIP", "Asset Zip"]
def _find_col(df, candidates):
for c in candidates:
if c in df.columns:
return c
return None
addr_col = _find_col(df_merged, addr_candidates)
city_col = _find_col(df_merged, city_candidates)
state_col = _find_col(df_merged, state_candidates)
zip_col = _find_col(df_merged, zip_candidates)
if all([addr_col, city_col, state_col, zip_col]):
df_merged = df_merged.rename(columns={
addr_col: "Address", city_col: "City", state_col: "State", zip_col: "Zip",
})
df_merged["Full Address"] = (
df_merged["Address"].astype(str).str.strip() + " " +
df_merged["City"].astype(str).str.strip() + " " +
df_merged["State"].astype(str).str.strip() + " " +
df_merged["Zip"].astype(str).str.strip()
).str.replace(r"\s+", " ", regex=True).str.strip()
final_cols = ["Full Address", "Address", "City", "State", "Zip",
"Square Footage", "Beds", "Baths", "Year Built", "url"]
else:
df_merged["Full Address"] = (
df_merged["query"]
.str.replace(r"\s*square\s*footage\s*$", "", flags=re.IGNORECASE, regex=True)
.str.strip()
)
final_cols = ["Full Address", "Square Footage", "Beds", "Baths", "Year Built", "url"]
for col in final_cols:
if col not in df_merged.columns:
df_merged[col] = pd.NA
return df_merged[final_cols].copy()
def _make_clickable(url):
"""Convert a URL to an HTML hyperlink for Streamlit display."""
if pd.isna(url) or not str(url).startswith("http"):
return url
return f'<a href="{url}" target="_blank">{url}</a>'
# =============================================================================
# STREAMLIT APP
# =============================================================================
st.set_page_config(page_title="Address SqFt Scraper", layout="wide")
st.title("Address Square Footage Scraper")
st.caption("Upload a CSV or Excel file with address columns. The app searches StartPage via proxies and extracts property details.")
# --- File upload ---
uploaded = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
if uploaded is not None:
if uploaded.name.endswith(".csv"):
df_input = pd.read_csv(uploaded)
else:
df_input = pd.read_excel(uploaded)
# --- Column mapping ---
st.subheader("Map Columns")
all_cols = ["(auto-detect)"] + list(df_input.columns)
col1, col2, col3, col4 = st.columns(4)
with col1:
addr_col = st.selectbox("Address", all_cols, index=0)
with col2:
city_col = st.selectbox("City", all_cols, index=0)
with col3:
state_col = st.selectbox("State", all_cols, index=0)
with col4:
zip_col = st.selectbox("Zip", all_cols, index=0)
if any(c != "(auto-detect)" for c in [addr_col, city_col, state_col, zip_col]):
rename_map = {}
if addr_col != "(auto-detect)":
rename_map[addr_col] = "Address"
if city_col != "(auto-detect)":
rename_map[city_col] = "City"
if state_col != "(auto-detect)":
rename_map[state_col] = "State"
if zip_col != "(auto-detect)":
rename_map[zip_col] = "Zip"
df_input = df_input.rename(columns=rename_map)
st.subheader("Uploaded Data Preview")
st.dataframe(df_input.head(20), use_container_width=True)
st.info(f"{len(df_input)} rows loaded")
# --- Run button ---
if st.button("Run Scraper", type="primary", use_container_width=True):
df_work = df_input.copy()
df_work['query'] = df_work.iloc[:, :4].astype(str).apply(' '.join, axis=1)
df_work['query'] = df_work['query'] + ' square footage'
all_queries = df_work['query'].tolist()
st.info(f"Scraping {len(all_queries)} addresses...")
# --- Load proxies ---
status = st.empty()
status.text("Loading proxy pool...")
proxy_pool = fetch_free_proxies()
status.text(f"Proxy pool: {len(proxy_pool)} proxies")
if not proxy_pool:
st.error("No proxies found. Check network connectivity.")
st.stop()
# =============================================
# PASS 1
# =============================================
progress = st.progress(0, text="Pass 1: Searching...")
p1_raw = Parallel(n_jobs=PASS1_JOBS, prefer="threads")(
delayed(scrape_pass1)(q, proxy_pool, PASS1_TIMEOUT) for q in all_queries
)
progress.progress(70, text="Pass 1 complete")
p1_successes = [r for r in p1_raw if isinstance(r, pd.DataFrame)]
p1_failed_queries = [r for r in p1_raw if isinstance(r, str)]
status.text(
f"Pass 1: {len(p1_successes)}/{len(all_queries)} succeeded, "
f"{len(p1_failed_queries)} failed"
)
# =============================================
# PASS 2
# =============================================
p2_successes = []
final_failed = []
if p1_failed_queries:
progress.progress(75, text=f"Pass 2: Retrying {len(p1_failed_queries)} failures...")
fetch_free_proxies.clear()
proxy_pool = fetch_free_proxies()
def _retry_one(query):
time.sleep(random.uniform(1.0, 3.0))
return scrape_pass2(query, proxy_pool, retries=PASS2_RETRIES)
with ThreadPoolExecutor(max_workers=PASS2_WORKERS) as executor:
future_to_q = {executor.submit(_retry_one, q): q for q in p1_failed_queries}
for future in as_completed(future_to_q):
query = future_to_q[future]
try:
df = future.result()
p2_successes.append(df)
except Exception as e:
final_failed.append({"query": query, "error": str(e)})
status.text(
f"Pass 2: recovered {len(p2_successes)}/{len(p1_failed_queries)}"
)
# =============================================
# COMBINE & PROCESS
# =============================================
progress.progress(90, text="Processing results...")
all_dfs = p1_successes + p2_successes
if not all_dfs:
st.error("No results collected. Check network/proxies.")
st.stop()
df_raw = pd.concat(all_dfs, ignore_index=True)
df_raw = extract_features(df_raw)
df_best = rank_best_per_query(df_raw)
df_clean = clean_and_combine(df_best, df_work)
progress.progress(100, text="Done!")
# =============================================
# RESULTS
# =============================================
total = len(df_clean)
sqft_vals = pd.to_numeric(df_clean['Square Footage'], errors='coerce').fillna(0)
has_sqft = (sqft_vals > 0).sum()
in_range = ((sqft_vals >= 100) & (sqft_vals <= 8000)).sum()
st.subheader("Results")
mc1, mc2, mc3, mc4 = st.columns(4)
mc1.metric("Total Addresses", total)
mc2.metric("With Sq Ft", f"{has_sqft} ({has_sqft/total*100:.0f}%)" if total else "0")
mc3.metric("In 100-8000 Range", f"{in_range} ({in_range/total*100:.0f}%)" if total else "0")
mc4.metric("Failed Queries", len(final_failed))
# Display with clickable URL column in scrollable container
df_display = df_clean.copy()
df_display['url'] = df_display['url'].apply(_make_clickable)
table_html = df_display.to_html(escape=False, index=False, classes="result-table")
st.markdown(
f"""
<div style="max-height: 500px; overflow-y: auto; border: 1px solid #ddd; border-radius: 4px;">
{table_html}
</div>
<style>
.result-table {{
width: 100%;
border-collapse: collapse;
}}
.result-table th {{
position: sticky;
top: 0;
background: #f0f2f6;
z-index: 1;
padding: 8px;
border-bottom: 2px solid #ddd;
}}
.result-table td {{
padding: 6px 8px;
border-bottom: 1px solid #eee;
}}
</style>
""",
unsafe_allow_html=True,
)
# --- Download button (include mapped input columns) ---
input_cols = list(df_input.columns[:4])
df_download = df_clean.reset_index(drop=True)
df_work_reset = df_work[input_cols].reset_index(drop=True)
for col in input_cols:
if col not in df_download.columns and col in df_work_reset.columns:
df_download[col] = df_work_reset[col]
csv_bytes = df_download.to_csv(index=False, encoding='utf-8-sig').encode('utf-8-sig')
st.download_button(
label="Download Results CSV",
data=csv_bytes,
file_name="address_sqft_results.csv",
mime="text/csv",
)
# Show failures if any
if final_failed:
with st.expander(f"Failed Queries ({len(final_failed)})"):
st.dataframe(pd.DataFrame(final_failed), use_container_width=True)