# saifisvibin's picture
# test
# 8763e10 verified
import io
import re
import time
from dataclasses import dataclass
from typing import Dict, List, Tuple
import pandas as pd
import streamlit as st
from ddgs import DDGS
# Job titles pre-filled into the "Job titles / specialties" text area.
DEFAULT_TITLES = [
    # Medical affairs roles
    "Medical Director",
    "Regional Medical Director",
    "Head of Medical Affairs",
    "Medical Cluster Head",
    # Marketing roles
    "Marketing Manager",
    "Senior Marketing Manager",
    "Marketing Director",
    "Regional Marketing Manager",
    "Global Marketing Director",
    "Head of Marketing",
]
# Locations pre-filled into the "Locations" text area.
DEFAULT_LOCATIONS = [
    "Bahrain",
    "Kuwait",
    "Qatar",
    "Saudi Arabia",
    "Oman",
    "Egypt",
    "United Arab Emirates",
]
@dataclass
class LeadResult:
    """A single LinkedIn profile candidate derived from one search hit.

    Instances are converted to plain dicts through ``__dict__`` by the search
    functions in this file, so the class must keep a per-instance ``__dict__``
    (do not add ``slots=True``).
    """

    # Person name parsed out of the search-result title.
    name: str
    # Job title that was being searched for when this hit was returned.
    title_guess: str
    # First configured location found in the hit's text, or "Unknown".
    location_guess: str
    # Profile URL with query string, fragment and trailing slash removed.
    profile_url: str
    # Raw title of the search result.
    source_title: str
    # Raw body/snippet text of the search result.
    snippet: str
    # Heuristic score; callers in this file pass the confidence_score() value.
    confidence: int
def parse_multiline(text: str) -> List[str]:
    """Split *text* into lines, trim each one, and drop the blank ones.

    Args:
        text: Free-form multi-line input (one entry per line).

    Returns:
        The non-empty, whitespace-trimmed lines in their original order.
    """
    trimmed_lines = (raw_line.strip() for raw_line in text.splitlines())
    return [entry for entry in trimmed_lines if entry]
def build_query(title: str, locations: List[str]) -> str:
    """Build the strict boolean search query for one job title.

    Args:
        title: Job title to match as an exact phrase.
        locations: Location names; each is quoted and the set is OR-ed.

    Returns:
        A query restricted to ``linkedin.com/in`` that requires the quoted
        title and any one of the quoted locations.
    """
    quoted_locations = [f'"{place}"' for place in locations]
    location_clause = " OR ".join(quoted_locations)
    return f'site:linkedin.com/in "{title}" ({location_clause})'
def build_fallback_query(title: str, locations: List[str]) -> str:
    """Build a looser query used alongside the strict boolean one.

    A simpler query helps when the strict boolean query returns too few
    results. Only the first three locations are used, unquoted and
    space-separated.

    Args:
        title: Job title to match as an exact phrase.
        locations: Location names; entries after the third are ignored.

    Returns:
        A query restricted to ``linkedin.com/in`` with the quoted title
        followed by up to three bare location terms.
    """
    leading_locations = locations[:3]
    location_terms = " ".join(leading_locations)
    return f'site:linkedin.com/in "{title}" {location_terms}'
def normalize_linkedin_url(url: str) -> str:
    """Reduce a profile URL to a canonical form used for de-duplication.

    Everything from the first ``?`` (tracking parameters) and from the first
    ``#`` (fragment) is discarded, then trailing slashes are removed.

    Args:
        url: URL as returned by the search provider.

    Returns:
        The trimmed URL; an empty string stays empty.
    """
    before_query, _, _ = url.partition("?")
    before_fragment, _, _ = before_query.partition("#")
    return before_fragment.rstrip("/")
def extract_name(result_title: str) -> str:
    """Extract the person's name from a search-result title.

    The common format is ``"Name - Title | LinkedIn"``. The trailing
    ``| LinkedIn ...`` suffix is removed first, then everything from the first
    dash separator onwards is dropped. The separator may be an ASCII hyphen,
    an en dash or an em dash, as long as it has whitespace on both sides, so
    hyphenated names such as ``"Al-Sayed"`` are left intact.

    Args:
        result_title: Title text of the search result.

    Returns:
        The extracted name, or ``"Unknown"`` when nothing is left.
    """
    cleaned = re.sub(r"\s*\|\s*LinkedIn.*$", "", result_title, flags=re.I).strip()
    # Whitespace is required around the dash so in-name hyphens never split.
    name = re.split(r"\s+[-\u2013\u2014]\s+", cleaned, maxsplit=1)[0].strip()
    return name or "Unknown"
def best_location(snippet_text: str, locations: List[str]) -> str:
    """Return the first configured location mentioned in *snippet_text*.

    Matching is case-insensitive and anchored on word boundaries, so a
    location only matches when it appears as a whole word or phrase. Plain
    substring matching would report "Oman" for text that merely contains
    "woman" or "Romania".

    Args:
        snippet_text: Search-result text to scan.
        locations: Candidate location names, checked in the given order.

    Returns:
        The first entry of *locations* found in the text, or ``"Unknown"``.
    """
    for loc in locations:
        if not loc:
            # An empty candidate carries no information; never report it.
            continue
        # Lookarounds instead of \b so names that start or end with
        # punctuation (e.g. "U.A.E.") are still anchored correctly.
        pattern = rf"(?<!\w){re.escape(loc)}(?!\w)"
        if re.search(pattern, snippet_text, flags=re.IGNORECASE):
            return loc
    return "Unknown"
def confidence_score(snippet: str, source_title: str, location: str) -> int:
    """Compute a heuristic 0-100 confidence for a search hit.

    Starts from a base of 40 and adds a bonus for each signal present.

    Args:
        snippet: Text of the search hit to inspect.
        source_title: Phrase expected to appear in *snippet*; callers in this
            file pass the job title that was searched for.
        location: Detected location, or ``"Unknown"`` when none was found.

    Returns:
        The score, capped at 100.
    """
    haystack = snippet.lower()
    # (signal present?, points awarded)
    bonuses = (
        (source_title.lower() in haystack, 35),
        (location != "Unknown", 20),
        ("linkedin" in haystack, 5),
    )
    total = 40 + sum(points for present, points in bonuses if present)
    return min(total, 100)
def fetch_query_with_retry(query: str, max_results: int, retries: int = 3) -> List[dict]:
    """Run one text search, retrying with a growing delay on failure.

    Args:
        query: Search query string.
        max_results: Maximum number of results requested from the provider.
        retries: Total number of attempts to make.

    Returns:
        The provider's results as a list of dicts.

    Raises:
        RuntimeError: When every attempt failed (or ``retries`` allowed no
            attempt at all). The message is the text of the last failure,
            falling back to ``"Search provider unavailable"``; the last
            underlying exception is attached as ``__cause__``.
    """
    last_exc = None
    for attempt in range(retries):
        try:
            with DDGS(timeout=20) as ddgs:
                return list(ddgs.text(query, max_results=max_results))
        except Exception as exc:  # any provider failure is treated as retryable
            # Keep the exception object so the final error can be chained.
            last_exc = exc
            if attempt < retries - 1:
                # Linear back-off: 1.2 s, 2.4 s, ... between attempts.
                time.sleep(1.2 * (attempt + 1))
    message = str(last_exc) if last_exc is not None else ""
    raise RuntimeError(message or "Search provider unavailable") from last_exc
@st.cache_data(ttl=900, show_spinner=False)
def run_search_cached(
    titles: Tuple[str, ...], locations: Tuple[str, ...], per_title_limit: int
) -> tuple[List[dict], Dict[str, object]]:
    """Search for profiles for every title and collect unique leads.

    For each title a strict query is tried first and then a fallback query.
    The fallback is skipped once the title has gathered at least
    ``per_title_limit`` new profiles. Profiles are de-duplicated across all
    titles by normalized URL. The whole return value, including any recorded
    errors, is cached by the decorator for 900 seconds per argument set.

    Args:
        titles: Job titles to search for.
        locations: Location names used in the queries and for location guessing.
        per_title_limit: Results requested per query and the per-title
            threshold for skipping the fallback query.

    Returns:
        A pair ``(rows, meta)``. ``rows`` holds the leads as dicts sorted by
        descending confidence. ``meta`` has ``"query_counts"`` (accepted
        profiles per query) and ``"errors"`` (error text per failed query).
    """
    location_list = list(locations)
    leads: List[LeadResult] = []
    known_profiles = set()
    hits_per_query: Dict[str, int] = {}
    failures: Dict[str, str] = {}

    for title in titles:
        strict_query = build_query(title, location_list)
        relaxed_query = build_fallback_query(title, location_list)
        collected_for_title = 0

        for query in (strict_query, relaxed_query):
            try:
                results = fetch_query_with_retry(query, max_results=per_title_limit)
            except Exception as exc:
                # Record the failure and move on; the caller can retry later.
                failures[query] = str(exc)
                hits_per_query[query] = 0
                continue

            accepted = 0
            for item in results:
                raw_url = item.get("href", "")
                # Only personal profile pages count as leads.
                if "linkedin.com/in/" not in raw_url.lower():
                    continue
                profile_url = normalize_linkedin_url(raw_url)
                if profile_url in known_profiles:
                    continue
                known_profiles.add(profile_url)

                result_title = item.get("title", "")
                result_body = item.get("body", "")
                text_blob = f"{result_title} {result_body}"
                location = best_location(text_blob, location_list)
                leads.append(
                    LeadResult(
                        name=extract_name(result_title),
                        title_guess=title,
                        location_guess=location,
                        profile_url=profile_url,
                        source_title=result_title,
                        snippet=result_body,
                        confidence=confidence_score(text_blob, title, location),
                    )
                )
                accepted += 1
                collected_for_title += 1

            # Light throttle to reduce provider blocking on shared HF IPs.
            time.sleep(0.15)
            hits_per_query[query] = accepted
            if collected_for_title >= per_title_limit:
                break

    leads.sort(key=lambda lead: lead.confidence, reverse=True)
    meta: Dict[str, object] = {"query_counts": hits_per_query, "errors": failures}
    rows = [vars(lead) for lead in leads]
    return rows, meta
def retry_failed_queries(
    failed_queries: List[str],
    titles: List[str],
    locations: List[str],
    per_title_limit: int,
    existing_rows: List[dict],
) -> tuple[List[dict], Dict[str, object]]:
    """Re-run previously failed queries and merge new leads into existing ones.

    Each query is attempted up to four times. Profiles already present in
    ``existing_rows`` (compared by normalized URL) are not added again. The
    title for a query is recovered by rebuilding the strict and fallback
    queries for every title; queries that match none get ``"Unknown"``.

    Args:
        failed_queries: Query strings to retry.
        titles: Job titles the queries were originally built from.
        locations: Location names the queries were originally built from.
        per_title_limit: Results requested per query.
        existing_rows: Lead dicts gathered so far; not modified.

    Returns:
        A pair ``(rows, meta)``. ``rows`` is a new list with existing and new
        leads sorted by descending confidence. ``meta`` has ``"query_counts"``,
        ``"errors"`` (queries that failed again) and ``"added"`` (number of
        new leads).
    """
    known_profiles = set()
    for row in existing_rows:
        stored_url = row.get("profile_url")
        if stored_url:
            known_profiles.add(normalize_linkedin_url(stored_url))

    # Map every query string that could have been issued back to its title.
    query_to_title = {}
    for title in titles:
        query_to_title.update(
            {
                build_query(title, locations): title,
                build_fallback_query(title, locations): title,
            }
        )

    added_rows: List[dict] = []
    hits_per_query: Dict[str, int] = {}
    failures: Dict[str, str] = {}

    for query in failed_queries:
        matched_title = query_to_title.get(query, "Unknown")
        try:
            results = fetch_query_with_retry(query, max_results=per_title_limit, retries=4)
        except Exception as exc:
            failures[query] = str(exc)
            hits_per_query[query] = 0
            continue

        accepted = 0
        for item in results:
            raw_url = item.get("href", "")
            # Only personal profile pages count as leads.
            if "linkedin.com/in/" not in raw_url.lower():
                continue
            profile_url = normalize_linkedin_url(raw_url)
            if profile_url in known_profiles:
                continue
            known_profiles.add(profile_url)

            result_title = item.get("title", "")
            result_body = item.get("body", "")
            text_blob = f"{result_title} {result_body}"
            location = best_location(text_blob, locations)
            lead = LeadResult(
                name=extract_name(result_title),
                title_guess=matched_title,
                location_guess=location,
                profile_url=profile_url,
                source_title=result_title,
                snippet=result_body,
                confidence=confidence_score(text_blob, matched_title, location),
            )
            added_rows.append(vars(lead))
            accepted += 1

        # Brief pause between queries to go easy on the search provider.
        time.sleep(0.15)
        hits_per_query[query] = accepted

    combined_rows = existing_rows + added_rows
    combined_rows.sort(key=lambda row: row.get("confidence", 0), reverse=True)
    summary = {
        "query_counts": hits_per_query,
        "errors": failures,
        "added": len(added_rows),
    }
    return combined_rows, summary
def to_csv_bytes(df: pd.DataFrame) -> bytes:
    """Serialize *df* to CSV (without the index) as UTF-8 encoded bytes.

    Args:
        df: Frame to export.

    Returns:
        The CSV text encoded as UTF-8.
    """
    # With no target given, to_csv returns the CSV text directly.
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
# Page chrome: browser-tab title/icon, wide layout and the main heading.
# The magnifying-glass emoji (U+1F50E) is written as an escape so it cannot
# be corrupted again by a wrong file encoding.
st.set_page_config(
    page_title="LinkedIn Lead Finder (Free MVP)",
    page_icon="\U0001F50E",
    layout="wide",
)
st.title("\U0001F50E LinkedIn Lead Finder")
# --- Sidebar settings -------------------------------------------------------
st.sidebar.header("Settings")
per_title_limit = st.sidebar.slider(
    "Results per title",
    min_value=5,
    max_value=30,
    value=12,
    help="More results are slower.",
)
show_debug = st.sidebar.checkbox("Show debug", value=False)

# --- Main inputs: titles on the left, locations on the right ----------------
col1, col2 = st.columns(2)
titles_text = col1.text_area(
    "Job titles / specialties (one per line)",
    value="\n".join(DEFAULT_TITLES),
    height=260,
)
locations_text = col2.text_area(
    "Locations (one per line)",
    value="\n".join(DEFAULT_LOCATIONS),
    height=260,
)

# True only on the script run triggered by clicking the button.
run_btn = st.button("Find Profiles", type="primary", use_container_width=True)
# Seed the per-session storage on first run so later code can read these keys
# unconditionally. Factories are used so every session gets fresh containers.
for _state_key, _make_default in (
    ("lead_rows", list),
    ("query_debug", dict),
    ("query_errors", dict),
    ("titles_used", list),
    ("locations_used", list),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
# Handle a click on "Find Profiles": validate inputs, search, store results.
if run_btn:
    titles = parse_multiline(titles_text)
    locations = parse_multiline(locations_text)
    if not titles or not locations:
        st.error("Please provide at least one title and one location.")
    else:
        with st.spinner("Searching and filtering profiles..."):
            lead_rows, meta = run_search_cached(tuple(titles), tuple(locations), per_title_limit)
        if meta["errors"]:
            # run_search_cached memoizes its whole return value, failures
            # included. Drop the cache after a run with provider errors so the
            # next click performs a real search instead of replaying them.
            run_search_cached.clear()
        # Persist the outcome so it survives reruns triggered by other widgets.
        st.session_state.lead_rows = lead_rows
        st.session_state.query_debug = meta["query_counts"]
        st.session_state.query_errors = meta["errors"]
        # Remember the inputs used so failed queries can be mapped back later.
        st.session_state.titles_used = titles
        st.session_state.locations_used = locations
# Offer a retry button only while failed queries are recorded; the button is
# not rendered at all when the error map is empty (short-circuit).
if st.session_state.query_errors and st.button(
    "Retry failed queries", use_container_width=True
):
    with st.spinner("Retrying failed queries..."):
        merged_rows, retry_meta = retry_failed_queries(
            failed_queries=list(st.session_state.query_errors),
            titles=st.session_state.titles_used,
            locations=st.session_state.locations_used,
            per_title_limit=per_title_limit,
            existing_rows=st.session_state.lead_rows,
        )
    st.session_state.lead_rows = merged_rows
    st.session_state.query_debug.update(retry_meta["query_counts"])
    # Only queries that failed again remain listed as errors.
    st.session_state.query_errors = retry_meta["errors"]
    added_count = retry_meta["added"]
    if added_count > 0:
        st.success(f"Added {added_count} profiles from retries.")
# Render from session state so results persist across reruns.
lead_rows = st.session_state.lead_rows
query_debug = st.session_state.query_debug
query_errors = st.session_state.query_errors

if lead_rows:
    df = pd.DataFrame(lead_rows)
    st.success(f"Found {len(df)} unique profiles.")

    # Internal field name -> column header shown in the table, in display order.
    display_columns = {
        "name": "Name",
        "title_guess": "Matched Title",
        "location_guess": "Location",
        "profile_url": "LinkedIn",
        "confidence": "Confidence",
    }
    table_df = df.rename(columns=display_columns)[list(display_columns.values())]

    link_column = st.column_config.LinkColumn(
        "LinkedIn",
        display_text="Open Profile",
    )
    confidence_column = st.column_config.ProgressColumn(
        "Confidence",
        min_value=0,
        max_value=100,
    )
    st.dataframe(
        table_df,
        use_container_width=True,
        hide_index=True,
        column_config={
            "LinkedIn": link_column,
            "Confidence": confidence_column,
        },
    )

    # The download contains every field, not just the displayed columns.
    st.download_button(
        "Download CSV",
        data=to_csv_bytes(df),
        file_name="linkedin_leads_mvp.csv",
        mime="text/csv",
        use_container_width=True,
    )
elif query_errors:
    st.error("Search provider is temporarily limited. Please retry in a minute.")
elif run_btn:
    st.warning("No profiles found. Try broader titles or increase results per title.")
# Optional diagnostics: per-query hit counts and recorded provider errors.
# NOTE(review): the original nesting level of this block could not be
# recovered; it is treated as top-level here - confirm.
if show_debug:
    debug_payload = {"query_counts": query_debug, "errors": query_errors}
    with st.expander("Debug"):
        st.json(debug_payload)