# Source captured from a Hugging Face Spaces page; the Space status banner read "Runtime error".
import io
import re
import time
from dataclasses import asdict, dataclass
from typing import Dict, List, Tuple

import pandas as pd
import streamlit as st
from ddgs import DDGS
# Job titles pre-filled into the "Job titles / specialties" text area.
DEFAULT_TITLES = [
    "Medical Director",
    "Regional Medical Director",
    "Head of Medical Affairs",
    "Medical Cluster Head",
    "Marketing Manager",
    "Senior Marketing Manager",
    "Marketing Director",
    "Regional Marketing Manager",
    "Global Marketing Director",
    "Head of Marketing",
]
# Locations pre-filled into the "Locations" text area.
DEFAULT_LOCATIONS = [
    "Bahrain",
    "Kuwait",
    "Qatar",
    "Saudi Arabia",
    "Oman",
    "Egypt",
    "United Arab Emirates",
]
@dataclass
class LeadResult:
    """One candidate LinkedIn profile built from a single search-engine hit.

    The ``@dataclass`` decorator is required: instances are created with
    keyword arguments, which a plain class with bare annotations rejects.
    Instances are converted to plain dicts before being stored or shown, so
    the class must keep a regular per-instance ``__dict__`` (no slots).
    """

    name: str  # person name parsed from the search result title
    title_guess: str  # the job title whose query produced this hit
    location_guess: str  # first configured location found in the text, or "Unknown"
    profile_url: str  # LinkedIn profile URL without query string or fragment
    source_title: str  # raw title of the search result
    snippet: str  # raw body/snippet of the search result
    confidence: int  # heuristic score, capped at 100
def parse_multiline(text: str) -> List[str]:
    """Split *text* into trimmed, non-empty, unique lines.

    Lines keep their first-seen order. Exact duplicates are dropped so the
    same title or location is not searched twice.
    """
    stripped = (line.strip() for line in text.splitlines())
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(line for line in stripped if line))
def build_query(title: str, locations: List[str]) -> str:
    """Build the strict search query for *title* restricted to *locations*.

    The title is quoted for an exact-phrase match and the locations are
    quoted and OR-ed together inside parentheses. With no locations the
    location clause is omitted instead of emitting an empty ``()`` group.
    """
    base = f'site:linkedin.com/in "{title}"'
    if not locations:
        return base
    loc_clause = " OR ".join(f'"{loc}"' for loc in locations)
    return f"{base} ({loc_clause})"
def build_fallback_query(
    title: str, locations: List[str], max_locations: int = 3
) -> str:
    """Build a looser query used when the strict boolean query finds too little.

    Only the first *max_locations* locations are appended, unquoted and
    space-separated. With no locations the result has no trailing space.
    """
    # Simpler query helps when strict boolean query returns too few results.
    parts = [f'site:linkedin.com/in "{title}"']
    parts.extend(locations[: max(max_locations, 0)])
    return " ".join(parts)
def normalize_linkedin_url(url: str) -> str:
    """Return *url* without query string, fragment, or trailing slashes.

    The result is used as the de-duplication key for profiles, so
    surrounding whitespace is trimmed too.
    """
    # Remove tracking params and keep canonical profile URL.
    without_query = url.strip().split("?", 1)[0]
    without_fragment = without_query.split("#", 1)[0]
    return without_fragment.rstrip("/")
def extract_name(result_title: str) -> str:
    """Extract the person's name from a search result title.

    Drops a trailing "| LinkedIn ..." suffix, then keeps the text before the
    first dash separator. Hyphen, en dash and em dash all count as a
    separator, but only when surrounded by whitespace, so hyphenated names
    stay intact. Returns "Unknown" when nothing is left.
    """
    # Common format: "Name - Title | LinkedIn"
    cleaned = re.sub(r"\s*\|\s*LinkedIn.*$", "", result_title, flags=re.I).strip()
    name = re.split(r"\s+[-\u2013\u2014]\s+", cleaned, maxsplit=1)[0].strip()
    return name or "Unknown"
def best_location(snippet_text: str, locations: List[str]) -> str:
    """Return the first entry of *locations* mentioned in *snippet_text*.

    Matching is case-insensitive and on whole words. A plain substring test
    would report "Oman" for text containing "woman" or "Romania". Returns
    "Unknown" when no location is mentioned.
    """
    for loc in locations:
        needle = loc.strip()
        if not needle:
            # An empty needle would match any text.
            continue
        # Lookarounds instead of \b so locations ending in punctuation still work.
        pattern = rf"(?<!\w){re.escape(needle)}(?!\w)"
        if re.search(pattern, snippet_text, flags=re.IGNORECASE):
            return loc
    return "Unknown"
def confidence_score(snippet: str, source_title: str, location: str) -> int:
    """Score how well a search hit matches the requested title and location.

    Starts at 40, then adds 35 when *source_title* occurs in *snippet*
    (case-insensitive), 20 when *location* is anything other than
    "Unknown", and 5 when the snippet mentions "linkedin". Capped at 100.

    A blank *source_title* earns no title bonus; the empty string is a
    substring of every snippet and would otherwise always match.
    """
    score = 40
    snippet_low = snippet.lower()
    title_low = source_title.strip().lower()
    if title_low and title_low in snippet_low:
        score += 35
    if location != "Unknown":
        score += 20
    if "linkedin" in snippet_low:
        score += 5
    return min(score, 100)
def fetch_query_with_retry(query: str, max_results: int, retries: int = 3) -> List[dict]:
    """Run *query* against the search provider, retrying on any failure.

    Each attempt opens a fresh ``DDGS`` session with a 20-second timeout.
    Between failed attempts the function sleeps 1.2 s times the attempt
    number, with no sleep after the final attempt.

    Raises:
        RuntimeError: when every attempt failed, or *retries* is not
            positive. The message is the last error text, or "Search
            provider unavailable" when there is none. The last underlying
            exception is attached as the cause.
    """
    last_exc = None
    for attempt in range(retries):
        try:
            with DDGS(timeout=20) as ddgs:
                return list(ddgs.text(query, max_results=max_results))
        except Exception as exc:  # any provider failure is retried
            last_exc = exc
            if attempt < retries - 1:
                time.sleep(1.2 * (attempt + 1))
    message = (str(last_exc) if last_exc is not None else "") or "Search provider unavailable"
    raise RuntimeError(message) from last_exc
def run_search_cached(
    titles: Tuple[str, ...], locations: Tuple[str, ...], per_title_limit: int
) -> tuple[List[dict], Dict[str, object]]:
    """Search for LinkedIn profiles for every title and collect unique leads.

    For each title the strict query runs first, then the fallback query. The
    fallback is skipped once the title has produced at least
    *per_title_limit* new profiles. Hits whose URL is not a
    ``linkedin.com/in/`` profile, or whose normalized URL was already seen,
    are dropped.

    Returns:
        ``(rows, meta)``. ``rows`` are the leads as plain dicts, sorted by
        confidence, highest first. ``meta`` has ``"query_counts"`` (query ->
        new profiles accepted) and ``"errors"`` (query -> error text for
        queries that failed after retries).

    NOTE(review): the name and the hashable tuple parameters suggest this was
    meant to sit behind a Streamlit cache decorator; none is present in this
    view. Confirm before adding one.
    """
    location_list = list(locations)  # helpers take lists; convert once
    leads: List[LeadResult] = []
    seen_urls = set()
    query_debug: Dict[str, int] = {}
    errors: Dict[str, str] = {}

    for title in titles:
        queries = [
            build_query(title, location_list),
            build_fallback_query(title, location_list),
        ]
        title_hits = 0
        for query in queries:
            try:
                results = fetch_query_with_retry(query, max_results=per_title_limit)
            except Exception as exc:
                # Best effort: record the failure and move on to the next query.
                errors[query] = str(exc)
                query_debug[query] = 0
                continue

            query_count = 0
            for item in results:
                # "or" also covers keys that are present but None.
                url = item.get("href") or ""
                if "linkedin.com/in/" not in url.lower():
                    continue
                profile_url = normalize_linkedin_url(url)
                if profile_url in seen_urls:
                    continue
                seen_urls.add(profile_url)

                source_title = item.get("title") or ""
                snippet = item.get("body") or ""
                text_blob = f"{source_title} {snippet}"
                location = best_location(text_blob, location_list)
                leads.append(
                    LeadResult(
                        name=extract_name(source_title),
                        title_guess=title,
                        location_guess=location,
                        profile_url=profile_url,
                        source_title=source_title,
                        snippet=snippet,
                        confidence=confidence_score(text_blob, title, location),
                    )
                )
                query_count += 1
                title_hits += 1

            # Light throttle to reduce provider blocking on shared HF IPs.
            time.sleep(0.15)
            query_debug[query] = query_count
            if title_hits >= per_title_limit:
                break

    leads.sort(key=lambda lead: lead.confidence, reverse=True)
    meta: Dict[str, object] = {
        "query_counts": query_debug,
        "errors": errors,
    }
    return [asdict(lead) for lead in leads], meta
def retry_failed_queries(
    failed_queries: List[str],
    titles: List[str],
    locations: List[str],
    per_title_limit: int,
    existing_rows: List[dict],
) -> tuple[List[dict], Dict[str, object]]:
    """Re-run previously failed queries and merge new profiles into the rows.

    Profiles already present in *existing_rows* (compared by normalized
    ``profile_url``) are not added again. Each query is mapped back to the
    title that generated it by rebuilding the strict and fallback queries
    from *titles* and *locations*. A query that cannot be mapped gets the
    title "Unknown". Each query is attempted with four tries.

    Returns:
        ``(rows, meta)``. ``rows`` is a new list holding the existing and
        newly found rows, sorted by confidence, highest first. ``meta`` has
        ``"query_counts"``, ``"errors"`` (queries that failed again) and
        ``"added"`` (number of new rows).
    """
    seen_urls = {
        normalize_linkedin_url(row.get("profile_url", ""))
        for row in existing_rows
        if row.get("profile_url")
    }

    # Reverse lookup: query string -> title it was built from.
    title_lookup: Dict[str, str] = {}
    for title in titles:
        title_lookup[build_query(title, locations)] = title
        title_lookup[build_fallback_query(title, locations)] = title

    new_rows: List[dict] = []
    query_counts: Dict[str, int] = {}
    errors: Dict[str, str] = {}

    for query in failed_queries:
        query_title = title_lookup.get(query, "Unknown")
        try:
            results = fetch_query_with_retry(query, max_results=per_title_limit, retries=4)
        except Exception as exc:
            # Still failing: keep it in the error map so it can be retried again.
            errors[query] = str(exc)
            query_counts[query] = 0
            continue

        count = 0
        for item in results:
            # "or" also covers keys that are present but None.
            url = item.get("href") or ""
            if "linkedin.com/in/" not in url.lower():
                continue
            profile_url = normalize_linkedin_url(url)
            if profile_url in seen_urls:
                continue
            seen_urls.add(profile_url)

            source_title = item.get("title") or ""
            snippet = item.get("body") or ""
            text_blob = f"{source_title} {snippet}"
            location = best_location(text_blob, locations)
            lead = LeadResult(
                name=extract_name(source_title),
                title_guess=query_title,
                location_guess=location,
                profile_url=profile_url,
                source_title=source_title,
                snippet=snippet,
                confidence=confidence_score(text_blob, query_title, location),
            )
            new_rows.append(asdict(lead))
            count += 1

        # Short pause between provider calls.
        time.sleep(0.15)
        query_counts[query] = count

    merged_rows = existing_rows + new_rows
    merged_rows.sort(key=lambda row: row.get("confidence", 0), reverse=True)
    return merged_rows, {"query_counts": query_counts, "errors": errors, "added": len(new_rows)}
def to_csv_bytes(df: pd.DataFrame) -> bytes:
    """Serialize *df* to UTF-8 encoded CSV bytes without the index column."""
    # With no target given, DataFrame.to_csv returns the CSV text directly.
    return df.to_csv(index=False).encode("utf-8")
# Page chrome. The icon literal in the captured source was a mis-encoded
# character, so an emoji shortcode is used instead; it survives re-encoding.
# NOTE(review): original glyph unknown. A magnifying glass is assumed from
# the app name; confirm.
st.set_page_config(
    page_title="LinkedIn Lead Finder (Free MVP)",
    page_icon=":mag:",
    layout="wide",
)
st.title(":mag: LinkedIn Lead Finder")
# Sidebar: search settings.
with st.sidebar:
    st.header("Settings")
    per_title_limit = st.slider(
        "Results per title",
        min_value=5,
        max_value=30,
        value=12,
        help="More results are slower.",
    )
    show_debug = st.checkbox("Show debug", value=False)

# Main area: two side-by-side editors, one entry per line.
col1, col2 = st.columns(2)
with col1:
    titles_text = st.text_area(
        "Job titles / specialties (one per line)",
        value="\n".join(DEFAULT_TITLES),
        height=260,
    )
with col2:
    locations_text = st.text_area(
        "Locations (one per line)",
        value="\n".join(DEFAULT_LOCATIONS),
        height=260,
    )

run_btn = st.button("Find Profiles", type="primary", use_container_width=True)
# Seed session state on first run. The retry button and the results view
# read these keys on every rerun, even before a search has happened.
# Factories are called per key so no mutable default object is shared.
for _state_key, _state_factory in (
    ("lead_rows", list),
    ("query_debug", dict),
    ("query_errors", dict),
    ("titles_used", list),
    ("locations_used", list),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_factory()
# New search: parse the inputs, run all queries, and store the outcome in
# session state so it survives reruns triggered by other widgets.
if run_btn:
    titles = parse_multiline(titles_text)
    locations = parse_multiline(locations_text)
    if not titles or not locations:
        st.error("Please provide at least one title and one location.")
    else:
        with st.spinner("Searching and filtering profiles..."):
            lead_rows, meta = run_search_cached(tuple(titles), tuple(locations), per_title_limit)
        st.session_state.lead_rows = lead_rows
        st.session_state.query_debug = meta["query_counts"]
        st.session_state.query_errors = meta["errors"]
        # Remembered so a later retry can map failed queries back to titles.
        st.session_state.titles_used = titles
        st.session_state.locations_used = locations

# Retry is offered only while the last run left failed queries behind.
if st.session_state.query_errors:
    if st.button("Retry failed queries", use_container_width=True):
        with st.spinner("Retrying failed queries..."):
            merged_rows, retry_meta = retry_failed_queries(
                failed_queries=list(st.session_state.query_errors.keys()),
                titles=st.session_state.titles_used,
                locations=st.session_state.locations_used,
                per_title_limit=per_title_limit,
                existing_rows=st.session_state.lead_rows,
            )
        st.session_state.lead_rows = merged_rows
        st.session_state.query_debug.update(retry_meta["query_counts"])
        # Only queries that failed again stay in the error map.
        st.session_state.query_errors = retry_meta["errors"]
        if retry_meta["added"] > 0:
            st.success(f"Added {retry_meta['added']} profiles from retries.")
# Render whatever is currently stored in session state.
lead_rows = st.session_state.lead_rows
query_debug = st.session_state.query_debug
query_errors = st.session_state.query_errors

if not lead_rows:
    if query_errors:
        st.error("Search provider is temporarily limited. Please retry in a minute.")
    elif run_btn:
        # Warn only right after a search, not on the initial page load.
        st.warning("No profiles found. Try broader titles or increase results per title.")
else:
    df = pd.DataFrame(lead_rows)
    st.success(f"Found {len(df)} unique profiles.")

    # The table shows a subset of columns under friendlier headers.
    table_df = df.rename(
        columns={
            "name": "Name",
            "title_guess": "Matched Title",
            "location_guess": "Location",
            "profile_url": "LinkedIn",
            "confidence": "Confidence",
        }
    )[["Name", "Matched Title", "Location", "LinkedIn", "Confidence"]]

    st.dataframe(
        table_df,
        use_container_width=True,
        hide_index=True,
        column_config={
            "LinkedIn": st.column_config.LinkColumn(
                "LinkedIn",
                display_text="Open Profile",
            ),
            "Confidence": st.column_config.ProgressColumn(
                "Confidence",
                min_value=0,
                max_value=100,
            ),
        },
    )

    # The download uses the full frame, including source title and snippet.
    st.download_button(
        "Download CSV",
        data=to_csv_bytes(df),
        file_name="linkedin_leads_mvp.csv",
        mime="text/csv",
        use_container_width=True,
    )

if show_debug:
    with st.expander("Debug"):
        st.json({"query_counts": query_debug, "errors": query_errors})