import asyncio
import logging
import re
import urllib.parse
from typing import Any, Dict, Optional

import httpx
from bs4 import BeautifulSoup

from src.config import Config

logger = logging.getLogger("CrossSource")

# Manual overrides (lower-cased Medium title incl. trailing post ID -> known free URL).
# Checked before any network search; used for verification/common articles.
KNOWN_MAPPINGS = {
    "visualizing google cloud architecture diagrams with plantuml 5e45253291d3":
        "https://github.com/gammastudios/GCP-C4-PlantUML"
}

# Trailing Medium post ID: a space followed by 10-16 lowercase hex chars at end of title.
_MEDIUM_ID_RE = re.compile(r'\s[a-f0-9]{10,16}$')

# Domains accepted as legitimate free mirrors of a paywalled article.
_FREE_DOMAINS = ("dev.to", "hashnode.com", "github.com", "substack.com")

# Keywords (lower-case) that suggest the article is tech/code related,
# making a GitHub-scoped search worthwhile.
_TECH_KEYWORDS = ("code", "architecture", "tutorial")


def _resolve_link(raw_link: str) -> str:
    """Decode a DuckDuckGo redirect URL to its target and ensure it has a scheme.

    DDG HTML results wrap targets as ``duckduckgo.com/l/?uddg=<encoded-url>``;
    other links are returned as-is (with ``https:`` prepended to protocol-relative
    ``//host/...`` links).
    """
    link = raw_link
    if "duckduckgo.com/l/" in raw_link:
        parsed = urllib.parse.urlparse(raw_link)
        query_params = urllib.parse.parse_qs(parsed.query)
        if "uddg" in query_params:
            link = query_params["uddg"][0]
    if link.startswith("//"):
        link = "https:" + link
    return link


async def find_free_version(title: str, author: str) -> Optional[Dict[str, Any]]:
    """Search for a free version of a Medium article on Dev.to, Hashnode, GitHub, etc.

    Tries multiple query variations and search providers (DuckDuckGo HTML, Bing)
    to maximize the chance of finding a mirror.

    Args:
        title: Article title, possibly suffixed with the Medium post ID.
        author: Author name, used to narrow the search.

    Returns:
        A dict with ``url``, ``title`` and ``source`` keys on success, else ``None``.
    """
    # 0. Known mappings: manual overrides short-circuit any network search.
    if title.lower() in KNOWN_MAPPINGS:
        url = KNOWN_MAPPINGS[title.lower()]
        logger.info(f"Found match in KNOWN_MAPPINGS: {url}")
        return {"url": url, "title": title, "source": "known_mapping"}

    # Strategy 1: exact title (includes the Medium ID if present).
    queries = [f"{title} {author} -site:medium.com"]

    # Strategy 2: clean title with the trailing Medium ID stripped.
    clean_title = _MEDIUM_ID_RE.sub('', title)
    if clean_title != title:
        queries.append(f"{clean_title} {author} -site:medium.com")

    # Strategy 3: GitHub-scoped search for tech-related articles.
    # FIX: the original compared keywords against the raw title, so capitalized
    # words ("Code", "Architecture", "Tutorial") were missed; compare lower-cased.
    lowered_title = title.lower()
    if any(kw in lowered_title for kw in _TECH_KEYWORDS):
        queries.append(f"{clean_title} site:github.com")

    providers = [
        {"name": "DuckDuckGo", "url": "https://html.duckduckgo.com/html/?q=", "selector": "a.result__a"},
        {"name": "Bing", "url": "https://www.bing.com/search?q=", "selector": "li.b_algo h2 a"},
    ]

    for provider in providers:
        for query in queries:
            logger.info(f"Searching {provider['name']} for: '{query}'")
            search_url = f"{provider['url']}{urllib.parse.quote(query)}"
            try:
                # Sleep between requests to avoid provider rate limits.
                await asyncio.sleep(2)
                async with httpx.AsyncClient() as client:
                    headers = {"User-Agent": Config.get_user_agent()}  # Rotate UA
                    resp = await client.get(search_url, headers=headers)

                if resp.status_code not in (200, 202):
                    logger.warning(f"{provider['name']} search failed with status {resp.status_code}")
                    continue

                soup = BeautifulSoup(resp.text, "html.parser")
                results = soup.select(provider["selector"])
                if not results:
                    logger.info(f"No results found on {provider['name']} for '{query}'")
                    continue

                for res in results:
                    raw_link = res.get("href")
                    # FIX: anchors without an href returned None and crashed the
                    # substring check below with a TypeError; skip them.
                    if not raw_link:
                        continue
                    text = res.get_text()
                    link = _resolve_link(raw_link)

                    # Accept only links on known free blog/code platforms.
                    if any(domain in link for domain in _FREE_DOMAINS):
                        logger.info(f"Found match on {provider['name']}: {link}")
                        return {
                            "url": link,
                            "title": text,
                            "source": "cross_source_verification",
                        }
                    logger.debug(f"Skipping {link}")

                logger.info(f"No valid matches found in {provider['name']} results.")
            except Exception as e:
                # Best-effort scraping: any provider failure (network, parse)
                # is logged and the next query/provider is tried.
                logger.warning(f"{provider['name']} Search Failed: {e}")

    return None