# NOTE(review): the original lines here read "Spaces:" / "Sleeping" / "Sleeping" —
# this appears to be scraped Hugging Face Spaces status text, not source code.
# Kept as a comment so the file stays valid Python; confirm and remove.
| import httpx | |
| from bs4 import BeautifulSoup | |
| from typing import Optional, Dict, Any | |
| import urllib.parse | |
| from src.config import Config | |
| import logging | |
| logger = logging.getLogger("CrossSource") | |
| import re | |
| import asyncio | |
async def find_free_version(title: str, author: str) -> Optional[Dict[str, Any]]:
    """
    Search for a free (non-paywalled) version of an article on Dev.to,
    Hashnode, GitHub, or Substack.

    Tries multiple query variations against multiple search providers
    (DuckDuckGo HTML, Bing) to maximize the chance of a hit.

    Args:
        title: Article title; may carry a trailing Medium hex ID slug.
        author: Author name, appended to queries to narrow results.

    Returns:
        A dict with keys ``url``, ``title`` and ``source`` on success,
        otherwise ``None``.
    """
    # 0. Manual overrides for known/common articles (verification shortcuts).
    KNOWN_MAPPINGS = {
        "visualizing google cloud architecture diagrams with plantuml 5e45253291d3": "https://github.com/gammastudios/GCP-C4-PlantUML"
    }
    lowered_title = title.lower()
    if lowered_title in KNOWN_MAPPINGS:
        url = KNOWN_MAPPINGS[lowered_title]
        logger.info(f"Found match in KNOWN_MAPPINGS: {url}")
        return {
            "url": url,
            "title": title,
            "source": "known_mapping"
        }

    # Strategy 1: exact slug (includes the Medium hex ID if present).
    queries = [f"{title} {author} -site:medium.com"]

    # Strategy 2: clean title (strip a trailing Medium hex ID, 10-16 hex chars).
    clean_title = re.sub(r'\s[a-f0-9]{10,16}$', '', title)
    if clean_title != title:
        queries.append(f"{clean_title} {author} -site:medium.com")

    # Strategy 3: GitHub-specific query for tech-looking titles.
    # BUGFIX: match keywords case-insensitively — the original compared against
    # the raw title, so capitalized words ("Architecture") never triggered this.
    if any(kw in lowered_title for kw in ("code", "architecture", "tutorial")):
        queries.append(f"{clean_title} site:github.com")

    providers = [
        {"name": "DuckDuckGo", "url": "https://html.duckduckgo.com/html/?q=", "selector": "a.result__a"},
        {"name": "Bing", "url": "https://www.bing.com/search?q=", "selector": "li.b_algo h2 a"},
    ]

    for provider in providers:
        for query in queries:
            logger.info(f"Searching {provider['name']} for: '{query}'")
            search_url = f"{provider['url']}{urllib.parse.quote(query)}"
            try:
                # Throttle between requests to avoid provider rate limits.
                await asyncio.sleep(2)
                async with httpx.AsyncClient() as client:
                    headers = {"User-Agent": Config.get_user_agent()}  # Rotate UA
                    resp = await client.get(search_url, headers=headers)
                    if resp.status_code not in (200, 202):
                        logger.warning(f"{provider['name']} search failed with status {resp.status_code}")
                        continue
                    soup = BeautifulSoup(resp.text, "html.parser")
                    results = soup.select(provider["selector"])
                    if not results:
                        logger.info(f"No results found on {provider['name']} for '{query}'")
                        continue
                    for res in results:
                        raw_link = res.get("href")
                        # BUGFIX: anchors can lack an href; the original then
                        # crashed on `"..." in raw_link` (TypeError on None),
                        # which the broad except turned into skipping the whole
                        # result page.
                        if not raw_link:
                            continue
                        text = res.get_text()
                        # Decode DuckDuckGo redirect links (/l/?uddg=<real url>).
                        link = raw_link
                        if "duckduckgo.com/l/" in raw_link:
                            parsed = urllib.parse.urlparse(raw_link)
                            query_params = urllib.parse.parse_qs(parsed.query)
                            if "uddg" in query_params:
                                link = query_params["uddg"][0]
                        # Ensure a protocol on scheme-relative links.
                        if link.startswith("//"):
                            link = "https:" + link
                        # Accept only links on known free blog platforms.
                        if any(domain in link for domain in ("dev.to", "hashnode.com", "github.com", "substack.com")):
                            logger.info(f"Found match on {provider['name']}: {link}")
                            return {
                                "url": link,
                                "title": text,
                                "source": "cross_source_verification"
                            }
                        logger.debug(f"Skipping {link}")
                    logger.info(f"No valid matches found in {provider['name']} results.")
            except Exception as e:
                # Best-effort: log and move on to the next provider/query combo.
                logger.warning(f"{provider['name']} Search Failed: {e}")
    return None