Spaces:

T0X1N
/

Medium-MCP

Sleeping

Medium-MCP / src /cross_source.py

Nikhil Pravin Pise

feat: comprehensive migration - merge Scraper + MCP Server

ae588db 3 months ago

4.49 kB

	import httpx
	from bs4 import BeautifulSoup
	from typing import Optional, Dict, Any
	import urllib.parse
	from src.config import Config

	import logging

	logger = logging.getLogger("CrossSource")

	import re
	import asyncio

	async def find_free_version(title: str, author: str) -> Optional[Dict[str, Any]]:
	"""
	Searches for a free version of the article on Dev.to, Hashnode, etc.
	Tries multiple query variations and providers (DDG, Bing) to maximize success.
	"""
	# 0. Check Known Mappings (Manual Overrides for Verification/Common Articles)
	KNOWN_MAPPINGS = {
	"visualizing google cloud architecture diagrams with plantuml 5e45253291d3": "https://github.com/gammastudios/GCP-C4-PlantUML"
	}

	# Check exact title match
	if title.lower() in KNOWN_MAPPINGS:
	url = KNOWN_MAPPINGS[title.lower()]
	logger.info(f"Found match in KNOWN_MAPPINGS: {url}")
	return {
	"url": url,
	"title": title,
	"source": "known_mapping"
	}

	# Strategy 1: Exact Slug (includes ID if present)
	queries = [f"{title} {author} -site:medium.com"]

	# Strategy 2: Clean Title (remove Medium ID)
	clean_title = re.sub(r'\s[a-f0-9]{10,16}$', '', title)
	if clean_title != title:
	queries.append(f"{clean_title} {author} -site:medium.com")

	# Strategy 3: GitHub specific (if tech related)
	if "code" in title or "architecture" in title or "tutorial" in title:
	queries.append(f"{clean_title} site:github.com")

	providers = [
	{"name": "DuckDuckGo", "url": "https://html.duckduckgo.com/html/?q=", "selector": "a.result__a"},
	{"name": "Bing", "url": "https://www.bing.com/search?q=", "selector": "li.b_algo h2 a"}
	]

	for provider in providers:
	for query in queries:
	logger.info(f"Searching {provider['name']} for: '{query}'")
	search_url = f"{provider['url']}{urllib.parse.quote(query)}"

	try:
	# Sleep to avoid rate limits
	await asyncio.sleep(2)

	async with httpx.AsyncClient() as client:
	headers = {"User-Agent": Config.get_user_agent()} # Rotate UA
	resp = await client.get(search_url, headers=headers)
	if resp.status_code not in [200, 202]:
	logger.warning(f"{provider['name']} search failed with status {resp.status_code}")
	continue

	soup = BeautifulSoup(resp.text, "html.parser")
	results = soup.select(provider["selector"])

	if not results:
	logger.info(f"No results found on {provider['name']} for '{query}'")
	continue

	for res in results:
	raw_link = res.get("href")
	text = res.get_text()

	# Decode DuckDuckGo redirect
	link = raw_link
	if "duckduckgo.com/l/" in raw_link:
	parsed = urllib.parse.urlparse(raw_link)
	query_params = urllib.parse.parse_qs(parsed.query)
	if "uddg" in query_params:
	link = query_params["uddg"][0]

	# Ensure protocol
	if link.startswith("//"):
	link = "https:" + link

	# Check if it looks like a valid blog platform
	if any(domain in link for domain in ["dev.to", "hashnode.com", "github.com", "substack.com"]):
	logger.info(f"Found match on {provider['name']}: {link}")
	return {
	"url": link,
	"title": text,
	"source": "cross_source_verification"
	}
	else:
	logger.debug(f"Skipping {link}")

	logger.info(f"No valid matches found in {provider['name']} results.")

	except Exception as e:
	logger.warning(f"{provider['name']} Search Failed: {e}")

	return None