File size: 4,488 Bytes
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import httpx
from bs4 import BeautifulSoup
from typing import Optional, Dict, Any
import urllib.parse
from src.config import Config

import logging

logger = logging.getLogger("CrossSource")

import re
import asyncio

async def find_free_version(title: str, author: str) -> Optional[Dict[str, Any]]:
    """
    Search for a free (non-paywalled) version of an article on alternative
    platforms (Dev.to, Hashnode, GitHub, Substack).

    Tries multiple query variations against multiple search providers
    (DuckDuckGo HTML, Bing) to maximize the chance of a hit.

    Args:
        title: Article title, possibly ending with a Medium hex-ID slug.
        author: Author name, used to narrow the search queries.

    Returns:
        A dict with keys ``url``, ``title`` and ``source`` on success,
        or ``None`` when no candidate link is found.
    """
    # 0. Check Known Mappings (Manual Overrides for Verification/Common Articles)
    KNOWN_MAPPINGS = {
        "visualizing google cloud architecture diagrams with plantuml 5e45253291d3": "https://github.com/gammastudios/GCP-C4-PlantUML"
    }

    # Check exact title match (short-circuits before any network traffic)
    if title.lower() in KNOWN_MAPPINGS:
        url = KNOWN_MAPPINGS[title.lower()]
        logger.info(f"Found match in KNOWN_MAPPINGS: {url}")
        return {
            "url": url,
            "title": title,
            "source": "known_mapping"
        }

    # Strategy 1: Exact Slug (includes ID if present)
    queries = [f"{title} {author} -site:medium.com"]

    # Strategy 2: Clean Title (remove trailing Medium hex ID, e.g. "...5e45253291d3")
    clean_title = re.sub(r'\s[a-f0-9]{10,16}$', '', title)
    if clean_title != title:
        queries.append(f"{clean_title} {author} -site:medium.com")

    # Strategy 3: GitHub specific (if tech related).
    # FIX: compare case-insensitively so "Architecture Patterns" etc. also match.
    lowered_title = title.lower()
    if any(kw in lowered_title for kw in ("code", "architecture", "tutorial")):
        queries.append(f"{clean_title} site:github.com")

    providers = [
        {"name": "DuckDuckGo", "url": "https://html.duckduckgo.com/html/?q=", "selector": "a.result__a"},
        {"name": "Bing", "url": "https://www.bing.com/search?q=", "selector": "li.b_algo h2 a"}
    ]

    for provider in providers:
        for query in queries:
            logger.info(f"Searching {provider['name']} for: '{query}'")
            search_url = f"{provider['url']}{urllib.parse.quote(query)}"

            try:
                # Sleep to avoid rate limits
                await asyncio.sleep(2)

                # FIX: explicit timeout so a stalled provider cannot hang the
                # coroutine indefinitely (httpx default pool behavior aside).
                async with httpx.AsyncClient(timeout=15.0) as client:
                    headers = {"User-Agent": Config.get_user_agent()} # Rotate UA
                    resp = await client.get(search_url, headers=headers)
                    if resp.status_code not in [200, 202]:
                        logger.warning(f"{provider['name']} search failed with status {resp.status_code}")
                        continue

                    soup = BeautifulSoup(resp.text, "html.parser")
                    results = soup.select(provider["selector"])

                    if not results:
                        logger.info(f"No results found on {provider['name']} for '{query}'")
                        continue

                    for res in results:
                        raw_link = res.get("href")
                        # FIX: anchors without an href previously raised a
                        # TypeError on the substring check below, and the broad
                        # except then discarded the rest of this result page.
                        if not raw_link:
                            continue
                        text = res.get_text()

                        # Decode DuckDuckGo redirect (actual target is in the
                        # "uddg" query parameter of the /l/ redirect URL)
                        link = raw_link
                        if "duckduckgo.com/l/" in raw_link:
                            parsed = urllib.parse.urlparse(raw_link)
                            query_params = urllib.parse.parse_qs(parsed.query)
                            if "uddg" in query_params:
                                link = query_params["uddg"][0]

                        # Ensure protocol (some results are protocol-relative)
                        if link.startswith("//"):
                            link = "https:" + link

                        # Check if it looks like a valid blog platform
                        if any(domain in link for domain in ["dev.to", "hashnode.com", "github.com", "substack.com"]):
                            logger.info(f"Found match on {provider['name']}: {link}")
                            return {
                                "url": link,
                                "title": text,
                                "source": "cross_source_verification"
                            }
                        else:
                            logger.debug(f"Skipping {link}")

                    logger.info(f"No valid matches found in {provider['name']} results.")

            except Exception as e:
                # Best-effort: log and fall through to the next query/provider.
                logger.warning(f"{provider['name']} Search Failed: {e}")

    return None