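"""Web search and scraping helpers built on crawl4ai's AsyncWebCrawler."""
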
import asyncio
import json
import logging
from typing import Any, Dict, List
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode


class CrawlForAIScraper:
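    """Scraper that searches the web for a query and crawls the result pages.

    Google is queried first, with DuckDuckGo as a fallback; each result page is
    crawled with crawl4ai and reduced to markdown text, large images, YouTube
    video URLs, and filtered external links.
    """
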
    def __init__(self) -> None:
        self.logger = logging.getLogger(__name__)
        self.session = requests.Session()
        self.base_browser = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1920,
            viewport_height=1080,
            accept_downloads=False,
            verbose=False,
        )
        self.crawler = AsyncWebCrawler(config=self.base_browser)
        self._is_started = False

    async def start(self):
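        """Start the underlying browser session once; repeated calls are no-ops."""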
        if not self._is_started:
            await self.crawler.start()
            await asyncio.sleep(1)  # non-blocking pause to let the browser finish starting
            self._is_started = True

    async def close(self):
        if self._is_started:
            await self.crawler.close()
            self._is_started = False

    async def search_and_scrape(self, query: str, num_sites: int = 10) -> List[Dict[str, Any]]:
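        """Search for `query` and scrape up to `num_sites` of the result pages.

        A few extra URLs are requested per batch so that pages that fail to
        scrape can be replaced by later search results.
        """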
        await self.start()
        self.logger.info(f"Querying: {query}")

        # Perform a search to get a list of webpages
        search_results = await self._search(query)

        # Scrape each webpage
        scraped_data = []
        self.logger.info(f"Scraping {num_sites} sites...")
        data = await self._scrape_pages(search_results[: num_sites + 2], num_sites)
        scraped_data.extend(data)

        # Scrape further search results when some pages failed
        next_idx = num_sites + 2
        for _ in range(3):
            if len(scraped_data) >= num_sites:
                break
            batch = search_results[next_idx : next_idx + (num_sites - len(scraped_data)) + 2]
            if not batch:
                break
            data = await self._scrape_pages(batch, num_sites - len(scraped_data))
            scraped_data.extend(data)
            next_idx += len(batch)

        self.logger.info(f"Completed scraping {len(scraped_data)} sites")
        return scraped_data

    async def _search(self, query: str) -> List[str]:
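        """Scrape a Google results page for `query` and return the result URLs.

        Falls back to DuckDuckGo when no Google results can be extracted; raises
        if both engines come back empty.
        """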
        try:
            encoded_query = quote_plus(query)
            search_uri = f"https://www.google.com/search?q={encoded_query}"

            result = await self.crawler.arun(
                url=search_uri,
                screenshot=False,
                cache_mode=CacheMode.BYPASS,
                delay_before_return_html=2,
                scan_full_page=True,
            )

            soup = BeautifulSoup(result.html, "html.parser")
            search_results = []

            for link in list(soup.select("div > span > a"))[2:]:
                url = (link.get("href") or "").replace(" ", "").replace("\n", "").strip()
                if not url:
                    continue
                if not url.startswith(("http://", "https://")):
                    url = "https://" + url
                if "support.google.com" in url or url.startswith("/search?q="):
                    continue
                search_results.append(url)

            # Fall back to DuckDuckGo (retrying up to 3 times) when Google returned nothing
            for _ in range(3):
                if not search_results:
                    self.logger.warning("No Google results found, falling back to DuckDuckGo...")
                    search_results = await self._duckduckgo_search(query)

            if not search_results:
                raise Exception("No results found")
            self.logger.info(f"Found {len(search_results)} results")
            return search_results

        except Exception as e:
            self.logger.error(f"Google search error: {str(e)}", exc_info=True)
            raise

    async def _duckduckgo_search(self, query: str) -> List[str]:
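        """Scrape the DuckDuckGo HTML endpoint and return result URLs (empty list on error)."""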
        self.logger.info("Performing DuckDuckGo search...")
        try:
            encoded_query = quote_plus(query)
            search_uri = f"https://html.duckduckgo.com/html/?q={encoded_query}"

            # response = self.session.get(
            #     search_uri,
            #     headers={
            #         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            #     },
            #     timeout=10,
            # )
            # response.raise_for_status()

            result = await self.crawler.arun(
                url=search_uri,
                screenshot=False,
                cache_mode=CacheMode.BYPASS,
                delay_before_return_html=2,
                scan_full_page=True,
            )

            soup = BeautifulSoup(result.html, "html.parser")
            search_results = []

            # DuckDuckGo search results are in anchors with class 'result__url'
            for anchor in soup.select(".result__url"):
                url = (anchor.get("href") or "").replace(" ", "").replace("\n", "").strip()
                if not url:
                    continue
                if not url.startswith(("http://", "https://")):
                    url = "https://" + url
                search_results.append(url)

            self.logger.info(f"Found {len(search_results)} URLs")
            return search_results

        except requests.exceptions.RequestException as e:  # Catch network errors specifically
            self.logger.error(f"DuckDuckGo search error: {str(e)}")
            return []
        except Exception as e:  # Catch any other errors
            self.logger.error(f"DuckDuckGo search error: {str(e)}")
            return []

    async def _scrape_pages(self, urls: List[str], max_sites: int) -> List[Dict[str, Any]]:
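        """Crawl `urls` concurrently and return extracted data for at most `max_sites` pages."""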
        await self.start()

        try:
            # Run the crawler on a URL
            results = await self.crawler.arun_many(
                urls=urls,
                screenshot=False,
                cache_mode=CacheMode.BYPASS,
                scan_full_page=True,
                semaphore_count=4,
                wait_for_images=True,
                scroll_delay=0.1,
                delay_before_return_html=2,
                exclude_external_images=True,
                page_timeout=25000,
            )
            scraped_sites = []
            for result in results:
                if result.success:
                    soup = BeautifulSoup(result.html, "html.parser")

                    # Combine images
                    extracted_images = self._extract_images(soup, result.url)
                    media_images = []
                    for img in result.media["images"]:
                        if img.get("width") is None or (isinstance(img["width"], (int, float)) and img["width"] > 300):
                            # A srcset-style value lists several candidates ("a.jpg 300w, b.jpg 600w");
                            # keep only the last (typically largest) candidate URL
                            src = img["src"]
                            if " " in src and "w," in src:
                                candidates = [c.strip() for c in src.split(",") if c.strip()]
                                if candidates:
                                    media_images.append(candidates[-1].split(" ")[0])
                            else:
                                media_images.append(src)
                    all_images = list(set(extracted_images + media_images))

                    # Combine videos
                    all_videos = self._extract_videos(soup)
                    media_videos = [v["src"] for v in result.media["videos"] if v["src"]]
                    all_videos = list(set(all_videos + media_videos))

                    data = {
                        "url": result.url,
                        "text": result.markdown,
                        "images": all_images,
                        "videos": all_videos,
                        "links": self._extract_links(result.links["external"]),
                    }
                    scraped_sites.append(data)
                    self.logger.info(f"  - {result.url[:80]}...")
            return scraped_sites[:max_sites]

        except Exception as e:
            self.logger.error(f"Scraping error while {urls}: {str(e)}")
            return {}

    def _extract_images(self, soup: BeautifulSoup, url: str) -> List[str]:
        # Extract images with width and height greater than 300 pixels
        images = []
        for img in soup.find_all("img"):
            if "src" in img.attrs:
                src = img["src"]
                if not "width" or "height" not in img.attrs:
                    continue
                if "width" in img.attrs and img.get("width").lower() == "auto":
                    images.append((src, 999, 0))
                # Remove units from width and height: get start of the entity till the first non-digit character
                width = "".join([i for i in img.get("width", "0") if i.isdigit() or i == "."])
                height = "".join([i for i in img.get("height", "0") if i.isdigit() or i == "."])
                if width == "" or height == "":
                    continue
                width, height = float(width), float(height)
                if width > 300 and height > 300 and "pixel" not in src and "icon" not in src:
                    images.append((src, width, height))
        images = sorted(images, key=lambda img: -1 * (img[1] * img[2]))
        images = [img[0] for img in images]

        # Resolve relative image URLs against the page URL
        images = [img if img.startswith("http") else urljoin(url, img) for img in images]
        return images

    def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
        # Extract YouTube video URLs from iframes, video tags, and plain links
        videos = []
        nodes = list(soup.find_all("iframe")) + list(soup.find_all("video")) + list(soup.find_all("a"))
        for node in nodes:
            src = node.get("src", "")
            href = node.get("href", "")
            # Skip account/login redirects, blob URLs, and YouTube redirect links
            if any(keyword in src or keyword in href for keyword in ["accounts.google.com", "blob:", "youtube.com/redirect"]):
                continue
            if "www.youtube.com/watch?v" in src:
                videos.append(src)
            elif "www.youtube.com/watch?v" in href:
                videos.append(href)
        return videos

    def _extract_links(self, links: list) -> List[str]:
        # Filter out unwanted links
        filtered_links = []
        for link in links:
            url = link.get("href") or ""
            if url.startswith(("http://", "https://")) and not any(
                keyword in url
                for keyword in ["support.google.com", "google.com", "accounts.google.com", "youtube.com", "blob:", "mailto:", "javascript:"]
            ):
                filtered_links.append(url)
        return filtered_links


if __name__ == "__main__":
    # Testing the scraper
    import sys

    urls = [
        "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
        "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
        "https://github.com/SesameAILabs/csm",
        "https://docs.anthropic.com/en/docs/agents-and-tools/claude-code/overview",
        "https://docs.crawl4ai.com/advanced/multi-url-crawling/",
        "https://github.com/SesameAILabs/csm",
    ]
    if len(sys.argv) > 1:
        urls = sys.argv[1:]

    async def main():
        scraper = CrawlForAIScraper()
        await scraper.start()
        try:
            data = await scraper.search_and_scrape("blender.org")
        finally:
            # Always release the browser, even if scraping fails
            await scraper.close()
        with open("output.log.json", "w") as f:
            f.write(json.dumps(data, indent=2))
        print(json.dumps(data, indent=2))

    asyncio.run(main())