Soham Waghmare committed · ceae8b0
Parent: 15b6036

feat: image extraction logic

Files changed: backend/scraper.py (+27 −21)
backend/scraper.py
CHANGED
@@ -1,10 +1,13 @@
-
+import asyncio
+import json
 import logging
-from typing import
+from typing import Any, Dict, List
+from urllib.parse import quote_plus
+from bs4 import BeautifulSoup
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
 import newspaper
 from newspaper import Article
 import requests
-from urllib.parse import quote_plus
 
 
 class WebScraper:
@@ -150,9 +153,7 @@ class WebScraper:
         return merged
 
 
-
-from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig
-import json
+
 class CrawlForAIScraper:
     def __init__(self) -> None:
         self.logger = logging.getLogger(__name__)
@@ -199,6 +200,9 @@ class CrawlForAIScraper:
         self.logger.info(f"Completed scraping {len(scraped_data)} sites")
         return scraped_data
 
+    async def _google_search(self, query: str, num_results: int) -> List[str]:
+        pass
+
     async def _scrape_page(self, url: str) -> Dict[str, Any]:
         if not self._is_started:
             await self.start()
@@ -206,27 +210,15 @@
         try:
             # Run the crawler on a URL
             result = await self.crawler.arun(url=url, screenshot=False, cache_mode=CacheMode.BYPASS)
+            soup = BeautifulSoup(result.html, "html.parser")
             data = {
                 "url": url,
                 "text": result.markdown,
-                "images":
+                "images": self._extract_images(soup),
                 "videos": result.media["videos"],
                 "links": result.links,
             }
 
-            # if not data["text"]:
-            #     response = self.session.get(url, timeout=self.timeout)
-            #     soup = BeautifulSoup(response.text, "html.parser")
-            #     selenium_data = {
-            #         "url": url,
-            #         "title": soup.title.string if soup.title else "",
-            #         "text": self._extract_text(soup),
-            #         "images": self._extract_images(soup),
-            #         "videos": self._extract_videos(soup),
-            #         "links": self._extract_links(soup),
-            #     }
-            #     return self._merge_extraction_results(data, selenium_data)
-
             return data
 
         except Exception as e:
@@ -234,7 +226,21 @@
             raise e
         return {}
 
-
+    def _extract_text(self, soup: BeautifulSoup) -> str:
+        pass
+
+    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
+        images = [img['src'] for img in soup.find_all('img') if 'src' in img.attrs and int(img.get('width', 0)) > 300 and int(img.get('height', 0)) > 300 and 'pixel' not in img['src'] and 'icon' not in img['src']]
+        images = sorted(images, key=lambda src: -1 * (int(soup.find('img', {'src': src}).get('width', 0)) * int(soup.find('img', {'src': src}).get('height', 0))))
+        return images
+
+    def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
+        pass
+
+    def _extract_links(self, soup: BeautifulSoup) -> List[str]:
+        pass
+
+    def _merge_extraction_results(self, news_data: Dict, selenium_data: Dict) -> Dict[str, Any]:
         pass
 
 
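_google_search lands as a stub here, while quote_plus moves up into the module-level imports, which hints that it will assemble a search-results URL. A minimal sketch of that step, assuming a plain Google query URL; build_search_url and the URL shape are illustrative, not taken from the repo:

from urllib.parse import quote_plus

def build_search_url(query: str, num_results: int) -> str:
    # quote_plus encodes spaces as "+", the usual form for a q= query parameter.
    return f"https://www.google.com/search?q={quote_plus(query)}&num={num_results}"

Fetching and parsing that page is the harder half; Google throttles anonymous scrapers, so a search API is the more dependable route for the eventual implementation.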
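The new _extract_images is the heart of the commit, and two details are worth flagging: int(img.get('width', 0)) raises ValueError whenever width or height is non-numeric (e.g. "100%" or "640px"), and the sort key calls soup.find('img', {'src': src}) per element, which rescans the document each time and, for duplicate src values, always measures the first match. A sketch of a more defensive single-pass variant; the _px helper is ours for illustration, not part of the repo:

from typing import List, Tuple
from bs4 import BeautifulSoup

def _px(tag, attr: str) -> int:
    # Hypothetical helper: parse a width/height attribute defensively,
    # returning 0 for missing or non-numeric values such as "100%" or "auto".
    raw = str(tag.get(attr, "")).strip().removesuffix("px")
    return int(raw) if raw.isdigit() else 0

def extract_images(soup: BeautifulSoup, min_side: int = 300) -> List[str]:
    # Collect (area, src) pairs in one pass so sorting never re-queries the soup.
    sized: List[Tuple[int, str]] = []
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src or "pixel" in src or "icon" in src:
            continue  # drop tracking pixels and icons, as the committed filter does
        w, h = _px(img, "width"), _px(img, "height")
        if w > min_side and h > min_side:
            sized.append((w * h, src))
    # Largest area first, matching the commit's ordering intent.
    return [src for _, src in sorted(sized, key=lambda p: p[0], reverse=True)]

Note that the size filter also silently drops images declaring no width/height attributes at all; that is true of the committed version too, so a CSS-sized hero image would be skipped either way.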
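The commented-out fallback this commit deletes shows where _merge_extraction_results was headed: when crawl4ai returns empty text, re-fetch with requests, extract with BeautifulSoup, and merge the two dicts. The method itself remains a pass stub; one plausible reading prefers the crawler's fields and fills gaps from the fallback. A sketch under that assumption, not the author's implementation:

from typing import Any, Dict

def merge_extraction_results(news_data: Dict, selenium_data: Dict) -> Dict[str, Any]:
    # Start from the fallback data, then let any non-empty crawler field win.
    merged: Dict[str, Any] = dict(selenium_data)
    for key, value in news_data.items():
        if value:
            merged[key] = value
    return merged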
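Finally, for orientation, a minimal driver for the changed _scrape_page. It assumes CrawlForAIScraper.start(), which is outside this diff, creates the AsyncWebCrawler and sets _is_started; _scrape_page calls start() lazily when needed:

import asyncio

from backend.scraper import CrawlForAIScraper

async def main() -> None:
    scraper = CrawlForAIScraper()
    # start() is invoked lazily inside _scrape_page when _is_started is False.
    data = await scraper._scrape_page("https://example.com/article")
    print(data["url"], len(data["images"]), "images")

asyncio.run(main())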