Soham Waghmare committed
Commit ceae8b0 · 1 Parent(s): 15b6036

feat: image extraction logic

Files changed (1): backend/scraper.py (+27 -21)
backend/scraper.py CHANGED
@@ -1,10 +1,13 @@
-from bs4 import BeautifulSoup
+import asyncio
+import json
 import logging
-from typing import List, Dict, Any
+from typing import Any, Dict, List
+from urllib.parse import quote_plus
+from bs4 import BeautifulSoup
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode
 import newspaper
 from newspaper import Article
 import requests
-from urllib.parse import quote_plus
 
 
 class WebScraper:
@@ -150,9 +153,7 @@ class WebScraper:
         return merged
 
 
-import asyncio
-from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig
-import json
+
 class CrawlForAIScraper:
     def __init__(self) -> None:
         self.logger = logging.getLogger(__name__)
@@ -199,6 +200,9 @@ class CrawlForAIScraper:
         self.logger.info(f"Completed scraping {len(scraped_data)} sites")
         return scraped_data
 
+    async def _google_search(self, query: str, num_results: int) -> List[str]:
+        pass
+
     async def _scrape_page(self, url: str) -> Dict[str, Any]:
         if not self._is_started:
             await self.start()
@@ -206,27 +210,15 @@ class CrawlForAIScraper:
         try:
             # Run the crawler on a URL
             result = await self.crawler.arun(url=url, screenshot=False, cache_mode=CacheMode.BYPASS)
+            soup = BeautifulSoup(result.html, "html.parser")
             data = {
                 "url": url,
                 "text": result.markdown,
-                "images": result.media["images"],
+                "images": self._extract_images(soup),
                 "videos": result.media["videos"],
                 "links": result.links,
             }
 
-            # if not data["text"]:
-            #     response = self.session.get(url, timeout=self.timeout)
-            #     soup = BeautifulSoup(response.text, "html.parser")
-            #     selenium_data = {
-            #         "url": url,
-            #         "title": soup.title.string if soup.title else "",
-            #         "text": self._extract_text(soup),
-            #         "images": self._extract_images(soup),
-            #         "videos": self._extract_videos(soup),
-            #         "links": self._extract_links(soup),
-            #     }
-            #     return self._merge_extraction_results(data, selenium_data)
-
             return data
 
         except Exception as e:
@@ -234,7 +226,21 @@ class CrawlForAIScraper:
             raise e
         return {}
 
-    async def _google_search(self, query: str, num_results: int) -> List[str]:
+    def _extract_text(self, soup: BeautifulSoup) -> str:
+        pass
+
+    def _extract_images(self, soup: BeautifulSoup) -> List[str]:
+        images = [img['src'] for img in soup.find_all('img') if 'src' in img.attrs and int(img.get('width', 0)) > 300 and int(img.get('height', 0)) > 300 and 'pixel' not in img['src'] and 'icon' not in img['src']]
+        images = sorted(images, key=lambda src: -1 * (int(soup.find('img', {'src': src}).get('width', 0)) * int(soup.find('img', {'src': src}).get('height', 0))))
+        return images
+
+    def _extract_videos(self, soup: BeautifulSoup) -> List[str]:
+        pass
+
+    def _extract_links(self, soup: BeautifulSoup) -> List[str]:
+        pass
+
+    def _merge_extraction_results(self, news_data: Dict, selenium_data: Dict) -> Dict[str, Any]:
         pass
 
 
 
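The heart of the commit is _extract_images: it keeps only <img> tags whose declared width and height both exceed 300 and whose src contains neither "pixel" nor "icon", then ranks the survivors by declared area, largest first. As committed it calls int() directly on the attributes, so a value such as width="100%" raises ValueError, and the sort key re-finds every tag by src, which is quadratic and can pick the wrong tag when two images share a src. A defensive sketch of the same filter-and-rank idea (the _parse_px helper and min_side parameter are illustrative, not part of the commit):

from typing import List

from bs4 import BeautifulSoup


def _parse_px(value) -> int:
    # Tolerate None, "300", "300px", or "100%"; anything non-numeric counts as 0.
    try:
        return int(str(value).rstrip("px"))
    except ValueError:
        return 0


def extract_images(soup: BeautifulSoup, min_side: int = 300) -> List[str]:
    # Same filter as the commit: drop tracking pixels and icons, require both
    # declared dimensions above min_side, then rank by declared area.
    candidates = []
    for img in soup.find_all("img"):
        src = img.get("src", "")
        if not src or "pixel" in src or "icon" in src:
            continue
        width, height = _parse_px(img.get("width")), _parse_px(img.get("height"))
        if width > min_side and height > min_side:
            candidates.append((width * height, src))
    # One sort on the precomputed area instead of re-finding each tag per comparison.
    return [src for _, src in sorted(candidates, reverse=True)]

Precomputing the area keeps the ranking a single O(n log n) sort and makes duplicate src values harmless.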
 
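The other substantive change is in _scrape_page: rather than trusting result.media["images"], the commit parses the raw HTML returned by crawl4ai and runs its own extractor over it. A minimal standalone sketch of that call pattern, using only names that appear in the diff (the URL is a placeholder):

import asyncio

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler, CacheMode


async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        # Same arguments as the committed call: no screenshot, cache bypassed.
        result = await crawler.arun(url="https://example.com", screenshot=False, cache_mode=CacheMode.BYPASS)
        # result.markdown stays the text source; result.html feeds BeautifulSoup.
        soup = BeautifulSoup(result.html, "html.parser")
        print(len(soup.find_all("img")), "img tags found")


asyncio.run(main())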
 
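_extract_videos and _extract_links land as stubs. If they are meant to mirror the image extractor over the same soup, plausible fill-ins look like this; the base_url parameter and the urljoin resolution are assumptions, since the commit fixes only the signatures:

from typing import List
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def extract_videos(soup: BeautifulSoup) -> List[str]:
    # A <video> may carry src directly or via nested <source> children.
    sources = []
    for video in soup.find_all("video"):
        if video.get("src"):
            sources.append(video["src"])
        sources.extend(s["src"] for s in video.find_all("source") if s.get("src"))
    return sources


def extract_links(soup: BeautifulSoup, base_url: str = "") -> List[str]:
    # Resolve relative hrefs and de-duplicate while preserving document order.
    seen: set = set()
    links: List[str] = []
    for anchor in soup.find_all("a", href=True):
        url = urljoin(base_url, anchor["href"])
        if url not in seen:
            seen.add(url)
            links.append(url)
    return links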
 
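The deleted commented-out block records the intended fallback: when crawl4ai returns no text, re-fetch with requests, extract with BeautifulSoup, and merge the two results. _merge_extraction_results stays a stub, but its parameter names and the old code suggest a field-wise merge in which the primary extraction wins unless a field is empty; that reading is sketched here as an assumption:

from typing import Any, Dict


def merge_extraction_results(news_data: Dict[str, Any], selenium_data: Dict[str, Any]) -> Dict[str, Any]:
    # Start from the fallback extraction, then let every non-empty
    # primary field (text, images, videos, links, ...) override it.
    merged = dict(selenium_data)
    for key, value in news_data.items():
        if value:
            merged[key] = value
    return merged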
 
246