itsOwen commited on
Commit
dfc44b9
Β·
1 Parent(s): 74036ee

stealth mode fix + captcha bypass added

Browse files
README.md CHANGED
@@ -37,6 +37,7 @@ Whether you're a corpo data analyst, a street-smart netrunner, or just someone l
37
  - πŸ›‘οΈ **Ethical Scraping**: Respects robots.txt and site policies. We may be in 2077, but we still have standards.
38
  - πŸ“„ **Caching**: We implemented content-based and query-based caching using LRU cache and a custom dictionary to reduce redundant API calls.
39
  - βœ… **Upload to Google Sheets**: Now you can easily upload your extracted CSV data to Google Sheets with one click.
 
40
  - 🌐 **Proxy Mode (Coming Soon)**: Built-in proxy support to keep you ghosting through the net.
41
  - πŸ›‘οΈ **Navigate through the Pages (BETA)**: Navigate through the webpage and scrap the data from different pages.
42
 
@@ -78,12 +79,9 @@ Please follow the Docker Container Guide given below, As I won't be able to main
78
 
79
  Linux/Mac:
80
  ```bash
81
- export OPENAI_API_KEY='your-api-key-here'
82
- ```
83
- For Windows:
84
- ```bash
85
- set OPENAI_API_KEY=your-api-key-here
86
  ```
 
87
  6. If you want to use the Ollama:
88
 
89
  Note: I only recommend using the OpenAI API, as GPT4o-mini is really good at following instructions. If you are using open-source LLMs, make sure you have a good system, as the speed of the data generation/presentation depends on how well your system can run the LLM; you may also have to fine-tune the prompt and add some additional filters yourself.
@@ -118,7 +116,7 @@ If you prefer to use Docker, follow these steps to set up and run CyberScraper 2
118
  ```
119
  - With OpenAI API key:
120
  ```bash
121
- docker run -p 8501:8501 -e OPENAI_API_KEY='your-actual-api-key' cyberscraper-2077
122
  ```
123
 
124
  5. Open your browser and navigate to `http://localhost:8501`.
@@ -272,6 +270,8 @@ bypass_cloudflare: bool = True:
272
 
273
  Adjust these settings based on your target website and environment for optimal results.
274
 
 
 
275
  ## 🀝 Contributing
276
 
277
  We welcome all cyberpunks, netrunners, and code samurais to contribute to CyberScraper 2077!
 
37
  - πŸ›‘οΈ **Ethical Scraping**: Respects robots.txt and site policies. We may be in 2077, but we still have standards.
38
  - πŸ“„ **Caching**: We implemented content-based and query-based caching using LRU cache and a custom dictionary to reduce redundant API calls.
39
  - βœ… **Upload to Google Sheets**: Now you can easily upload your extracted CSV data to Google Sheets with one click.
40
 + - πŸ€– **Bypass Captcha**: Bypass captchas by adding `-captcha` at the end of the URL.
41
  - 🌐 **Proxy Mode (Coming Soon)**: Built-in proxy support to keep you ghosting through the net.
42
  - πŸ›‘οΈ **Navigate through the Pages (BETA)**: Navigate through the webpage and scrap the data from different pages.
43
 
 
79
 
80
  Linux/Mac:
81
  ```bash
82
+ export OPENAI_API_KEY="your-api-key-here"
 
 
 
 
83
  ```
84
+
85
  6. If you want to use the Ollama:
86
 
87
  Note: I only recommend using the OpenAI API, as GPT4o-mini is really good at following instructions. If you are using open-source LLMs, make sure you have a good system, as the speed of the data generation/presentation depends on how well your system can run the LLM; you may also have to fine-tune the prompt and add some additional filters yourself.
 
116
  ```
117
  - With OpenAI API key:
118
  ```bash
119
+ docker run -p 8501:8501 -e OPENAI_API_KEY="your-actual-api-key" cyberscraper-2077
120
  ```
121
 
122
  5. Open your browser and navigate to `http://localhost:8501`.
 
270
 
271
  Adjust these settings based on your target website and environment for optimal results.
272
 
273
 + You can also bypass the captcha using the ```-captcha``` parameter at the end of the URL. A browser window will pop up; complete the captcha, go back to your terminal window, and press Enter — the bot will then complete its task.
274
+
275
  ## 🀝 Contributing
276
 
277
  We welcome all cyberpunks, netrunners, and code samurais to contribute to CyberScraper 2077!
src/scrapers/playwright_scraper.py CHANGED
@@ -1,5 +1,4 @@
1
  from playwright.async_api import async_playwright, Browser, BrowserContext, Page
2
- from playwright_stealth import stealth_async
3
  from .base_scraper import BaseScraper
4
  from typing import Dict, Any, Optional, List, Tuple
5
  import asyncio
@@ -35,17 +34,19 @@ class PlaywrightScraper(BaseScraper):
35
  self.logger.setLevel(logging.DEBUG if config.debug else logging.INFO)
36
  self.config = config
37
 
38
- async def fetch_content(self, url: str, proxy: Optional[str] = None, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
39
  async with async_playwright() as p:
40
- browser = await self.launch_browser(p, proxy)
41
  context = await self.create_context(browser, proxy)
42
  page = await context.new_page()
43
 
44
  if self.config.use_stealth:
45
- await stealth_async(page)
46
  await self.set_browser_features(page)
47
 
48
  try:
 
 
49
  contents = await self.scrape_multiple_pages(page, url, pages, url_pattern)
50
  except Exception as e:
51
  self.logger.error(f"Error during scraping: {str(e)}")
@@ -58,6 +59,72 @@ class PlaywrightScraper(BaseScraper):
58
 
59
  return contents
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  async def scrape_multiple_pages(self, page: Page, base_url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
62
  contents = []
63
 
@@ -65,6 +132,7 @@ class PlaywrightScraper(BaseScraper):
65
  url_pattern = self.detect_url_pattern(base_url)
66
 
67
  if not url_pattern and not pages:
 
68
  self.logger.info(f"Scraping single page: {base_url}")
69
  content = await self.navigate_and_get_content(page, base_url)
70
  if self.config.bypass_cloudflare and "Cloudflare" in content and "ray ID" in content.lower():
@@ -72,6 +140,7 @@ class PlaywrightScraper(BaseScraper):
72
  content = await self.bypass_cloudflare(page, base_url)
73
  contents.append(content)
74
  else:
 
75
  page_numbers = self.parse_page_numbers(pages) if pages else [1]
76
  for page_num in page_numbers:
77
  current_url = self.apply_url_pattern(base_url, url_pattern, page_num) if url_pattern else base_url
@@ -88,43 +157,6 @@ class PlaywrightScraper(BaseScraper):
88
 
89
  return contents
90
 
91
- async def launch_browser(self, playwright, proxy: Optional[str] = None) -> Browser:
92
- return await playwright.chromium.launch(
93
- headless=self.config.headless,
94
- args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars',
95
- '--window-position=0,0', '--ignore-certifcate-errors',
96
- '--ignore-certifcate-errors-spki-list'],
97
- proxy={'server': proxy} if proxy else None
98
- )
99
-
100
- async def create_context(self, browser: Browser, proxy: Optional[str] = None) -> BrowserContext:
101
- return await browser.new_context(
102
- viewport={'width': 1920, 'height': 1080},
103
- user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
104
- proxy={'server': proxy} if proxy else None,
105
- java_script_enabled=True,
106
- ignore_https_errors=True
107
- )
108
-
109
- async def set_browser_features(self, page: Page):
110
- if self.config.use_custom_headers:
111
- await page.set_extra_http_headers({
112
- 'Accept-Language': 'en-US,en;q=0.9',
113
- 'Accept-Encoding': 'gzip, deflate, br',
114
- 'Referer': 'https://www.google.com/',
115
- 'Sec-Fetch-Dest': 'document',
116
- 'Sec-Fetch-Mode': 'navigate',
117
- 'Sec-Fetch-Site': 'none',
118
- 'Sec-Fetch-User': '?1',
119
- 'Upgrade-Insecure-Requests': '1'
120
- })
121
- if self.config.hide_webdriver:
122
- await page.evaluate('''
123
- Object.defineProperty(navigator, 'webdriver', {
124
- get: () => undefined
125
- })
126
- ''')
127
-
128
  async def navigate_and_get_content(self, page: Page, url: str) -> str:
129
  await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)
130
  if self.config.simulate_human:
@@ -189,12 +221,12 @@ class PlaywrightScraper(BaseScraper):
189
 
190
  def apply_url_pattern(self, base_url: str, pattern: str, page_num: int) -> str:
191
  parsed_url = urlparse(base_url)
192
- if '=' in pattern:
193
  query = parse_qs(parsed_url.query)
194
  param, value = pattern.split('=')
195
  query[param] = [value.format(**{param: page_num})]
196
  return urlunparse(parsed_url._replace(query=urlencode(query, doseq=True)))
197
- elif '{page}' in pattern:
198
  return urlunparse(parsed_url._replace(path=pattern.format(page=page_num)))
199
  else:
200
  return base_url
 
1
  from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 
2
  from .base_scraper import BaseScraper
3
  from typing import Dict, Any, Optional, List, Tuple
4
  import asyncio
 
34
  self.logger.setLevel(logging.DEBUG if config.debug else logging.INFO)
35
  self.config = config
36
 
37
+ async def fetch_content(self, url: str, proxy: Optional[str] = None, pages: Optional[str] = None, url_pattern: Optional[str] = None, handle_captcha: bool = False) -> List[str]:
38
  async with async_playwright() as p:
39
+ browser = await self.launch_browser(p, proxy, handle_captcha)
40
  context = await self.create_context(browser, proxy)
41
  page = await context.new_page()
42
 
43
  if self.config.use_stealth:
44
+ await self.apply_stealth_settings(page)
45
  await self.set_browser_features(page)
46
 
47
  try:
48
+ if handle_captcha:
49
+ await self.handle_captcha(page, url)
50
  contents = await self.scrape_multiple_pages(page, url, pages, url_pattern)
51
  except Exception as e:
52
  self.logger.error(f"Error during scraping: {str(e)}")
 
59
 
60
  return contents
61
 
62
+ async def launch_browser(self, playwright, proxy: Optional[str] = None, handle_captcha: bool = False) -> Browser:
63
+ return await playwright.chromium.launch(
64
+ headless=self.config.headless and not handle_captcha,
65
+ args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars',
66
+ '--window-position=0,0', '--ignore-certifcate-errors',
67
+ '--ignore-certifcate-errors-spki-list'],
68
+ proxy={'server': proxy} if proxy else None
69
+ )
70
+
71
+ async def create_context(self, browser: Browser, proxy: Optional[str] = None) -> BrowserContext:
72
+ return await browser.new_context(
73
+ viewport={'width': 1920, 'height': 1080},
74
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
75
+ proxy={'server': proxy} if proxy else None,
76
+ java_script_enabled=True,
77
+ ignore_https_errors=True
78
+ )
79
+
80
+ async def apply_stealth_settings(self, page: Page):
81
+ await page.evaluate('''
82
+ () => {
83
+ Object.defineProperty(navigator, 'webdriver', {
84
+ get: () => undefined
85
+ });
86
+
87
+ Object.defineProperty(navigator, 'languages', {
88
+ get: () => ['en-US', 'en']
89
+ });
90
+
91
+ Object.defineProperty(navigator, 'plugins', {
92
+ get: () => [1, 2, 3, 4, 5]
93
+ });
94
+
95
+ const originalQuery = window.navigator.permissions.query;
96
+ window.navigator.permissions.query = (parameters) => (
97
+ parameters.name === 'notifications' ?
98
+ Promise.resolve({ state: Notification.permission }) :
99
+ originalQuery(parameters)
100
+ );
101
+ }
102
+ ''')
103
+
104
+ async def set_browser_features(self, page: Page):
105
+ if self.config.use_custom_headers:
106
+ await page.set_extra_http_headers({
107
+ 'Accept-Language': 'en-US,en;q=0.9',
108
+ 'Accept-Encoding': 'gzip, deflate, br',
109
+ 'Referer': 'https://www.google.com/',
110
+ 'Sec-Fetch-Dest': 'document',
111
+ 'Sec-Fetch-Mode': 'navigate',
112
+ 'Sec-Fetch-Site': 'none',
113
+ 'Sec-Fetch-User': '?1',
114
+ 'Upgrade-Insecure-Requests': '1'
115
+ })
116
+
117
+ async def handle_captcha(self, page: Page, url: str):
118
+ self.logger.info("Waiting for user to solve CAPTCHA...")
119
+ await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)
120
+
121
+ print("Please solve the CAPTCHA in the browser window.")
122
+ print("Once solved, press Enter in this console to continue...")
123
+ input()
124
+
125
+ await page.wait_for_load_state('networkidle')
126
+ self.logger.info("CAPTCHA handling completed.")
127
+
128
  async def scrape_multiple_pages(self, page: Page, base_url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
129
  contents = []
130
 
 
132
  url_pattern = self.detect_url_pattern(base_url)
133
 
134
  if not url_pattern and not pages:
135
+ # Single page scraping
136
  self.logger.info(f"Scraping single page: {base_url}")
137
  content = await self.navigate_and_get_content(page, base_url)
138
  if self.config.bypass_cloudflare and "Cloudflare" in content and "ray ID" in content.lower():
 
140
  content = await self.bypass_cloudflare(page, base_url)
141
  contents.append(content)
142
  else:
143
+ # Multiple page scraping
144
  page_numbers = self.parse_page_numbers(pages) if pages else [1]
145
  for page_num in page_numbers:
146
  current_url = self.apply_url_pattern(base_url, url_pattern, page_num) if url_pattern else base_url
 
157
 
158
  return contents
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  async def navigate_and_get_content(self, page: Page, url: str) -> str:
161
  await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)
162
  if self.config.simulate_human:
 
221
 
222
  def apply_url_pattern(self, base_url: str, pattern: str, page_num: int) -> str:
223
  parsed_url = urlparse(base_url)
224
+ if '=' in pattern:
225
  query = parse_qs(parsed_url.query)
226
  param, value = pattern.split('=')
227
  query[param] = [value.format(**{param: page_num})]
228
  return urlunparse(parsed_url._replace(query=urlencode(query, doseq=True)))
229
+ elif '{page}' in pattern:
230
  return urlunparse(parsed_url._replace(path=pattern.format(page=page_num)))
231
  else:
232
  return base_url
src/web_extractor.py CHANGED
@@ -109,36 +109,38 @@ class WebExtractor:
109
 
110
  async def process_query(self, user_input: str) -> str:
111
  if user_input.lower().startswith("http"):
112
- parts = user_input.split(maxsplit=2)
113
  url = parts[0]
114
- pages = parts[1] if len(parts) > 1 else None
115
- url_pattern = parts[2] if len(parts) > 2 else None
116
- response = await self._fetch_url(url, pages, url_pattern)
 
 
117
  elif not self.current_content:
118
  response = "Please provide a URL first before asking for information."
119
  else:
120
  response = await self._extract_info(user_input)
121
-
122
  self.conversation_history.append(f"Human: {user_input}")
123
  self.conversation_history.append(f"AI: {response}")
124
  return response
125
 
126
- async def _fetch_url(self, url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> str:
127
  self.current_url = url
128
  proxy = await self.proxy_manager.get_proxy()
129
-
130
- contents = await self.playwright_scraper.fetch_content(url, proxy, pages, url_pattern)
131
  self.current_content = "\n".join(contents)
132
  self.preprocessed_content = self._preprocess_content(self.current_content)
133
-
134
  new_hash = self._hash_content(self.preprocessed_content)
135
  if self.content_hash != new_hash:
136
  self.content_hash = new_hash
137
  self.query_cache.clear()
138
-
139
  return f"I've fetched and preprocessed the content from {self.current_url}" + \
140
- (f" (pages: {pages})" if pages else "") + \
141
- ". What would you like to know about it?"
142
 
143
  def _preprocess_content(self, content: str) -> str:
144
  soup = BeautifulSoup(content, 'html.parser')
 
109
 
110
  async def process_query(self, user_input: str) -> str:
111
  if user_input.lower().startswith("http"):
112
+ parts = user_input.split(maxsplit=3)
113
  url = parts[0]
114
+ pages = parts[1] if len(parts) > 1 and not parts[1].startswith('-') else None
115
+ url_pattern = parts[2] if len(parts) > 2 and not parts[2].startswith('-') else None
116
+ handle_captcha = '-captcha' in user_input.lower()
117
+
118
+ response = await self._fetch_url(url, pages, url_pattern, handle_captcha)
119
  elif not self.current_content:
120
  response = "Please provide a URL first before asking for information."
121
  else:
122
  response = await self._extract_info(user_input)
123
+
124
  self.conversation_history.append(f"Human: {user_input}")
125
  self.conversation_history.append(f"AI: {response}")
126
  return response
127
 
128
+ async def _fetch_url(self, url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None, handle_captcha: bool = False) -> str:
129
  self.current_url = url
130
  proxy = await self.proxy_manager.get_proxy()
131
+
132
+ contents = await self.playwright_scraper.fetch_content(url, proxy, pages, url_pattern, handle_captcha)
133
  self.current_content = "\n".join(contents)
134
  self.preprocessed_content = self._preprocess_content(self.current_content)
135
+
136
  new_hash = self._hash_content(self.preprocessed_content)
137
  if self.content_hash != new_hash:
138
  self.content_hash = new_hash
139
  self.query_cache.clear()
140
+
141
  return f"I've fetched and preprocessed the content from {self.current_url}" + \
142
+ (f" (pages: {pages})" if pages else "") + \
143
+ ". What would you like to know about it?"
144
 
145
  def _preprocess_content(self, content: str) -> str:
146
  soup = BeautifulSoup(content, 'html.parser')