Spaces:
Paused
Paused
itsOwen commited on
Commit Β·
dfc44b9
1
Parent(s): 74036ee
stealth mode fix + captcha bypass added
Browse files- README.md +6 -6
- src/scrapers/playwright_scraper.py +75 -43
- src/web_extractor.py +14 -12
README.md
CHANGED
|
@@ -37,6 +37,7 @@ Whether you're a corpo data analyst, a street-smart netrunner, or just someone l
|
|
| 37 |
- π‘οΈ **Ethical Scraping**: Respects robots.txt and site policies. We may be in 2077, but we still have standards.
|
| 38 |
- π **Caching**: We implemented content-based and query-based caching using LRU cache and a custom dictionary to reduce redundant API calls.
|
| 39 |
- β
**Upload to Google Sheets**: Now you can easily upload your extracted CSV data to Google Sheets with one click.
|
|
|
|
| 40 |
- π **Proxy Mode (Coming Soon)**: Built-in proxy support to keep you ghosting through the net.
|
| 41 |
- π‘οΈ **Navigate through the Pages (BETA)**: Navigate through the webpage and scrape the data from different pages.
|
| 42 |
|
|
@@ -78,12 +79,9 @@ Please follow the Docker Container Guide given below, As I won't be able to main
|
|
| 78 |
|
| 79 |
Linux/Mac:
|
| 80 |
```bash
|
| 81 |
-
export OPENAI_API_KEY=
|
| 82 |
-
```
|
| 83 |
-
For Windows:
|
| 84 |
-
```bash
|
| 85 |
-
set OPENAI_API_KEY=your-api-key-here
|
| 86 |
```
|
|
|
|
| 87 |
6. If you want to use the Ollama:
|
| 88 |
|
| 89 |
Note: I only recommend using the OpenAI API, as GPT-4o-mini is really good at following instructions. If you are using open-source LLMs, make sure you have a good system: the speed of the data generation/presentation depends on how well your system can run the LLM, and you may also have to fine-tune the prompt and add some additional filters yourself.
|
|
@@ -118,7 +116,7 @@ If you prefer to use Docker, follow these steps to set up and run CyberScraper 2
|
|
| 118 |
```
|
| 119 |
- With OpenAI API key:
|
| 120 |
```bash
|
| 121 |
-
docker run -p 8501:8501 -e OPENAI_API_KEY=
|
| 122 |
```
|
| 123 |
|
| 124 |
5. Open your browser and navigate to `http://localhost:8501`.
|
|
@@ -272,6 +270,8 @@ bypass_cloudflare: bool = True:
|
|
| 272 |
|
| 273 |
Adjust these settings based on your target website and environment for optimal results.
|
| 274 |
|
|
|
|
|
|
|
| 275 |
## π€ Contributing
|
| 276 |
|
| 277 |
We welcome all cyberpunks, netrunners, and code samurais to contribute to CyberScraper 2077!
|
|
|
|
| 37 |
- π‘οΈ **Ethical Scraping**: Respects robots.txt and site policies. We may be in 2077, but we still have standards.
|
| 38 |
- π **Caching**: We implemented content-based and query-based caching using LRU cache and a custom dictionary to reduce redundant API calls.
|
| 39 |
- β
**Upload to Google Sheets**: Now you can easily upload your extracted CSV data to Google Sheets with one click.
|
| 40 |
+
- π€ **Bypass Captcha**: Bypass captchas by adding `-captcha` at the end of the URL.
|
| 41 |
- π **Proxy Mode (Coming Soon)**: Built-in proxy support to keep you ghosting through the net.
|
| 42 |
- π‘οΈ **Navigate through the Pages (BETA)**: Navigate through the webpage and scrape the data from different pages.
|
| 43 |
|
|
|
|
| 79 |
|
| 80 |
Linux/Mac:
|
| 81 |
```bash
|
| 82 |
+
export OPENAI_API_KEY="your-api-key-here"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
```
|
| 84 |
+
|
| 85 |
6. If you want to use the Ollama:
|
| 86 |
|
| 87 |
Note: I only recommend using the OpenAI API, as GPT-4o-mini is really good at following instructions. If you are using open-source LLMs, make sure you have a good system: the speed of the data generation/presentation depends on how well your system can run the LLM, and you may also have to fine-tune the prompt and add some additional filters yourself.
|
|
|
|
| 116 |
```
|
| 117 |
- With OpenAI API key:
|
| 118 |
```bash
|
| 119 |
+
docker run -p 8501:8501 -e OPENAI_API_KEY="your-actual-api-key" cyberscraper-2077
|
| 120 |
```
|
| 121 |
|
| 122 |
5. Open your browser and navigate to `http://localhost:8501`.
|
|
|
|
| 270 |
|
| 271 |
Adjust these settings based on your target website and environment for optimal results.
|
| 272 |
|
| 273 |
+
You can also bypass the captcha using the ```-captcha``` parameter at the end of the URL. The browser window will pop up; complete the captcha and go back to your terminal window, then press Enter and the bot will complete its task.
|
| 274 |
+
|
| 275 |
## π€ Contributing
|
| 276 |
|
| 277 |
We welcome all cyberpunks, netrunners, and code samurais to contribute to CyberScraper 2077!
|
src/scrapers/playwright_scraper.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 2 |
-
from playwright_stealth import stealth_async
|
| 3 |
from .base_scraper import BaseScraper
|
| 4 |
from typing import Dict, Any, Optional, List, Tuple
|
| 5 |
import asyncio
|
|
@@ -35,17 +34,19 @@ class PlaywrightScraper(BaseScraper):
|
|
| 35 |
self.logger.setLevel(logging.DEBUG if config.debug else logging.INFO)
|
| 36 |
self.config = config
|
| 37 |
|
| 38 |
-
async def fetch_content(self, url: str, proxy: Optional[str] = None, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
|
| 39 |
async with async_playwright() as p:
|
| 40 |
-
browser = await self.launch_browser(p, proxy)
|
| 41 |
context = await self.create_context(browser, proxy)
|
| 42 |
page = await context.new_page()
|
| 43 |
|
| 44 |
if self.config.use_stealth:
|
| 45 |
-
await
|
| 46 |
await self.set_browser_features(page)
|
| 47 |
|
| 48 |
try:
|
|
|
|
|
|
|
| 49 |
contents = await self.scrape_multiple_pages(page, url, pages, url_pattern)
|
| 50 |
except Exception as e:
|
| 51 |
self.logger.error(f"Error during scraping: {str(e)}")
|
|
@@ -58,6 +59,72 @@ class PlaywrightScraper(BaseScraper):
|
|
| 58 |
|
| 59 |
return contents
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
async def scrape_multiple_pages(self, page: Page, base_url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
|
| 62 |
contents = []
|
| 63 |
|
|
@@ -65,6 +132,7 @@ class PlaywrightScraper(BaseScraper):
|
|
| 65 |
url_pattern = self.detect_url_pattern(base_url)
|
| 66 |
|
| 67 |
if not url_pattern and not pages:
|
|
|
|
| 68 |
self.logger.info(f"Scraping single page: {base_url}")
|
| 69 |
content = await self.navigate_and_get_content(page, base_url)
|
| 70 |
if self.config.bypass_cloudflare and "Cloudflare" in content and "ray ID" in content.lower():
|
|
@@ -72,6 +140,7 @@ class PlaywrightScraper(BaseScraper):
|
|
| 72 |
content = await self.bypass_cloudflare(page, base_url)
|
| 73 |
contents.append(content)
|
| 74 |
else:
|
|
|
|
| 75 |
page_numbers = self.parse_page_numbers(pages) if pages else [1]
|
| 76 |
for page_num in page_numbers:
|
| 77 |
current_url = self.apply_url_pattern(base_url, url_pattern, page_num) if url_pattern else base_url
|
|
@@ -88,43 +157,6 @@ class PlaywrightScraper(BaseScraper):
|
|
| 88 |
|
| 89 |
return contents
|
| 90 |
|
| 91 |
-
async def launch_browser(self, playwright, proxy: Optional[str] = None) -> Browser:
|
| 92 |
-
return await playwright.chromium.launch(
|
| 93 |
-
headless=self.config.headless,
|
| 94 |
-
args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars',
|
| 95 |
-
'--window-position=0,0', '--ignore-certifcate-errors',
|
| 96 |
-
'--ignore-certifcate-errors-spki-list'],
|
| 97 |
-
proxy={'server': proxy} if proxy else None
|
| 98 |
-
)
|
| 99 |
-
|
| 100 |
-
async def create_context(self, browser: Browser, proxy: Optional[str] = None) -> BrowserContext:
|
| 101 |
-
return await browser.new_context(
|
| 102 |
-
viewport={'width': 1920, 'height': 1080},
|
| 103 |
-
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 104 |
-
proxy={'server': proxy} if proxy else None,
|
| 105 |
-
java_script_enabled=True,
|
| 106 |
-
ignore_https_errors=True
|
| 107 |
-
)
|
| 108 |
-
|
| 109 |
-
async def set_browser_features(self, page: Page):
|
| 110 |
-
if self.config.use_custom_headers:
|
| 111 |
-
await page.set_extra_http_headers({
|
| 112 |
-
'Accept-Language': 'en-US,en;q=0.9',
|
| 113 |
-
'Accept-Encoding': 'gzip, deflate, br',
|
| 114 |
-
'Referer': 'https://www.google.com/',
|
| 115 |
-
'Sec-Fetch-Dest': 'document',
|
| 116 |
-
'Sec-Fetch-Mode': 'navigate',
|
| 117 |
-
'Sec-Fetch-Site': 'none',
|
| 118 |
-
'Sec-Fetch-User': '?1',
|
| 119 |
-
'Upgrade-Insecure-Requests': '1'
|
| 120 |
-
})
|
| 121 |
-
if self.config.hide_webdriver:
|
| 122 |
-
await page.evaluate('''
|
| 123 |
-
Object.defineProperty(navigator, 'webdriver', {
|
| 124 |
-
get: () => undefined
|
| 125 |
-
})
|
| 126 |
-
''')
|
| 127 |
-
|
| 128 |
async def navigate_and_get_content(self, page: Page, url: str) -> str:
|
| 129 |
await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)
|
| 130 |
if self.config.simulate_human:
|
|
@@ -189,12 +221,12 @@ class PlaywrightScraper(BaseScraper):
|
|
| 189 |
|
| 190 |
def apply_url_pattern(self, base_url: str, pattern: str, page_num: int) -> str:
|
| 191 |
parsed_url = urlparse(base_url)
|
| 192 |
-
if '=' in pattern:
|
| 193 |
query = parse_qs(parsed_url.query)
|
| 194 |
param, value = pattern.split('=')
|
| 195 |
query[param] = [value.format(**{param: page_num})]
|
| 196 |
return urlunparse(parsed_url._replace(query=urlencode(query, doseq=True)))
|
| 197 |
-
elif '{page}' in pattern:
|
| 198 |
return urlunparse(parsed_url._replace(path=pattern.format(page=page_num)))
|
| 199 |
else:
|
| 200 |
return base_url
|
|
|
|
| 1 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
|
|
|
| 2 |
from .base_scraper import BaseScraper
|
| 3 |
from typing import Dict, Any, Optional, List, Tuple
|
| 4 |
import asyncio
|
|
|
|
| 34 |
self.logger.setLevel(logging.DEBUG if config.debug else logging.INFO)
|
| 35 |
self.config = config
|
| 36 |
|
| 37 |
+
async def fetch_content(self, url: str, proxy: Optional[str] = None, pages: Optional[str] = None, url_pattern: Optional[str] = None, handle_captcha: bool = False) -> List[str]:
|
| 38 |
async with async_playwright() as p:
|
| 39 |
+
browser = await self.launch_browser(p, proxy, handle_captcha)
|
| 40 |
context = await self.create_context(browser, proxy)
|
| 41 |
page = await context.new_page()
|
| 42 |
|
| 43 |
if self.config.use_stealth:
|
| 44 |
+
await self.apply_stealth_settings(page)
|
| 45 |
await self.set_browser_features(page)
|
| 46 |
|
| 47 |
try:
|
| 48 |
+
if handle_captcha:
|
| 49 |
+
await self.handle_captcha(page, url)
|
| 50 |
contents = await self.scrape_multiple_pages(page, url, pages, url_pattern)
|
| 51 |
except Exception as e:
|
| 52 |
self.logger.error(f"Error during scraping: {str(e)}")
|
|
|
|
| 59 |
|
| 60 |
return contents
|
| 61 |
|
| 62 |
+
async def launch_browser(self, playwright, proxy: Optional[str] = None, handle_captcha: bool = False) -> Browser:
|
| 63 |
+
return await playwright.chromium.launch(
|
| 64 |
+
headless=self.config.headless and not handle_captcha,
|
| 65 |
+
args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars',
|
| 66 |
+
'--window-position=0,0', '--ignore-certifcate-errors',
|
| 67 |
+
'--ignore-certifcate-errors-spki-list'],
|
| 68 |
+
proxy={'server': proxy} if proxy else None
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
async def create_context(self, browser: Browser, proxy: Optional[str] = None) -> BrowserContext:
|
| 72 |
+
return await browser.new_context(
|
| 73 |
+
viewport={'width': 1920, 'height': 1080},
|
| 74 |
+
user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 75 |
+
proxy={'server': proxy} if proxy else None,
|
| 76 |
+
java_script_enabled=True,
|
| 77 |
+
ignore_https_errors=True
|
| 78 |
+
)
|
| 79 |
+
|
| 80 |
+
async def apply_stealth_settings(self, page: Page):
|
| 81 |
+
await page.evaluate('''
|
| 82 |
+
() => {
|
| 83 |
+
Object.defineProperty(navigator, 'webdriver', {
|
| 84 |
+
get: () => undefined
|
| 85 |
+
});
|
| 86 |
+
|
| 87 |
+
Object.defineProperty(navigator, 'languages', {
|
| 88 |
+
get: () => ['en-US', 'en']
|
| 89 |
+
});
|
| 90 |
+
|
| 91 |
+
Object.defineProperty(navigator, 'plugins', {
|
| 92 |
+
get: () => [1, 2, 3, 4, 5]
|
| 93 |
+
});
|
| 94 |
+
|
| 95 |
+
const originalQuery = window.navigator.permissions.query;
|
| 96 |
+
window.navigator.permissions.query = (parameters) => (
|
| 97 |
+
parameters.name === 'notifications' ?
|
| 98 |
+
Promise.resolve({ state: Notification.permission }) :
|
| 99 |
+
originalQuery(parameters)
|
| 100 |
+
);
|
| 101 |
+
}
|
| 102 |
+
''')
|
| 103 |
+
|
| 104 |
+
async def set_browser_features(self, page: Page):
|
| 105 |
+
if self.config.use_custom_headers:
|
| 106 |
+
await page.set_extra_http_headers({
|
| 107 |
+
'Accept-Language': 'en-US,en;q=0.9',
|
| 108 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 109 |
+
'Referer': 'https://www.google.com/',
|
| 110 |
+
'Sec-Fetch-Dest': 'document',
|
| 111 |
+
'Sec-Fetch-Mode': 'navigate',
|
| 112 |
+
'Sec-Fetch-Site': 'none',
|
| 113 |
+
'Sec-Fetch-User': '?1',
|
| 114 |
+
'Upgrade-Insecure-Requests': '1'
|
| 115 |
+
})
|
| 116 |
+
|
| 117 |
+
async def handle_captcha(self, page: Page, url: str):
|
| 118 |
+
self.logger.info("Waiting for user to solve CAPTCHA...")
|
| 119 |
+
await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)
|
| 120 |
+
|
| 121 |
+
print("Please solve the CAPTCHA in the browser window.")
|
| 122 |
+
print("Once solved, press Enter in this console to continue...")
|
| 123 |
+
input()
|
| 124 |
+
|
| 125 |
+
await page.wait_for_load_state('networkidle')
|
| 126 |
+
self.logger.info("CAPTCHA handling completed.")
|
| 127 |
+
|
| 128 |
async def scrape_multiple_pages(self, page: Page, base_url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
|
| 129 |
contents = []
|
| 130 |
|
|
|
|
| 132 |
url_pattern = self.detect_url_pattern(base_url)
|
| 133 |
|
| 134 |
if not url_pattern and not pages:
|
| 135 |
+
# Single page scraping
|
| 136 |
self.logger.info(f"Scraping single page: {base_url}")
|
| 137 |
content = await self.navigate_and_get_content(page, base_url)
|
| 138 |
if self.config.bypass_cloudflare and "Cloudflare" in content and "ray ID" in content.lower():
|
|
|
|
| 140 |
content = await self.bypass_cloudflare(page, base_url)
|
| 141 |
contents.append(content)
|
| 142 |
else:
|
| 143 |
+
# Multiple page scraping
|
| 144 |
page_numbers = self.parse_page_numbers(pages) if pages else [1]
|
| 145 |
for page_num in page_numbers:
|
| 146 |
current_url = self.apply_url_pattern(base_url, url_pattern, page_num) if url_pattern else base_url
|
|
|
|
| 157 |
|
| 158 |
return contents
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
async def navigate_and_get_content(self, page: Page, url: str) -> str:
|
| 161 |
await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)
|
| 162 |
if self.config.simulate_human:
|
|
|
|
| 221 |
|
| 222 |
def apply_url_pattern(self, base_url: str, pattern: str, page_num: int) -> str:
|
| 223 |
parsed_url = urlparse(base_url)
|
| 224 |
+
if '=' in pattern:
|
| 225 |
query = parse_qs(parsed_url.query)
|
| 226 |
param, value = pattern.split('=')
|
| 227 |
query[param] = [value.format(**{param: page_num})]
|
| 228 |
return urlunparse(parsed_url._replace(query=urlencode(query, doseq=True)))
|
| 229 |
+
elif '{page}' in pattern:
|
| 230 |
return urlunparse(parsed_url._replace(path=pattern.format(page=page_num)))
|
| 231 |
else:
|
| 232 |
return base_url
|
src/web_extractor.py
CHANGED
|
@@ -109,36 +109,38 @@ class WebExtractor:
|
|
| 109 |
|
| 110 |
async def process_query(self, user_input: str) -> str:
|
| 111 |
if user_input.lower().startswith("http"):
|
| 112 |
-
parts = user_input.split(maxsplit=
|
| 113 |
url = parts[0]
|
| 114 |
-
pages = parts[1] if len(parts) > 1 else None
|
| 115 |
-
url_pattern = parts[2] if len(parts) > 2 else None
|
| 116 |
-
|
|
|
|
|
|
|
| 117 |
elif not self.current_content:
|
| 118 |
response = "Please provide a URL first before asking for information."
|
| 119 |
else:
|
| 120 |
response = await self._extract_info(user_input)
|
| 121 |
-
|
| 122 |
self.conversation_history.append(f"Human: {user_input}")
|
| 123 |
self.conversation_history.append(f"AI: {response}")
|
| 124 |
return response
|
| 125 |
|
| 126 |
-
async def _fetch_url(self, url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> str:
|
| 127 |
self.current_url = url
|
| 128 |
proxy = await self.proxy_manager.get_proxy()
|
| 129 |
-
|
| 130 |
-
contents = await self.playwright_scraper.fetch_content(url, proxy, pages, url_pattern)
|
| 131 |
self.current_content = "\n".join(contents)
|
| 132 |
self.preprocessed_content = self._preprocess_content(self.current_content)
|
| 133 |
-
|
| 134 |
new_hash = self._hash_content(self.preprocessed_content)
|
| 135 |
if self.content_hash != new_hash:
|
| 136 |
self.content_hash = new_hash
|
| 137 |
self.query_cache.clear()
|
| 138 |
-
|
| 139 |
return f"I've fetched and preprocessed the content from {self.current_url}" + \
|
| 140 |
-
|
| 141 |
-
|
| 142 |
|
| 143 |
def _preprocess_content(self, content: str) -> str:
|
| 144 |
soup = BeautifulSoup(content, 'html.parser')
|
|
|
|
| 109 |
|
| 110 |
async def process_query(self, user_input: str) -> str:
|
| 111 |
if user_input.lower().startswith("http"):
|
| 112 |
+
parts = user_input.split(maxsplit=3)
|
| 113 |
url = parts[0]
|
| 114 |
+
pages = parts[1] if len(parts) > 1 and not parts[1].startswith('-') else None
|
| 115 |
+
url_pattern = parts[2] if len(parts) > 2 and not parts[2].startswith('-') else None
|
| 116 |
+
handle_captcha = '-captcha' in user_input.lower()
|
| 117 |
+
|
| 118 |
+
response = await self._fetch_url(url, pages, url_pattern, handle_captcha)
|
| 119 |
elif not self.current_content:
|
| 120 |
response = "Please provide a URL first before asking for information."
|
| 121 |
else:
|
| 122 |
response = await self._extract_info(user_input)
|
| 123 |
+
|
| 124 |
self.conversation_history.append(f"Human: {user_input}")
|
| 125 |
self.conversation_history.append(f"AI: {response}")
|
| 126 |
return response
|
| 127 |
|
| 128 |
+
async def _fetch_url(self, url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None, handle_captcha: bool = False) -> str:
|
| 129 |
self.current_url = url
|
| 130 |
proxy = await self.proxy_manager.get_proxy()
|
| 131 |
+
|
| 132 |
+
contents = await self.playwright_scraper.fetch_content(url, proxy, pages, url_pattern, handle_captcha)
|
| 133 |
self.current_content = "\n".join(contents)
|
| 134 |
self.preprocessed_content = self._preprocess_content(self.current_content)
|
| 135 |
+
|
| 136 |
new_hash = self._hash_content(self.preprocessed_content)
|
| 137 |
if self.content_hash != new_hash:
|
| 138 |
self.content_hash = new_hash
|
| 139 |
self.query_cache.clear()
|
| 140 |
+
|
| 141 |
return f"I've fetched and preprocessed the content from {self.current_url}" + \
|
| 142 |
+
(f" (pages: {pages})" if pages else "") + \
|
| 143 |
+
". What would you like to know about it?"
|
| 144 |
|
| 145 |
def _preprocess_content(self, content: str) -> str:
|
| 146 |
soup = BeautifulSoup(content, 'html.parser')
|