Spaces:
Paused
Paused
itsOwen committed on
Commit ·
d25f246
1
Parent(s): a89b867
multi-page scrape beta released
Browse files- README.md +67 -0
- src/scrapers/playwright_scraper.py +93 -15
- src/web_extractor.py +41 -21
README.md
CHANGED
|
@@ -167,6 +167,73 @@ Note: Ensure that your firewall allows connections to port 11434 for Ollama.
|
|
| 167 |
|
| 168 |
4. Watch as CyberScraper 2077 tears through the net, extracting your data faster than you can say "flatline"!
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
## Setup Google Sheets Authentication:
|
| 171 |
|
| 172 |
1. Go to the Google Cloud Console (https://console.cloud.google.com/).
|
|
|
|
| 167 |
|
| 168 |
4. Watch as CyberScraper 2077 tears through the net, extracting your data faster than you can say "flatline"!
|
| 169 |
|
| 170 |
+
## 🌐 Multi-Page Scraping (BETA)
|
| 171 |
+
|
| 172 |
+
> **Note**: The multi-page scraping feature is currently in beta. While functional, you may encounter occasional issues or unexpected behavior. We appreciate your feedback and patience as we continue to improve this feature.
|
| 173 |
+
|
| 174 |
+
CyberScraper 2077 now supports multi-page scraping, allowing you to extract data from multiple pages of a website in one go. This feature is perfect for scraping paginated content, search results, or any site with data spread across multiple pages.
|
| 175 |
+
|
| 176 |
+
### How to Use Multi-Page Scraping
|
| 177 |
+
|
| 178 |
+
We suggest entering the URL structure every time you want to scrape multiple pages so the scraper can detect it easily; it recognizes nearly all URL types.
|
| 179 |
+
|
| 180 |
+
1. **Basic Usage**:
|
| 181 |
+
To scrape multiple pages, use the following format when entering the URL:
|
| 182 |
+
```
|
| 183 |
+
https://example.com/page 1-5
|
| 184 |
+
https://example.com/p/ 1-6
|
| 185 |
+
https://example.com/xample/something-something-1279?p=1 1-3
|
| 186 |
+
```
|
| 187 |
+
This will scrape pages 1 through 5 of the website.
|
| 188 |
+
|
| 189 |
+
2. **Custom Page Ranges**:
|
| 190 |
+
You can specify custom page ranges:
|
| 191 |
+
```
|
| 192 |
+
https://example.com/p/ 1-5,7,9-12
|
| 193 |
+
https://example.com/xample/something-something-1279?p=1 1,7,8,9
|
| 194 |
+
```
|
| 195 |
+
This will scrape pages 1 to 5, page 7, and pages 9 to 12.
|
| 196 |
+
|
| 197 |
+
3. **URL Patterns**:
|
| 198 |
+
For websites with different URL structures, you can specify a pattern:
|
| 199 |
+
```
|
| 200 |
+
https://example.com/search?q=cyberpunk&page={page} 1-5
|
| 201 |
+
```
|
| 202 |
+
Replace `{page}` with where the page number should be in the URL.
|
| 203 |
+
|
| 204 |
+
4. **Automatic Pattern Detection**:
|
| 205 |
+
If you don't specify a pattern, CyberScraper 2077 will attempt to detect the URL pattern automatically. However, for best results, specifying the pattern is recommended.
|
| 206 |
+
|
| 207 |
+
### Tips for Effective Multi-Page Scraping
|
| 208 |
+
|
| 209 |
+
- Start with a small range of pages to test before scraping a large number.
|
| 210 |
+
- Be mindful of the website's load and your scraping speed to avoid overloading servers.
|
| 211 |
+
- Use the `simulate_human` option for more natural scraping behavior on sites with anti-bot measures.
|
| 212 |
+
- Regularly check the website's `robots.txt` file and terms of service to ensure compliance.
|
| 213 |
+
|
| 214 |
+
### Example
|
| 215 |
+
|
| 216 |
+
```bash
|
| 217 |
+
URL Example : "https://news.ycombinator.com/?p=1 1-3 or 1,2,3,4"
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
### Handling Errors
|
| 221 |
+
|
| 222 |
+
If you encounter errors during multi-page scraping:
|
| 223 |
+
- Check your internet connection
|
| 224 |
+
- Verify the URL pattern is correct
|
| 225 |
+
- Ensure the website allows scraping
|
| 226 |
+
- Try reducing the number of pages or increasing the delay between requests
|
| 227 |
+
|
| 228 |
+
### Beta Feedback
|
| 229 |
+
|
| 230 |
+
As this feature is in beta, we highly value your feedback. If you encounter any issues or have suggestions for improvement, please:
|
| 231 |
+
1. Open an issue on our GitHub repository
|
| 232 |
+
2. Provide detailed information about the problem, including the URL structure and number of pages you were attempting to scrape
|
| 233 |
+
3. Share any error messages or unexpected behaviors you observed
|
| 234 |
+
|
| 235 |
+
Your input is crucial in helping us refine and stabilize this feature for future releases.
|
| 236 |
+
|
| 237 |
## Setup Google Sheets Authentication:
|
| 238 |
|
| 239 |
1. Go to the Google Cloud Console (https://console.cloud.google.com/).
|
src/scrapers/playwright_scraper.py
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 2 |
from playwright_stealth import stealth_async
|
| 3 |
from .base_scraper import BaseScraper
|
| 4 |
-
from typing import Dict, Any, Optional
|
| 5 |
import asyncio
|
| 6 |
import random
|
| 7 |
import logging
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class ScraperConfig:
|
| 10 |
def __init__(self,
|
|
@@ -12,20 +14,28 @@ class ScraperConfig:
|
|
| 12 |
simulate_human: bool = False,
|
| 13 |
use_custom_headers: bool = True,
|
| 14 |
hide_webdriver: bool = True,
|
| 15 |
-
bypass_cloudflare: bool = True
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
self.use_stealth = use_stealth
|
| 17 |
self.simulate_human = simulate_human
|
| 18 |
self.use_custom_headers = use_custom_headers
|
| 19 |
self.hide_webdriver = hide_webdriver
|
| 20 |
self.bypass_cloudflare = bypass_cloudflare
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
class PlaywrightScraper(BaseScraper):
|
| 23 |
def __init__(self, config: ScraperConfig = ScraperConfig()):
|
| 24 |
self.logger = logging.getLogger(__name__)
|
| 25 |
-
self.logger.setLevel(logging.DEBUG)
|
| 26 |
self.config = config
|
| 27 |
|
| 28 |
-
async def fetch_content(self, url: str, proxy: Optional[str] = None) -> str:
|
| 29 |
async with async_playwright() as p:
|
| 30 |
browser = await self.launch_browser(p, proxy)
|
| 31 |
context = await self.create_context(browser, proxy)
|
|
@@ -36,24 +46,49 @@ class PlaywrightScraper(BaseScraper):
|
|
| 36 |
await self.set_browser_features(page)
|
| 37 |
|
| 38 |
try:
|
| 39 |
-
|
| 40 |
-
if self.config.bypass_cloudflare and "Cloudflare" in content and "ray ID" in content.lower():
|
| 41 |
-
self.logger.info("Cloudflare detected, attempting to bypass...")
|
| 42 |
-
content = await self.bypass_cloudflare(page, url)
|
| 43 |
except Exception as e:
|
| 44 |
self.logger.error(f"Error during scraping: {str(e)}")
|
| 45 |
-
|
| 46 |
finally:
|
|
|
|
|
|
|
|
|
|
| 47 |
await browser.close()
|
| 48 |
|
| 49 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
async def launch_browser(self, playwright, proxy: Optional[str] = None) -> Browser:
|
| 52 |
return await playwright.chromium.launch(
|
| 53 |
-
headless=
|
| 54 |
args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars',
|
| 55 |
'--window-position=0,0', '--ignore-certifcate-errors',
|
| 56 |
-
'--ignore-certifcate-errors-spki-list'
|
| 57 |
proxy={'server': proxy} if proxy else None
|
| 58 |
)
|
| 59 |
|
|
@@ -86,17 +121,17 @@ class PlaywrightScraper(BaseScraper):
|
|
| 86 |
''')
|
| 87 |
|
| 88 |
async def navigate_and_get_content(self, page: Page, url: str) -> str:
|
| 89 |
-
await page.goto(url, wait_until=
|
| 90 |
if self.config.simulate_human:
|
| 91 |
await self.simulate_human_behavior(page)
|
| 92 |
else:
|
| 93 |
-
await asyncio.sleep(
|
| 94 |
return await page.content()
|
| 95 |
|
| 96 |
async def bypass_cloudflare(self, page: Page, url: str) -> str:
|
| 97 |
max_retries = 3
|
| 98 |
for _ in range(max_retries):
|
| 99 |
-
await page.reload(wait_until=
|
| 100 |
if self.config.simulate_human:
|
| 101 |
await self.simulate_human_behavior(page)
|
| 102 |
else:
|
|
@@ -131,5 +166,48 @@ class PlaywrightScraper(BaseScraper):
|
|
| 131 |
await random_element.hover()
|
| 132 |
await asyncio.sleep(random.uniform(0.3, 0.7))
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
async def extract(self, content: str) -> Dict[str, Any]:
|
| 135 |
return {"raw_content": content}
|
|
|
|
| 1 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 2 |
from playwright_stealth import stealth_async
|
| 3 |
from .base_scraper import BaseScraper
|
| 4 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 5 |
import asyncio
|
| 6 |
import random
|
| 7 |
import logging
|
| 8 |
+
import re
|
| 9 |
+
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
|
| 10 |
|
| 11 |
class ScraperConfig:
|
| 12 |
def __init__(self,
|
|
|
|
| 14 |
simulate_human: bool = False,
|
| 15 |
use_custom_headers: bool = True,
|
| 16 |
hide_webdriver: bool = True,
|
| 17 |
+
bypass_cloudflare: bool = True,
|
| 18 |
+
headless: bool = True,
|
| 19 |
+
debug: bool = False,
|
| 20 |
+
timeout: int = 60000,
|
| 21 |
+
wait_for: str = 'networkidle'):
|
| 22 |
self.use_stealth = use_stealth
|
| 23 |
self.simulate_human = simulate_human
|
| 24 |
self.use_custom_headers = use_custom_headers
|
| 25 |
self.hide_webdriver = hide_webdriver
|
| 26 |
self.bypass_cloudflare = bypass_cloudflare
|
| 27 |
+
self.headless = headless
|
| 28 |
+
self.debug = debug
|
| 29 |
+
self.timeout = timeout
|
| 30 |
+
self.wait_for = wait_for
|
| 31 |
|
| 32 |
class PlaywrightScraper(BaseScraper):
|
| 33 |
def __init__(self, config: ScraperConfig = ScraperConfig()):
|
| 34 |
self.logger = logging.getLogger(__name__)
|
| 35 |
+
self.logger.setLevel(logging.DEBUG if config.debug else logging.INFO)
|
| 36 |
self.config = config
|
| 37 |
|
| 38 |
+
async def fetch_content(self, url: str, proxy: Optional[str] = None, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
|
| 39 |
async with async_playwright() as p:
|
| 40 |
browser = await self.launch_browser(p, proxy)
|
| 41 |
context = await self.create_context(browser, proxy)
|
|
|
|
| 46 |
await self.set_browser_features(page)
|
| 47 |
|
| 48 |
try:
|
| 49 |
+
contents = await self.scrape_multiple_pages(page, url, pages, url_pattern)
|
|
|
|
|
|
|
|
|
|
| 50 |
except Exception as e:
|
| 51 |
self.logger.error(f"Error during scraping: {str(e)}")
|
| 52 |
+
contents = [f"Error: {str(e)}"]
|
| 53 |
finally:
|
| 54 |
+
if self.config.debug:
|
| 55 |
+
self.logger.info("Scraping completed. Keeping browser open for debugging.")
|
| 56 |
+
await asyncio.sleep(30) # Keep the browser open for 30 seconds
|
| 57 |
await browser.close()
|
| 58 |
|
| 59 |
+
return contents
|
| 60 |
+
|
| 61 |
+
async def scrape_multiple_pages(self, page: Page, base_url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> List[str]:
    """Fetch each requested page and return its raw content, one entry per page.

    Falls back to automatic pattern detection when *url_pattern* is not given;
    returns a single-element error list when no pattern can be determined.
    """
    pattern = url_pattern or self.detect_url_pattern(base_url)
    if not pattern:
        return ["Error: Unable to detect URL pattern. Please provide a pattern."]

    results: List[str] = []
    for number in self.parse_page_numbers(pages):
        target = self.apply_url_pattern(base_url, pattern, number)
        self.logger.info(f"Scraping page {number}: {target}")

        content = await self.navigate_and_get_content(page, target)
        # Heuristic Cloudflare challenge detection on the fetched markup.
        if self.config.bypass_cloudflare and "Cloudflare" in content and "ray ID" in content.lower():
            self.logger.info("Cloudflare detected, attempting to bypass...")
            content = await self.bypass_cloudflare(page, target)

        results.append(content)

        # Add a delay between page navigations
        await asyncio.sleep(random.uniform(1, 3))

    return results
|
| 85 |
|
| 86 |
async def launch_browser(self, playwright, proxy: Optional[str] = None) -> Browser:
    """Launch a Chromium instance configured from self.config.

    :param playwright: an active async_playwright context object.
    :param proxy: optional proxy server URL; passed through to Chromium.
    :return: the launched Browser.
    """
    return await playwright.chromium.launch(
        headless=self.config.headless,
        # Fixed: the switches were misspelled "--ignore-certifcate-errors",
        # which Chromium silently ignores, so certificate errors were NOT
        # being suppressed as intended.
        args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-infobars',
              '--window-position=0,0', '--ignore-certificate-errors',
              '--ignore-certificate-errors-spki-list'],
        proxy={'server': proxy} if proxy else None
    )
|
| 94 |
|
|
|
|
| 121 |
''')
|
| 122 |
|
| 123 |
async def navigate_and_get_content(self, page: Page, url: str) -> str:
    """Navigate *page* to *url* and return the resulting page content."""
    await page.goto(url, wait_until=self.config.wait_for, timeout=self.config.timeout)
    if not self.config.simulate_human:
        # Give late-loading content a moment to settle.
        await asyncio.sleep(2)
    else:
        await self.simulate_human_behavior(page)
    return await page.content()
|
| 130 |
|
| 131 |
async def bypass_cloudflare(self, page: Page, url: str) -> str:
|
| 132 |
max_retries = 3
|
| 133 |
for _ in range(max_retries):
|
| 134 |
+
await page.reload(wait_until=self.config.wait_for, timeout=self.config.timeout)
|
| 135 |
if self.config.simulate_human:
|
| 136 |
await self.simulate_human_behavior(page)
|
| 137 |
else:
|
|
|
|
| 166 |
await random_element.hover()
|
| 167 |
await asyncio.sleep(random.uniform(0.3, 0.7))
|
| 168 |
|
| 169 |
+
def detect_url_pattern(self, url: str) -> Optional[str]:
    """Infer a page-number placeholder pattern from *url*.

    Returns a query-style pattern like "p={p}" when a numeric query
    parameter is found, a path-style pattern like "/page/{page}" when a
    numeric path segment is found, or None when neither exists.
    """
    pieces = urlparse(url)

    # Prefer a numeric query parameter, e.g. "?p=2" -> "p={p}".
    for name, values in parse_qs(pieces.query).items():
        if values and values[0].isdigit():
            return name + "={" + name + "}"

    # Otherwise substitute the first numeric path segment with "{page}".
    segments = pieces.path.split('/')
    for index, segment in enumerate(segments):
        if segment.isdigit():
            return '/'.join(segments[:index] + ["{page}"] + segments[index + 1:])

    # No recognizable pagination marker.
    return None
|
| 187 |
+
|
| 188 |
+
def apply_url_pattern(self, base_url: str, pattern: str, page_num: int) -> str:
    """Build the concrete URL for *page_num* by substituting it into *pattern*.

    Patterns containing '=' are treated as query-parameter templates
    (e.g. "page={page}"); anything else is treated as a path template.
    """
    pieces = urlparse(base_url)
    if '=' not in pattern:
        # Path template: format the page number straight into the path.
        return urlunparse(pieces._replace(path=pattern.format(page=page_num)))
    # Query template such as "page={page}": rewrite just that parameter.
    name, template = pattern.split('=')
    params = parse_qs(pieces.query)
    params[name] = [template.format(**{name: page_num})]
    return urlunparse(pieces._replace(query=urlencode(params, doseq=True)))
|
| 197 |
+
|
| 198 |
+
def parse_page_numbers(self, pages: Optional[str]) -> List[int]:
    """Expand a page-selection string like "1-5,7,9-12" into a sorted list of ints.

    Accepts comma-separated single pages and inclusive ranges. Robustness
    fixes over the original: surrounding whitespace and empty segments
    (e.g. a trailing comma, "1,,3") no longer raise a bare ValueError, and
    a reversed range like "5-3" is normalized instead of silently yielding
    nothing. Returns [1] when nothing usable is specified. Non-numeric
    segments still raise ValueError.
    """
    if not pages:
        return [1]  # Default to first page if not specified

    page_numbers: set = set()
    for part in pages.split(','):
        part = part.strip()
        if not part:
            continue  # tolerate stray/trailing commas
        if '-' in part:
            start, end = map(int, part.split('-'))
            if start > end:  # normalize reversed ranges like "5-3"
                start, end = end, start
            page_numbers.update(range(start, end + 1))
        else:
            page_numbers.add(int(part))

    # If every segment was empty, fall back to page 1 (consistent with None).
    return sorted(page_numbers) if page_numbers else [1]
|
| 211 |
+
|
| 212 |
async def extract(self, content: str) -> Dict[str, Any]:
|
| 213 |
return {"raw_content": content}
|
src/web_extractor.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import asyncio
|
| 2 |
-
from typing import Dict, Any, Optional, List, Tuple
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
from io import StringIO, BytesIO
|
|
@@ -20,9 +20,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
| 20 |
import tiktoken
|
| 21 |
import csv
|
| 22 |
from bs4 import BeautifulSoup, Comment
|
|
|
|
| 23 |
|
| 24 |
class WebExtractor:
|
| 25 |
-
def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None):
|
| 26 |
model_kwargs = model_kwargs or {}
|
| 27 |
if isinstance(model_name, str) and model_name.startswith("ollama:"):
|
| 28 |
self.model = OllamaModelManager.get_model(model_name[7:])
|
|
@@ -31,7 +32,8 @@ class WebExtractor:
|
|
| 31 |
else:
|
| 32 |
self.model = Models.get_model(model_name, **model_kwargs)
|
| 33 |
|
| 34 |
-
|
|
|
|
| 35 |
self.html_scraper = HTMLScraper()
|
| 36 |
self.json_scraper = JSONScraper()
|
| 37 |
self.proxy_manager = ProxyManager(proxy)
|
|
@@ -77,7 +79,7 @@ class WebExtractor:
|
|
| 77 |
Based on the following preprocessed webpage content and the user's request, extract the relevant information.
|
| 78 |
Always present the data as a JSON array of objects, regardless of the user's requested format.
|
| 79 |
Each object in the array should represent one item or row of data.
|
| 80 |
-
Use the following format without any
|
| 81 |
|
| 82 |
[
|
| 83 |
{{
|
|
@@ -107,7 +109,11 @@ class WebExtractor:
|
|
| 107 |
|
| 108 |
async def process_query(self, user_input: str) -> str:
|
| 109 |
if user_input.lower().startswith("http"):
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
elif not self.current_content:
|
| 112 |
response = "Please provide a URL first before asking for information."
|
| 113 |
else:
|
|
@@ -117,10 +123,12 @@ class WebExtractor:
|
|
| 117 |
self.conversation_history.append(f"AI: {response}")
|
| 118 |
return response
|
| 119 |
|
| 120 |
-
async def _fetch_url(self, url: str) -> str:
|
| 121 |
self.current_url = url
|
| 122 |
proxy = await self.proxy_manager.get_proxy()
|
| 123 |
-
|
|
|
|
|
|
|
| 124 |
self.preprocessed_content = self._preprocess_content(self.current_content)
|
| 125 |
|
| 126 |
new_hash = self._hash_content(self.preprocessed_content)
|
|
@@ -128,7 +136,9 @@ class WebExtractor:
|
|
| 128 |
self.content_hash = new_hash
|
| 129 |
self.query_cache.clear()
|
| 130 |
|
| 131 |
-
return f"I've fetched and preprocessed the content from {self.current_url}
|
|
|
|
|
|
|
| 132 |
|
| 133 |
def _preprocess_content(self, content: str) -> str:
|
| 134 |
soup = BeautifulSoup(content, 'html.parser')
|
|
@@ -185,19 +195,29 @@ class WebExtractor:
|
|
| 185 |
self.query_cache[cache_key] = formatted_result
|
| 186 |
return formatted_result
|
| 187 |
|
| 188 |
-
def _format_result(self, extracted_data: str, query: str) -> str:
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
return self._format_as_text(extracted_data)
|
| 202 |
|
| 203 |
def optimized_text_splitter(self, text: str) -> List[str]:
|
|
|
|
| 1 |
import asyncio
|
| 2 |
+
from typing import Dict, Any, Optional, List, Tuple, Union
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
from io import StringIO, BytesIO
|
|
|
|
| 20 |
import tiktoken
|
| 21 |
import csv
|
| 22 |
from bs4 import BeautifulSoup, Comment
|
| 23 |
+
from .scrapers.playwright_scraper import PlaywrightScraper, ScraperConfig
|
| 24 |
|
| 25 |
class WebExtractor:
|
| 26 |
+
def __init__(self, model_name: str = "gpt-4o-mini", model_kwargs: Dict[str, Any] = None, proxy: Optional[str] = None, headless: bool = True, debug: bool = False):
|
| 27 |
model_kwargs = model_kwargs or {}
|
| 28 |
if isinstance(model_name, str) and model_name.startswith("ollama:"):
|
| 29 |
self.model = OllamaModelManager.get_model(model_name[7:])
|
|
|
|
| 32 |
else:
|
| 33 |
self.model = Models.get_model(model_name, **model_kwargs)
|
| 34 |
|
| 35 |
+
scraper_config = ScraperConfig(headless=headless, debug=debug)
|
| 36 |
+
self.playwright_scraper = PlaywrightScraper(config=scraper_config)
|
| 37 |
self.html_scraper = HTMLScraper()
|
| 38 |
self.json_scraper = JSONScraper()
|
| 39 |
self.proxy_manager = ProxyManager(proxy)
|
|
|
|
| 79 |
Based on the following preprocessed webpage content and the user's request, extract the relevant information.
|
| 80 |
Always present the data as a JSON array of objects, regardless of the user's requested format.
|
| 81 |
Each object in the array should represent one item or row of data.
|
| 82 |
+
Use the following format without any commentary text, provide only the format and nothing else:
|
| 83 |
|
| 84 |
[
|
| 85 |
{{
|
|
|
|
| 109 |
|
| 110 |
async def process_query(self, user_input: str) -> str:
|
| 111 |
if user_input.lower().startswith("http"):
|
| 112 |
+
parts = user_input.split(maxsplit=2)
|
| 113 |
+
url = parts[0]
|
| 114 |
+
pages = parts[1] if len(parts) > 1 else None
|
| 115 |
+
url_pattern = parts[2] if len(parts) > 2 else None
|
| 116 |
+
response = await self._fetch_url(url, pages, url_pattern)
|
| 117 |
elif not self.current_content:
|
| 118 |
response = "Please provide a URL first before asking for information."
|
| 119 |
else:
|
|
|
|
| 123 |
self.conversation_history.append(f"AI: {response}")
|
| 124 |
return response
|
| 125 |
|
| 126 |
+
async def _fetch_url(self, url: str, pages: Optional[str] = None, url_pattern: Optional[str] = None) -> str:
|
| 127 |
self.current_url = url
|
| 128 |
proxy = await self.proxy_manager.get_proxy()
|
| 129 |
+
|
| 130 |
+
contents = await self.playwright_scraper.fetch_content(url, proxy, pages, url_pattern)
|
| 131 |
+
self.current_content = "\n".join(contents)
|
| 132 |
self.preprocessed_content = self._preprocess_content(self.current_content)
|
| 133 |
|
| 134 |
new_hash = self._hash_content(self.preprocessed_content)
|
|
|
|
| 136 |
self.content_hash = new_hash
|
| 137 |
self.query_cache.clear()
|
| 138 |
|
| 139 |
+
return f"I've fetched and preprocessed the content from {self.current_url}" + \
|
| 140 |
+
(f" (pages: {pages})" if pages else "") + \
|
| 141 |
+
". What would you like to know about it?"
|
| 142 |
|
| 143 |
def _preprocess_content(self, content: str) -> str:
|
| 144 |
soup = BeautifulSoup(content, 'html.parser')
|
|
|
|
| 195 |
self.query_cache[cache_key] = formatted_result
|
| 196 |
return formatted_result
|
| 197 |
|
| 198 |
+
def _format_result(self, extracted_data: str, query: str) -> Union[str, Tuple[str, pd.DataFrame], BytesIO]:
    """Render *extracted_data* in the output format the user's query asks for.

    The query keywords 'json', 'csv', 'excel', 'sql', 'html' are checked in
    that order; with no keyword, a list of dicts defaults to CSV and
    anything else to JSON. Input that is not valid JSON falls back to
    plain-text formatting.
    """
    try:
        parsed = json.loads(extracted_data)
        normalized = json.dumps(parsed)
        wanted = query.lower()

        if 'json' in wanted:
            return self._format_as_json(normalized)
        if 'csv' in wanted:
            csv_string, df = self._format_as_csv(normalized)
            return f"```csv\n{csv_string}\n```", df
        if 'excel' in wanted:
            return self._format_as_excel(normalized)
        if 'sql' in wanted:
            return self._format_as_sql(normalized)
        if 'html' in wanted:
            return self._format_as_html(normalized)

        # No explicit format requested: tabular data -> CSV, otherwise JSON.
        if isinstance(parsed, list) and all(isinstance(row, dict) for row in parsed):
            csv_string, df = self._format_as_csv(normalized)
            return f"```csv\n{csv_string}\n```", df
        return self._format_as_json(normalized)

    except json.JSONDecodeError:
        return self._format_as_text(extracted_data)
|
| 222 |
|
| 223 |
def optimized_text_splitter(self, text: str) -> List[str]:
|