import requests
from bs4 import BeautifulSoup
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse, urljoin
import threading
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


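# WebScraper bundles the "polite scraping" concerns in one place:
# robots.txt checks, per-domain rate limiting, request timeouts, and
# retries with exponential backoff. One instance can be shared across
# worker threads (see the lock in __init__).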
class WebScraper:
    def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
                 rate_limit=1, timeout=10, max_retries=3):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.max_retries = max_retries
        self.last_request_time = {}
        # Guards last_request_time, which is shared across worker threads.
        self.lock = threading.Lock()

    def can_fetch(self, url):
        parsed_url = urlparse(url)
        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
        # Use a fresh parser per call: a single shared RobotFileParser is
        # not safe to mutate from multiple worker threads.
        robot_parser = RobotFileParser()
        robot_parser.set_url(robots_url)
        try:
            robot_parser.read()
            return robot_parser.can_fetch(self.session.headers["User-Agent"], url)
        except Exception as e:
            logger.warning(f"Error reading robots.txt for {url}: {e}")
            # Fail open: if robots.txt is unreadable, allow the fetch.
            return True

    def respect_rate_limit(self, url):
        domain = urlparse(url).netloc
        # Reserve the next request slot for this domain under the lock,
        # then sleep outside it so waiting on one domain does not block
        # requests to other domains.
        with self.lock:
            now = time.time()
            next_allowed = self.last_request_time.get(domain, 0) + self.rate_limit
            scheduled = max(now, next_allowed)
            self.last_request_time[domain] = scheduled
        if scheduled > now:
            time.sleep(scheduled - now)

    def scrape_page(self, url):
        if not self.can_fetch(url):
            logger.info(f"Robots.txt disallows scraping: {url}")
            return None

        for attempt in range(self.max_retries):
            try:
                self.respect_rate_limit(url)
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return self.extract_content(response.text, url)
            except requests.RequestException as e:
                logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt == self.max_retries - 1:
                    logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
                    return None
                # Exponential backoff: wait 1s, 2s, 4s, ... between retries.
                time.sleep(2 ** attempt)

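    # Content extraction is heuristic: strip boilerplate tags, prefer a
    # semantic <main>/<article> container when present, and fall back to
    # all <p> tags (or the raw page text) otherwise.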
    def extract_content(self, html, url):
        soup = BeautifulSoup(html, 'html.parser')

        # Drop elements that rarely carry article text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Extract the page title (soup.title.string can be None, so use
        # get_text instead).
        title = soup.title.get_text(strip=True) if soup.title else ""

        # Prefer a dedicated main-content container if one exists.
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')

        if main_content:
            paragraphs = main_content.find_all('p')
        else:
            paragraphs = soup.find_all('p')

        # Join paragraph text into a single string.
        text = ' '.join([p.get_text().strip() for p in paragraphs])

        # Fall back to the whole page text if no paragraphs were found.
        if not text:
            text = soup.get_text()

        # Collapse runs of whitespace.
        text = re.sub(r'\s+', ' ', text).strip()

        # Resolve relative links against the page URL.
        links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]

        return {
            "url": url,
            "title": title,
            "content": text[:2400],
            "links": links[:10]
        }


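# scrape_multiple_pages fans the URLs out over a thread pool. A single
# WebScraper instance is shared by all workers; its lock keeps the
# per-domain rate limiting correct under concurrency.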
def scrape_multiple_pages(urls, max_workers=5):
    scraper = WebScraper()
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(scraper.scrape_page, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                if data:
                    results[url] = data
                    logger.info(f"Successfully scraped: {url}")
                else:
                    logger.warning(f"Failed to scrape: {url}")
            except Exception as exc:
                logger.error(f"{url} generated an exception: {exc}")

    return results


# Convenience wrapper: return only the extracted text for each URL.
def get_web_content(urls):
    scraped_data = scrape_multiple_pages(urls)
    return {url: data['content'] for url, data in scraped_data.items() if data}


# Standalone robots.txt check for callers that do not need a WebScraper;
# note it matches against the wildcard "*" agent rather than the custom one.
def can_fetch(url):
    parsed_url = urlparse(url)
    robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
    rp = RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()
        return rp.can_fetch("*", url)
    except Exception as e:
        logger.warning(f"Error reading robots.txt for {url}: {e}")
        return True


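# Quick manual check when the module is run directly (requires network
# access).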
if __name__ == "__main__":
    test_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://example.com",
        "https://www.python.org"
    ]
    scraped_content = get_web_content(test_urls)
    for url, content in scraped_content.items():
        print(f"Content from {url}:")
        print(content[:500])
        print("\n---\n")
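
# The class can also be used directly instead of the module-level
# helpers; a hypothetical sketch:
#   scraper = WebScraper(rate_limit=2)
#   page = scraper.scrape_page("https://example.com")
#   if page:
#       print(page["title"], len(page["links"]))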