mibrahimzia committed on
Commit
0f94f61
·
verified ·
1 Parent(s): 8d00c99

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +88 -8
scraper.py CHANGED
@@ -1,12 +1,92 @@
 
1
  import requests
2
  from bs4 import BeautifulSoup
 
 
 
3
 
4
def scrape_website(url: str):
    """Fetch *url* and return its title plus concatenated paragraph text."""
    resp = requests.get(
        url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10
    )
    soup = BeautifulSoup(resp.text, "html.parser")

    page_title = soup.title.string if soup.title else "No title"
    body = " ".join(p.get_text() for p in soup.find_all("p"))
    return {"title": page_title.strip(), "content": body.strip()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scraper.py
2
  import requests
3
  from bs4 import BeautifulSoup
4
+ import time
5
+ from typing import Dict, List
6
+ from urllib.parse import urljoin
7
 
8
# Optional Playwright (sync API) import -- the rest of the module works
# without it, falling back to requests-only scraping.
PLAYWRIGHT_AVAILABLE = False
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except Exception:
    # Any import failure (missing package, broken install) just disables
    # the rendered-scrape path; it is not an error for this module.
    pass

# Identify the bot explicitly; some sites reject requests with no User-Agent.
HEADERS = {
    "User-Agent": "ai-scraper-bot/1.0 (+https://example.com)"
}
18
+
19
def _extract_basic(url: str, html: str) -> Dict:
    """Parse *html* into a summary dict: url, title, headings, links, text.

    Result sizes are capped (30 headings, 200 paragraphs, 200 links) so the
    payload stays bounded on very large pages. Relative link hrefs are
    resolved against *url*.
    """
    soup = BeautifulSoup(html, "html.parser")

    title = ""
    if soup.title and soup.title.string:
        title = soup.title.string.strip()

    headings: List[str] = [
        node.get_text(strip=True) for node in soup.find_all(["h1", "h2", "h3"])
    ][:30]
    paragraphs: List[str] = [
        node.get_text(strip=True) for node in soup.find_all("p")
    ][:200]

    links: List[Dict] = [
        {
            "text": anchor.get_text(strip=True),
            "href": urljoin(url, anchor["href"]),
        }
        for anchor in soup.find_all("a", href=True)[:200]
    ]

    # Prefer paragraph text; fall back to the whole-document text when the
    # page has no <p> elements at all.
    body_text = "\n\n".join(paragraphs)
    if not body_text:
        body_text = soup.get_text(separator="\n", strip=True)

    return {
        "url": url,
        "title": title,
        "headings": headings,
        "links": links,
        "text": body_text.strip(),
    }
40
+
41
def scrape_with_requests(url: str, timeout: int = 12) -> Dict:
    """Fetch *url* with plain requests and return the standard summary dict.

    Raises requests.HTTPError (via raise_for_status) on non-2xx responses,
    and the usual requests exceptions on network/timeout failures.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return _extract_basic(url, response.text)
46
+
47
def scrape_with_playwright(url: str, timeout: int = 25) -> Dict:
    """Render *url* in headless Chromium and return the standard summary dict.

    Args:
        url: Page to load.
        timeout: Navigation timeout in seconds (Playwright's goto expects ms).

    Raises:
        RuntimeError: If Playwright is not importable in this environment.
    """
    if not PLAYWRIGHT_AVAILABLE:
        raise RuntimeError("Playwright not available in this environment.")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(extra_http_headers=HEADERS)
            page.goto(url, timeout=timeout * 1000)
            # Give client-side scripts a moment to populate the DOM.
            time.sleep(1.2)
            html = page.content()
        finally:
            # Always release the browser, even when navigation or content
            # capture raises -- the original leaked it on errors.
            browser.close()
    return _extract_basic(url, html)
60
+
61
def scrape(url: str, force_render: bool = False) -> Dict:
    """
    High-level scrape entry point.

    Strategy:
      1. Try the cheap requests + BeautifulSoup path first.
      2. If that yields little text (< 500 chars) or force_render=True,
         render with Playwright and keep whichever result has more text.
      3. If the requests path fails outright, fall back to Playwright only.

    Raises:
        RuntimeError: When both the requests and Playwright paths fail.
    """
    try:
        data = scrape_with_requests(url)
    except Exception as exc:
        # Static fetch failed (network, HTTP status, parse error): the
        # rendered path is the only remaining option.
        if PLAYWRIGHT_AVAILABLE:
            try:
                return scrape_with_playwright(url)
            except Exception as exc2:
                raise RuntimeError(
                    f"Both requests and playwright scrapes failed: {exc}, {exc2}"
                ) from exc2
        raise

    # The static fetch succeeded; decide whether a rendered pass is worth it.
    text_len = len(data.get("text", "") or "")
    if (force_render or text_len < 500) and PLAYWRIGHT_AVAILABLE:
        # NOTE: the original routed a Playwright failure here back into the
        # generic except-branch, which retried Playwright a second time.
        # Rendering is best-effort at this point -- keep the static result
        # if it fails.
        try:
            rendered = scrape_with_playwright(url)
        except Exception:
            return data
        if len(rendered.get("text", "")) > text_len:
            return rendered
    return data
88
+
89
if __name__ == "__main__":
    # Manual smoke test: run `python scraper.py` to verify basic scraping.
    print("Quick test (requests).")
    result = scrape("https://example.com")
    print(result["title"])