Almaatla commited on
Commit
5d89b25
·
verified ·
1 Parent(s): 0e565e7

Upload 4 files

Browse files
Files changed (3) hide show
  1. Dockerfile +2 -2
  2. main.py +44 -34
  3. requirements.txt +2 -1
Dockerfile CHANGED
@@ -1,5 +1,5 @@
1
- # Use an official Python runtime as a parent image
2
- FROM python:3.9-slim
3
 
4
  # Set the working directory in the container
5
  WORKDIR /app
 
1
+ # Use the official Playwright image which includes Python and browsers
2
+ FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
3
 
4
  # Set the working directory in the container
5
  WORKDIR /app
main.py CHANGED
@@ -1,19 +1,28 @@
1
- from fastapi import FastAPI, HTTPException, Query
 
 
 
2
  from pydantic import BaseModel, HttpUrl
3
- import requests
4
  from bs4 import BeautifulSoup, Comment
5
  import re
 
6
 
7
- app = FastAPI(title="Web Scraping Service")
8
 
9
  class ScrapeRequest(BaseModel):
10
  url: HttpUrl
11
 
12
  @app.get("/")
13
  def read_root():
14
- return {"message": "Welcome to the Web Scraping Service! Send a POST request to /scrape with a JSON body {'url': '...'} or use GET /scrape?url=..."}
 
 
 
 
 
 
15
 
16
- def clean_html(soup: BeautifulSoup):
17
  # Remove script, style, iframe, and other non-content tags
18
  for tag in soup(["script", "style", "iframe", "noscript", "meta", "link", "svg", "button", "input", "form"]):
19
  tag.decompose()
@@ -23,51 +32,54 @@ def clean_html(soup: BeautifulSoup):
23
  comment.extract()
24
 
25
  # Remove common ad and clutter classes/ids
26
- # This list is not exhaustive but catches many common patterns
27
  ad_patterns = re.compile(
28
  r"(ad|ads|advert|advertisement|banner|social|share|nav|footer|header|menu|sidebar|cookie|popup|modal|newsletter)",
29
  re.IGNORECASE
30
  )
31
 
32
- # Remove elements by class or id matching ad patterns
33
  for tag in soup.find_all(attrs={"class": ad_patterns}):
34
  tag.decompose()
35
  for tag in soup.find_all(attrs={"id": ad_patterns}):
36
  tag.decompose()
37
 
38
- return soup
39
-
40
- @app.post("/scrape")
41
- def scrape_url(request: ScrapeRequest):
42
- return process_scrape(str(request.url))
43
 
44
- @app.get("/scrape")
45
- def scrape_url_get(url: str):
46
- return process_scrape(url)
47
 
48
- def process_scrape(url: str):
49
- headers = {
50
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
51
- }
 
 
 
52
 
53
- try:
54
- response = requests.get(url, headers=headers, timeout=10)
55
- response.raise_for_status()
56
 
57
- soup = BeautifulSoup(response.content, "lxml")
 
58
 
59
- # Extract title before cleaning
60
- title = soup.title.string.strip() if soup.title else "No title found"
 
61
 
62
- # Clean the HTML
63
- cleaned_soup = clean_html(soup)
 
64
 
65
- # Extract text
66
- # get_text with separator handles block elements better
67
- text = cleaned_soup.get_text(separator="\n", strip=True)
68
 
69
- # Simple cleanup of excessive newlines
70
- text = re.sub(r'\n{3,}', '\n\n', text)
 
 
71
 
72
  return {
73
  "url": url,
@@ -76,8 +88,6 @@ def process_scrape(url: str):
76
  "status": "success"
77
  }
78
 
79
- except requests.RequestException as e:
80
- raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {str(e)}")
81
  except Exception as e:
82
  raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
83
 
 
1
+ import nest_asyncio
2
+ nest_asyncio.apply()
3
+
4
+ from fastapi import FastAPI, HTTPException
5
  from pydantic import BaseModel, HttpUrl
6
+ from playwright.async_api import async_playwright
7
  from bs4 import BeautifulSoup, Comment
8
  import re
9
+ import asyncio
10
 
11
+ app = FastAPI(title="Web Scraper API")
12
 
13
class ScrapeRequest(BaseModel):
    """Request body for POST /scrape."""

    # Validated absolute URL of the page to fetch and clean.
    url: HttpUrl
15
 
16
@app.get("/")
def read_root():
    """Landing endpoint: describes how to call the scraping API."""
    welcome = {
        "message": "Welcome to the Playwright Web Scraping Service! Send a POST request to /scrape with a JSON body {'url': '...'} or use GET /scrape?url=..."
    }
    return welcome
19
+
20
def clean_html(html_content: str):
    """Parse rendered HTML and return ``(title, cleaned_text)``.

    Strips non-content tags, HTML comments, and elements whose class/id
    matches common ad/clutter patterns, then extracts the visible text.

    Args:
        html_content: Raw HTML markup (e.g. from ``page.content()``).

    Returns:
        Tuple ``(title, text)`` — the page title (or ``"No title found"``)
        and the cleaned plain text.
    """
    soup = BeautifulSoup(html_content, "lxml")

    # Extract the title before cleaning. Guard against a <title> whose
    # .string is None (empty tag or nested markup): the previous
    # `soup.title.string.strip()` raised AttributeError in that case.
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        title = "No title found"

    # Remove script, style, iframe, and other non-content tags
    for tag in soup(["script", "style", "iframe", "noscript", "meta", "link", "svg", "button", "input", "form"]):
        tag.decompose()

    # Remove HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Remove common ad and clutter classes/ids
    ad_patterns = re.compile(
        r"(ad|ads|advert|advertisement|banner|social|share|nav|footer|header|menu|sidebar|cookie|popup|modal|newsletter)",
        re.IGNORECASE
    )
    for tag in soup.find_all(attrs={"class": ad_patterns}):
        tag.decompose()
    for tag in soup.find_all(attrs={"id": ad_patterns}):
        tag.decompose()

    # Extract text; separator="\n" keeps block elements on separate lines
    text = soup.get_text(separator="\n", strip=True)
    # Simple cleanup of excessive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)

    return title, text
 
 
51
 
52
async def scrape_with_playwright(url: str):
    """Fetch *url* in headless Chromium and return the fully rendered HTML.

    A desktop Chrome user agent is set on the browser context, and
    navigation waits for the network to go idle (30 s timeout) so
    JavaScript-rendered content is present in the returned markup.

    Fix: the previous version launched the browser before its try/finally,
    so a failure in new_context()/new_page() leaked the browser process;
    the try now starts immediately after launch.
    """
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
            page = await context.new_page()

            # Go to URL and wait for network to be idle (load complete)
            await page.goto(url, wait_until="networkidle", timeout=30000)

            return await page.content()
        finally:
            # Closing the browser also tears down its contexts and pages.
            await browser.close()
70
 
71
@app.post("/scrape")
async def scrape_url(request: ScrapeRequest):
    """POST endpoint: scrape the URL supplied in the JSON request body."""
    target = str(request.url)
    return await process_scrape(target)
74
 
75
@app.get("/scrape")
async def scrape_url_get(url: str):
    """GET endpoint: scrape the URL supplied as a query parameter."""
    result = await process_scrape(url)
    return result
78
 
79
async def process_scrape(url: str):
    """Render *url* with Playwright, clean the HTML, and return the result.

    Returns:
        Dict with the requested url, the page title, the extracted text,
        and a ``"status": "success"`` marker.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        html_content = await scrape_with_playwright(url)
        title, text = clean_html(html_content)

        # NOTE(review): the "title"/"content" key names were elided in the
        # diff view and are inferred from clean_html's return values —
        # confirm against API clients.
        return {
            "url": url,
            "title": title,
            "content": text,
            "status": "success"
        }
    except Exception as e:
        # Chain the original exception so the traceback keeps its cause.
        raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}") from e
93
 
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  fastapi
2
  uvicorn
3
- requests
 
4
  beautifulsoup4
5
  lxml
 
1
  fastapi
2
  uvicorn
3
+ playwright
4
+ nest_asyncio
5
  beautifulsoup4
6
  lxml