Upload 4 files
- Dockerfile +2 -2
- main.py +44 -34
- requirements.txt +2 -1
Dockerfile
CHANGED
@@ -1,5 +1,5 @@
-# Use …
-FROM python:…
+# Use the official Playwright image which includes Python and browsers
+FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
 
 # Set the working directory in the container
 WORKDIR /app
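The point of the new base image is that it ships the Playwright browsers preinstalled, so the image needs no separate browser-download step. A quick sanity check you could run inside the container to confirm the bundled Chromium launches; the script and the target URL are illustrative, not part of this commit:

import asyncio
from playwright.async_api import async_playwright

async def main():
    # Launch the Chromium build bundled with the Playwright base image
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")
        print(await page.title())  # expected: "Example Domain"
        await browser.close()

asyncio.run(main())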
main.py
CHANGED
@@ -1,19 +1,28 @@
-from fastapi import FastAPI, HTTPException
+import nest_asyncio
+nest_asyncio.apply()
+
+from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, HttpUrl
-import requests
+from playwright.async_api import async_playwright
 from bs4 import BeautifulSoup, Comment
 import re
+import asyncio
 
-app = FastAPI(title="Web Scraping Service")
+app = FastAPI(title="Web Scraper API")
 
 class ScrapeRequest(BaseModel):
     url: HttpUrl
 
 @app.get("/")
 def read_root():
-    return {"message": "Welcome to the Web Scraping Service! Send a POST request to /scrape with a JSON body {'url': '...'} or use GET /scrape?url=..."}
+    return {"message": "Welcome to the Playwright Web Scraping Service! Send a POST request to /scrape with a JSON body {'url': '...'} or use GET /scrape?url=..."}
 
-def clean_html(soup: BeautifulSoup):
+def clean_html(html_content: str):
+    soup = BeautifulSoup(html_content, "lxml")
+
+    # Extract title before cleaning
+    title = soup.title.string.strip() if soup.title else "No title found"
+
     # Remove script, style, iframe, and other non-content tags
     for tag in soup(["script", "style", "iframe", "noscript", "meta", "link", "svg", "button", "input", "form"]):
         tag.decompose()
@@ -23,51 +32,54 @@ def clean_html(soup: BeautifulSoup):
         comment.extract()
 
     # Remove common ad and clutter classes/ids
-    # This list is not exhaustive but catches many common patterns
    ad_patterns = re.compile(
        r"(ad|ads|advert|advertisement|banner|social|share|nav|footer|header|menu|sidebar|cookie|popup|modal|newsletter)",
        re.IGNORECASE
    )
 
-    # Remove elements by class or id matching ad patterns
    for tag in soup.find_all(attrs={"class": ad_patterns}):
        tag.decompose()
    for tag in soup.find_all(attrs={"id": ad_patterns}):
        tag.decompose()
 
-    …
+    # Extract text
+    text = soup.get_text(separator="\n", strip=True)
+    # Simple cleanup of excessive newlines
+    text = re.sub(r'\n{3,}', '\n\n', text)
+
+    return title, text
+
+async def scrape_with_playwright(url: str):
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        )
+        page = await context.new_page()
+
+        try:
+            # Go to URL and wait for network to be idle (load complete)
+            await page.goto(url, wait_until="networkidle", timeout=30000)
+
+            # Get content
+            content = await page.content()
+
+            return content
+        finally:
+            await browser.close()
 
 @app.post("/scrape")
-def scrape_url(request: ScrapeRequest):
-    return process_scrape(str(request.url))
+async def scrape_url(request: ScrapeRequest):
+    return await process_scrape(str(request.url))
 
 @app.get("/scrape")
-def scrape_url_get(url: str):
-    return process_scrape(url)
+async def scrape_url_get(url: str):
+    return await process_scrape(url)
 
-def process_scrape(url: str):
-    try:
-        …
+async def process_scrape(url: str):
+    try:
+        html_content = await scrape_with_playwright(url)
+        title, text = clean_html(html_content)
 
         return {
             "url": url,
@@ -76,8 +88,6 @@ def process_scrape(url: str):
             "status": "success"
         }
 
-    except requests.RequestException as e:
-        raise HTTPException(status_code=400, detail=f"Failed to fetch URL: {str(e)}")
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Scraping error: {str(e)}")
 
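With the service running (for example via uvicorn main:app --host 0.0.0.0 --port 7860; the port is an assumption, Spaces conventionally expose 7860), either endpoint can be exercised from a short client script. Only the "url" and "status" keys of the response body are visible in this diff; any other fields are not shown here. A minimal sketch:

import requests  # any HTTP client works; requests is used here for brevity

BASE = "http://localhost:7860"  # assumed local address; adjust to your deployment

# POST variant: JSON body, validated by the ScrapeRequest model
resp = requests.post(f"{BASE}/scrape", json={"url": "https://example.com"}, timeout=60)
resp.raise_for_status()
data = resp.json()
print(data["status"], data["url"])

# GET variant: same result via a query parameter
resp = requests.get(f"{BASE}/scrape", params={"url": "https://example.com"}, timeout=60)
print(resp.json()["status"])

Note the design choice in scrape_with_playwright: wait_until="networkidle" trades latency for completeness on JavaScript-heavy pages, and the 30-second timeout bounds pages that never go network-idle.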
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
 fastapi
 uvicorn
-requests
+playwright
+nest_asyncio
 beautifulsoup4
 lxml
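The two new dependencies serve different layers: playwright drives the browser, while nest_asyncio makes the event loop re-entrant (main.py calls nest_asyncio.apply() at import time). Also note that pip installs only the Playwright library; outside the Docker image above you would additionally need to fetch a browser with python -m playwright install chromium. A small sketch of what apply() permits, with made-up helper names for illustration:

import asyncio
import nest_asyncio

nest_asyncio.apply()

async def inner():
    return 42

async def outer():
    # Without nest_asyncio, this nested call raises
    # "RuntimeError: This event loop is already running".
    loop = asyncio.get_running_loop()
    return loop.run_until_complete(inner())

print(asyncio.run(outer()))  # prints 42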