File size: 4,376 Bytes
115c46a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74a2967
115c46a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import asyncio
import json
import os
import base64
import nest_asyncio
from io import BytesIO
import pandas as pd
from playwright.async_api import async_playwright
from openai import OpenAI
from PIL import Image
from tabulate import tabulate
from IPython.display import display, HTML, Markdown
from pydantic import BaseModel
import streamlit as st
from helper import get_openai_api_key, visualizeCourses

# Apply nested asyncio support for Jupyter / Streamlit environments
nest_asyncio.apply()

# Init OpenAI client securely.
# SECURITY FIX: an OpenAI API key was hard-coded here in plain text. Any key
# that has been committed like this must be treated as compromised and revoked.
# Load the key through the project helper (already imported above) instead.
client = OpenAI(api_key=get_openai_api_key())

class WebScraperAgent:
    """Thin async wrapper around a headless Playwright Chromium browser.

    Lazily starts the browser on first use and exposes helpers to fetch
    page HTML and capture screenshots. Call ``close()`` when done; it is
    safe to call even if the browser was never started.
    """

    def __init__(self):
        # All resources are created lazily in init_browser().
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright, launch headless Chromium, and open one page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    async def _ensure_page(self):
        """(Re)initialize the browser if no usable page exists yet."""
        if not self.page or self.page.is_closed():
            await self.init_browser()

    async def scrape_content(self, url):
        """Navigate to *url* and return the page's full HTML content."""
        await self._ensure_page()
        await self.page.goto(url, wait_until="load")
        await self.page.wait_for_timeout(2000)  # Wait for dynamic content
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page PNG screenshot to *path* and return the path."""
        # FIX: previously assumed a live page; guard like scrape_content does.
        await self._ensure_page()
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return PNG bytes of the current viewport (not the full page)."""
        # FIX: previously assumed a live page; guard like scrape_content does.
        await self._ensure_page()
        screenshot_bytes = await self.page.screenshot(type="png", full_page=False)
        return screenshot_bytes

    async def close(self):
        """Release browser resources; a no-op if nothing was started.

        FIX: the original unconditionally called ``self.browser.close()``,
        raising AttributeError when init_browser() had never run (e.g. the
        scrape failed before the first page load).
        """
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        self.browser = None
        self.page = None
        self.playwright = None


# Pydantic models for structured output
class DeeplearningCourse(BaseModel):
    """One course record extracted by the LLM from scraped deeplearning.ai HTML."""
    # Course title as shown on the listing page.
    title: str
    # Short marketing/course description.
    description: str
    # One or more presenter/instructor names.
    presenter: list[str]
    # URL of the course's thumbnail image (may be relative — resolved by the caller).
    imageUrl: str
    # Link to the course page (may be relative — resolved by the caller).
    courseURL: str

class DeeplearningCourseList(BaseModel):
    """Top-level container the LLM's JSON output is validated against."""
    # All courses found in one scraped page.
    courses: list[DeeplearningCourse]

# LLM interaction
async def process_with_llm(html, instructions):
    """Extract structured course data from raw HTML via the LLM.

    Args:
        html: Page HTML; truncated to 150,000 chars to fit the context window.
        instructions: Free-text extraction instructions injected into the
            system prompt.

    Returns:
        DeeplearningCourseList validated from the model's JSON output.

    Raises:
        ValueError: if the model's output cannot be parsed/validated.
    """
    # BUG FIX: `client` is the *synchronous* OpenAI client, so `.create()`
    # returns a plain ChatCompletion — awaiting it raised TypeError. Call it
    # without `await` (briefly blocks the event loop, acceptable here).
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""
                You are an expert web scraping agent. Your task is to:
                Extract relevant information from this HTML to JSON 
                following these instructions:
                {instructions}
                
                Extract the title, description, presenter, 
                the image URL and course URL for each course from deeplearning.ai
                
                Return ONLY valid JSON.
                """
            },
            {
                "role": "user",
                "content": html[:150000]
            }
        ],
        temperature=0.1
    )

    content = response.choices[0].message.content
    # Models frequently wrap JSON in ```json fences despite the instructions;
    # strip a leading/trailing fence before parsing.
    cleaned = content.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.split("\n", 1)[-1]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3].rstrip()
    try:
        json_obj = json.loads(cleaned)
        return DeeplearningCourseList(**json_obj)
    except Exception as e:
        # Chain the cause so the original parse error isn't lost.
        raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}") from e

# Scraper workflow
async def webscraper(target_url, instructions):
    """Run the full scrape pipeline: fetch HTML, screenshot, LLM extraction.

    Args:
        target_url: Page to scrape.
        instructions: Extraction instructions forwarded to the LLM.

    Returns:
        (DeeplearningCourseList, screenshot PNG bytes) on success,
        or (None, None) on any failure (the error is shown via Streamlit).
    """
    scraper = WebScraperAgent()
    result, screenshot = None, None
    try:
        st.info("Extracting HTML Content...")
        html_content = await scraper.scrape_content(target_url)

        st.info("Taking Screenshot...")
        screenshot = await scraper.screenshot_buffer()

        st.info("Processing with LLM...")
        result = await process_with_llm(html_content, instructions)
    except Exception as e:
        st.error(f"Error: {str(e)}")
    finally:
        # FIX: cleanup is best-effort — if initialization failed, close()
        # could itself raise and mask the original error reported above.
        try:
            await scraper.close()
        except Exception:
            pass
    return result, screenshot


# Streamlit entrypoint
def main():
    """Streamlit entrypoint: one-button scraper for deeplearning.ai courses."""
    st.title("AI Web Browser Agent (Hugging Face + Streamlit)")
    target_url = "https://www.deeplearning.ai/courses"
    instructions = "Get all the courses."

    # Guard clause: nothing to do until the user clicks the button.
    if not st.button("Start Scraping"):
        return

    result, screenshot = asyncio.run(webscraper(target_url, instructions))
    if not result:
        st.error("Failed to extract course data.")
        return

    st.success("Successfully extracted course data!")
    visualizeCourses(
        result=result,
        screenshot=screenshot,
        target_url=target_url,
        instructions=instructions,
        base_url="https://deeplearning.ai",
    )

if __name__ == "__main__":
    main()