|
|
import asyncio |
|
|
import json |
|
|
import os |
|
|
import base64 |
|
|
import nest_asyncio |
|
|
from io import BytesIO |
|
|
import pandas as pd |
|
|
from playwright.async_api import async_playwright |
|
|
from openai import OpenAI |
|
|
from PIL import Image |
|
|
from tabulate import tabulate |
|
|
from IPython.display import display, HTML, Markdown |
|
|
from pydantic import BaseModel |
|
|
import streamlit as st |
|
|
from helper import get_openai_api_key, visualizeCourses |
|
|
|
|
|
|
|
|
# Patch the running event loop so asyncio.run() works inside environments
# that may already own a loop (Streamlit / Jupyter).
nest_asyncio.apply()
|
|
|
|
|
|
|
|
# Load the API key via the project helper (reads from the environment)
# instead of hard-coding a secret into version control. The previously
# committed key must be treated as leaked and rotated.
client = OpenAI(api_key=get_openai_api_key())
|
|
|
|
|
class WebScraperAgent:
    """Thin async wrapper around Playwright for fetching pages and screenshots.

    The browser is started lazily on the first ``scrape_content`` call and
    released with ``close()`` (safe to call even if never initialized).
    """

    def __init__(self):
        # Lazily-initialized Playwright handles; None until init_browser().
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright and open a page in headless Chromium."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to *url* and return the fully loaded page HTML."""
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        # Give client-side rendering a moment to settle before reading HTML.
        await self.page.wait_for_timeout(2000)
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page PNG screenshot to *path* and return *path*.

        Raises RuntimeError (instead of an opaque AttributeError) when no
        page has been opened yet.
        """
        if not self.page:
            raise RuntimeError("Browser not initialized; call scrape_content() first")
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return the visible viewport as raw PNG bytes."""
        if not self.page:
            raise RuntimeError("Browser not initialized; call scrape_content() first")
        return await self.page.screenshot(type="png", full_page=False)

    async def close(self):
        """Release browser resources; a safe no-op if never initialized.

        Guarding against None prevents an AttributeError when cleanup runs
        in a ``finally`` block after an early failure (e.g. launch error).
        """
        if self.browser:
            await self.browser.close()
            self.browser = None
        if self.playwright:
            await self.playwright.stop()
            self.playwright = None
        self.page = None
|
|
|
|
|
|
|
|
|
|
|
class DeeplearningCourse(BaseModel):
    # Schema for one course scraped from deeplearning.ai; used to validate
    # the JSON object the LLM returns for each course.

    title: str

    description: str

    presenter: list[str]  # one or more instructor names

    imageUrl: str  # course thumbnail URL

    courseURL: str  # link to the course page (presumably relative; joined with base_url downstream — TODO confirm)
|
|
|
|
|
class DeeplearningCourseList(BaseModel):
    # Top-level container matching the JSON the LLM is asked to emit:
    # {"courses": [ {course fields...}, ... ]}

    courses: list[DeeplearningCourse]
|
|
|
|
|
|
|
|
async def process_with_llm(html, instructions):
    """Extract structured course data from *html* with the LLM.

    Parameters
    ----------
    html : str
        Raw page HTML (truncated to 150k chars to stay within context).
    instructions : str
        Natural-language extraction instructions forwarded to the model.

    Returns
    -------
    DeeplearningCourseList
        The validated course list.

    Raises
    ------
    ValueError
        When the model output cannot be parsed as JSON or fails validation.
    """
    # BUG FIX: `client` is the synchronous OpenAI client, so its
    # `.create(...)` call is not awaitable — the original `await` raised
    # TypeError at runtime. Run the blocking call in a worker thread so the
    # event loop stays responsive.
    response = await asyncio.to_thread(
        client.chat.completions.create,
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""
You are an expert web scraping agent. Your task is to:
Extract relevant information from this HTML to JSON
following these instructions:
{instructions}

Extract the title, description, presenter,
the image URL and course URL for each course from deeplearning.ai

Return ONLY valid JSON.
""",
            },
            {
                "role": "user",
                "content": html[:150000],
            },
        ],
        temperature=0.1,
    )

    content = response.choices[0].message.content
    try:
        # Models frequently wrap JSON in markdown fences despite the prompt;
        # strip a leading ```/```json line and a trailing ``` before parsing.
        cleaned = content.strip()
        if cleaned.startswith("```"):
            cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
            cleaned = cleaned.rstrip()
            if cleaned.endswith("```"):
                cleaned = cleaned[: -3]
        json_obj = json.loads(cleaned)
        return DeeplearningCourseList(**json_obj)
    except Exception as e:
        # Chain the cause so the original parse error stays visible.
        raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}") from e
|
|
|
|
|
|
|
|
async def webscraper(target_url, instructions):
    """Scrape *target_url*, screenshot it, and extract courses via the LLM.

    Returns ``(result, screenshot_bytes)`` on success, or ``(None, None)``
    on failure (the error is surfaced in the Streamlit UI).
    """
    scraper = WebScraperAgent()
    try:
        st.info("Extracting HTML Content...")
        html_content = await scraper.scrape_content(target_url)

        st.info("Taking Screenshot...")
        screenshot = await scraper.screenshot_buffer()

        st.info("Processing with LLM...")
        result = await process_with_llm(html_content, instructions)
        return result, screenshot
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None, None
    finally:
        # BUG FIX: cleanup is best-effort. If the browser never launched,
        # close() could raise from the finally block, masking the original
        # error (and the (None, None) return) with a secondary exception.
        try:
            await scraper.close()
        except Exception:
            pass
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Streamlit entry point: scrape the deeplearning.ai catalog on demand."""
    st.title("AI Web Browser Agent (Hugging Face + Streamlit)")
    target_url = "https://www.deeplearning.ai/courses"
    instructions = "Get all the courses."

    # Guard clause: nothing happens until the user clicks the button.
    if not st.button("Start Scraping"):
        return

    result, screenshot = asyncio.run(webscraper(target_url, instructions))

    if not result:
        st.error("Failed to extract course data.")
        return

    st.success("Successfully extracted course data!")
    visualizeCourses(
        result=result,
        screenshot=screenshot,
        target_url=target_url,
        instructions=instructions,
        base_url="https://deeplearning.ai",
    )
|
|
|
|
|
# Standard script entry guard: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()
|
|
|