"""AI web-browsing agent.

Scrapes https://www.deeplearning.ai/courses with headless Chromium
(Playwright), asks an OpenAI model to extract structured course data as
JSON, validates it with Pydantic, and renders the result in Streamlit.
"""

import asyncio
import base64
import json
import os
from io import BytesIO

import nest_asyncio
import pandas as pd
import streamlit as st
from IPython.display import HTML, Markdown, display
from openai import OpenAI
from PIL import Image
from playwright.async_api import async_playwright
from pydantic import BaseModel
from tabulate import tabulate

from helper import get_openai_api_key, visualizeCourses

# Apply nested asyncio support for Jupyter / Streamlit environments, which
# already run their own event loop (lets asyncio.run() work inside them).
nest_asyncio.apply()

# SECURITY FIX: the OpenAI API key was previously hard-coded in source — a
# leaked secret that must be revoked. Load it via the project helper
# (environment-backed) instead of embedding it in the file.
client = OpenAI(api_key=get_openai_api_key())


class WebScraperAgent:
    """Thin async wrapper around a single headless Chromium page."""

    def __init__(self):
        # All handles start as None; init_browser() populates them lazily.
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright and open a fresh headless Chromium page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to *url* and return the rendered page HTML as a string.

        Lazily (re)creates the browser if the page is missing or closed.
        """
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        await self.page.wait_for_timeout(2000)  # wait for dynamic content
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page PNG screenshot to *path* and return the path."""
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport as raw bytes."""
        screenshot_bytes = await self.page.screenshot(type="png", full_page=False)
        return screenshot_bytes

    async def close(self):
        """Shut down the browser and the Playwright driver.

        BUGFIX: guard against close() being called before init_browser()
        ever succeeded (e.g. the scrape failed early) — the previous code
        unconditionally awaited on None and raised AttributeError from the
        caller's ``finally`` block, masking the original error.
        """
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()


# Pydantic models describing the structured output expected from the LLM.
class DeeplearningCourse(BaseModel):
    title: str
    description: str
    presenter: list[str]
    imageUrl: str
    courseURL: str


class DeeplearningCourseList(BaseModel):
    courses: list[DeeplearningCourse]


async def process_with_llm(html, instructions):
    """Extract course data from *html* via the LLM and validate it.

    Returns a DeeplearningCourseList.
    Raises ValueError when the model output cannot be parsed/validated.
    """
    # BUGFIX: ``client`` is the *synchronous* OpenAI client, so the call
    # must not be awaited — ``await`` on its plain return value raised
    # TypeError. The function stays a coroutine so call sites are unchanged.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""
You are an expert web scraping agent. Your task is to:
Extract relevant information from this HTML to JSON following these instructions:
{instructions}
Extract the title, description, presenter, the image URL and course URL for each course from deeplearning.ai
Return ONLY valid JSON.
""",
            },
            # Truncate to stay within the model's context window.
            {"role": "user", "content": html[:150000]},
        ],
        temperature=0.1,
    )
    content = response.choices[0].message.content.strip()
    # Models often wrap JSON in ``` fences despite instructions; strip them
    # before parsing so otherwise-valid payloads are not rejected.
    if content.startswith("```"):
        content = content.strip("`").strip()
        if content.lower().startswith("json"):
            content = content[4:].strip()
    try:
        json_obj = json.loads(content)
        return DeeplearningCourseList(**json_obj)
    except Exception as e:
        # Chain the cause so the original parse error stays visible.
        raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}") from e


async def webscraper(target_url, instructions):
    """Full scrape workflow: fetch HTML, screenshot, then LLM extraction.

    Returns (DeeplearningCourseList, png_bytes) on success, (None, None)
    on failure (the error is surfaced to the Streamlit UI).
    """
    scraper = WebScraperAgent()
    try:
        st.info("Extracting HTML Content...")
        html_content = await scraper.scrape_content(target_url)

        st.info("Taking Screenshot...")
        screenshot = await scraper.screenshot_buffer()

        st.info("Processing with LLM...")
        result = await process_with_llm(html_content, instructions)
        return result, screenshot
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None, None
    finally:
        # Safe even when the browser never launched (close() guards None).
        await scraper.close()


def main():
    """Streamlit entrypoint: one button triggers the scrape + visualization."""
    st.title("AI Web Browser Agent (Hugging Face + Streamlit)")

    target_url = "https://www.deeplearning.ai/courses"
    instructions = "Get all the courses."

    if st.button("Start Scraping"):
        result, screenshot = asyncio.run(webscraper(target_url, instructions))
        if result:
            st.success("Successfully extracted course data!")
            visualizeCourses(
                result=result,
                screenshot=screenshot,
                target_url=target_url,
                instructions=instructions,
                base_url="https://deeplearning.ai",
            )
        else:
            st.error("Failed to extract course data.")


if __name__ == "__main__":
    main()