# web-agent / app.py
# Afeefa123's picture
# Update app.py
# 74a2967 verified
import asyncio
import json
import os
import base64
import nest_asyncio
from io import BytesIO
import pandas as pd
from playwright.async_api import async_playwright
from openai import OpenAI
from PIL import Image
from tabulate import tabulate
from IPython.display import display, HTML, Markdown
from pydantic import BaseModel
import streamlit as st
from helper import get_openai_api_key, visualizeCourses
# Apply nested asyncio support for Jupyter / Streamlit environments
nest_asyncio.apply()

# Init OpenAI client securely.
# SECURITY FIX: never hard-code an API key in source — the previous inline
# "sk-proj-..." key is leaked by this file and must be revoked. The helper
# (already imported above) resolves the key from the environment instead.
client = OpenAI(api_key=get_openai_api_key())
class WebScraperAgent:
    """Async Playwright wrapper: fetch page HTML and capture screenshots.

    Resources are created lazily by ``init_browser()`` and released by
    ``close()``; ``close()`` is safe to call even if startup never happened.
    """

    def __init__(self):
        # Lazily initialized by init_browser(); all None until first use.
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright and open a page in headless Chromium."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to ``url`` and return the page HTML after load."""
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        await self.page.wait_for_timeout(2000)  # Wait for dynamic content
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page PNG screenshot to ``path`` and return the path."""
        # BUG FIX: guard against being called before any page exists,
        # mirroring scrape_content(); the original dereferenced None.
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return PNG bytes of the current viewport (not the full page)."""
        # BUG FIX: same None-page guard as take_screenshot().
        if not self.page or self.page.is_closed():
            await self.init_browser()
        return await self.page.screenshot(type="png", full_page=False)

    async def close(self):
        """Release browser resources; safe even if init_browser() never ran."""
        # BUG FIX: the original unconditionally awaited self.browser.close(),
        # raising AttributeError when launch failed or never happened.
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()
        self.playwright = self.browser = self.page = None
# Pydantic models for structured output
class DeeplearningCourse(BaseModel):
    """Structured record for one course scraped from deeplearning.ai."""
    title: str  # course title
    description: str  # short course summary
    presenter: list[str]  # one or more presenter/instructor names
    imageUrl: str  # URL of the course thumbnail image
    courseURL: str  # link to the course page
class DeeplearningCourseList(BaseModel):
    """Top-level container the LLM's JSON output is validated against."""
    courses: list[DeeplearningCourse]  # every course found on the page
# LLM interaction
async def process_with_llm(html, instructions):
response = await client.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "system",
"content": f"""
You are an expert web scraping agent. Your task is to:
Extract relevant information from this HTML to JSON
following these instructions:
{instructions}
Extract the title, description, presenter,
the image URL and course URL for each course from deeplearning.ai
Return ONLY valid JSON.
"""
},
{
"role": "user",
"content": html[:150000]
}
],
temperature=0.1
)
content = response.choices[0].message.content
try:
json_obj = json.loads(content)
return DeeplearningCourseList(**json_obj)
except Exception as e:
raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}")
# Scraper workflow
async def webscraper(target_url, instructions):
scraper = WebScraperAgent()
try:
st.info("Extracting HTML Content...")
html_content = await scraper.scrape_content(target_url)
st.info("Taking Screenshot...")
screenshot = await scraper.screenshot_buffer()
st.info("Processing with LLM...")
result = await process_with_llm(html_content, instructions)
return result, screenshot
except Exception as e:
st.error(f"Error: {str(e)}")
return None, None
finally:
await scraper.close()
# Streamlit entrypoint
def main():
st.title("AI Web Browser Agent (Hugging Face + Streamlit)")
target_url = "https://www.deeplearning.ai/courses"
instructions = "Get all the courses."
if st.button("Start Scraping"):
result, screenshot = asyncio.run(webscraper(target_url, instructions))
if result:
st.success("Successfully extracted course data!")
visualizeCourses(result=result, screenshot=screenshot, target_url=target_url, instructions=instructions, base_url="https://deeplearning.ai")
else:
st.error("Failed to extract course data.")
if __name__ == "__main__":
main()