Spaces:

Afeefa123
/

web-agent

Sleeping

App Files Community

Afeefa123 committed on Apr 18, 2025

Commit

115c46a

verified ·

1 Parent(s): 1bfd074

Update app.py

Browse files

Files changed (1) hide show

app.py +135 -0

app.py CHANGED Viewed

	@@ -0,0 +1,135 @@

+import asyncio
+import json
+import os
+import base64
+import nest_asyncio
+from io import BytesIO
+import pandas as pd
+from playwright.async_api import async_playwright
+from openai import OpenAI
+from PIL import Image
+from tabulate import tabulate
+from IPython.display import display, HTML, Markdown
+from pydantic import BaseModel
+import streamlit as st
+from helper import get_openai_api_key, visualizeCourses
# Apply nested asyncio support for Jupyter / Streamlit environments.
# nest_asyncio patches asyncio so an event loop can be re-entered;
# asyncio.run() is called later in this file.
nest_asyncio.apply()
# Module-wide OpenAI client. The API key is supplied by
# helper.get_openai_api_key() (defined outside this file) rather than being
# hard-coded here. `OpenAI` is the library's synchronous client class.
client = OpenAI(api_key=get_openai_api_key())
class WebScraperAgent:
    """Small wrapper around one headless Chromium page driven by Playwright.

    The browser is started lazily by :meth:`scrape_content` (or explicitly via
    :meth:`init_browser`) and must be released with :meth:`close`, which is
    safe to call at any time, including when nothing was ever started.
    """

    def __init__(self):
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright, launch headless Chromium and open a new page."""
        # Release handles from an earlier session first; overwriting them
        # would leave the previous browser process running.
        await self.close()
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    def _require_page(self):
        """Return the open page or raise a descriptive error if there is none."""
        if self.page is None or self.page.is_closed():
            raise RuntimeError(
                "No page is open; call scrape_content() before taking a screenshot."
            )
        return self.page

    async def scrape_content(self, url):
        """Navigate to *url* and return the page's HTML as a string.

        A browser is started on demand when no usable page exists yet.
        """
        if self.page is None or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        await self.page.wait_for_timeout(2000)  # Wait for dynamic content
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Write a full-page screenshot to *path* and return *path*.

        Raises:
            RuntimeError: If no page is currently open.
        """
        page = self._require_page()
        await page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport as bytes.

        Raises:
            RuntimeError: If no page is currently open.
        """
        page = self._require_page()
        return await page.screenshot(type="png", full_page=False)

    async def close(self):
        """Close the browser and stop Playwright, tolerating partial start-up.

        Handles are cleared before shutting down so that a second call is a
        no-op, and Playwright is stopped even if closing the browser fails.
        """
        browser, playwright = self.browser, self.playwright
        self.page = self.browser = self.playwright = None
        try:
            if browser is not None:
                await browser.close()
        finally:
            if playwright is not None:
                await playwright.stop()
# Pydantic models for structured output
class DeeplearningCourse(BaseModel):
    """One course entry parsed from the LLM's JSON reply.

    The fields correspond to what the system prompt in process_with_llm asks
    the model to extract for each course: title, description, presenter,
    image URL and course URL.
    """

    title: str
    description: str
    # Presumably one entry per presenter name; the prompt only says "presenter".
    presenter: list[str]
    # NOTE(review): URLs may be site-relative (main() passes a base_url to
    # visualizeCourses) - confirm against real model output.
    imageUrl: str
    courseURL: str
class DeeplearningCourseList(BaseModel):
    """Top-level container for the extracted courses.

    Expected reply shape: a JSON object whose ``courses`` key holds an array
    of objects matching DeeplearningCourse.
    """

    courses: list[DeeplearningCourse]
# LLM interaction
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if present.

    Chat models often wrap JSON replies in a fenced block even when told to
    return only JSON, and ``json.loads`` rejects that wrapper.
    """
    cleaned = text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    # Drop the opening fence line, which may carry a language tag ("json").
    _, _, cleaned = cleaned.partition("\n")
    cleaned = cleaned.rstrip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


async def process_with_llm(html, instructions):
    """Ask the chat model to turn scraped HTML into a validated course list.

    Args:
        html: Page markup; only the first 150000 characters are sent.
        instructions: Free-text extraction instructions interpolated into the
            system prompt.

    Returns:
        A DeeplearningCourseList built from the model's JSON reply. A bare
        JSON array is accepted and treated as the ``courses`` list.

    Raises:
        ValueError: If the reply cannot be parsed as JSON or does not validate
            against DeeplearningCourseList; the raw reply is in the message.
    """
    messages = [
        {
            "role": "system",
            "content": f"""
                You are an expert web scraping agent. Your task is to:
                Extract relevant information from this HTML to JSON
                following these instructions:
                {instructions}
                Extract the title, description, presenter,
                the image URL and course URL for each course from deeplearning.ai
                Return ONLY valid JSON.
                """
        },
        {
            "role": "user",
            "content": html[:150000]
        },
    ]
    # `client` is the synchronous OpenAI client: create() returns a plain
    # response object, not an awaitable, so awaiting it directly raises
    # TypeError. Run the blocking call in a worker thread instead, which also
    # keeps the event loop free while the request is in flight.
    response = await asyncio.to_thread(
        client.chat.completions.create,
        model="gpt-4o",
        messages=messages,
        temperature=0.1,
    )
    content = response.choices[0].message.content
    try:
        payload = json.loads(_strip_code_fence(content or ""))
        if isinstance(payload, list):
            # Accept a bare array of courses as well as {"courses": [...]}.
            payload = {"courses": payload}
        return DeeplearningCourseList(**payload)
    except Exception as e:
        # Every parse/validation failure surfaces as ValueError (the contract
        # callers rely on); chain the cause so the traceback stays useful.
        raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}") from e
# Scraper workflow
async def webscraper(target_url, instructions):
    """Scrape *target_url*, capture a screenshot and extract courses via the LLM.

    Progress and failures are reported through Streamlit status widgets.

    Args:
        target_url: Address of the page to load.
        instructions: Extraction instructions forwarded to process_with_llm.

    Returns:
        ``(result, screenshot)`` where *result* is process_with_llm's return
        value and *screenshot* is the PNG bytes from screenshot_buffer(), or
        ``(None, None)`` if any step raised.
    """
    scraper = WebScraperAgent()
    try:
        st.info("Extracting HTML Content...")
        html_content = await scraper.scrape_content(target_url)
        st.info("Taking Screenshot...")
        screenshot = await scraper.screenshot_buffer()
        st.info("Processing with LLM...")
        result = await process_with_llm(html_content, instructions)
        return result, screenshot
    except Exception as e:
        st.error(f"Error: {e}")
        return None, None
    finally:
        # An exception escaping `finally` would discard the value being
        # returned above and propagate to the caller, so a cleanup failure
        # (e.g. the browser never started) is reported instead of raised.
        try:
            await scraper.close()
        except Exception as close_error:
            st.warning(f"Browser cleanup failed: {close_error}")
# Streamlit entrypoint
def main():
    """Render the page and run the scrape when the button is pressed.

    The target URL and the instructions are fixed; the extracted courses are
    handed to visualizeCourses from the project's helper module.
    """
    st.title("AI Web Browser Agent (Hugging Face + Streamlit)")
    target_url = "https://www.deeplearning.ai/courses"
    instructions = "Get all the courses."

    # Nothing to do until the user asks for a scrape.
    if not st.button("Start Scraping"):
        return

    result, screenshot = asyncio.run(webscraper(target_url, instructions))
    if not result:
        st.error("Failed to extract course data.")
        return

    st.success("Successfully extracted course data!")
    visualizeCourses(
        result=result,
        screenshot=screenshot,
        target_url=target_url,
        instructions=instructions,
        base_url="https://deeplearning.ai",
    )


if __name__ == "__main__":
    main()