Afeefa123 committed on
Commit
115c46a
·
verified ·
1 Parent(s): 1bfd074

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -0
app.py CHANGED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import base64
5
+ import nest_asyncio
6
+ from io import BytesIO
7
+ import pandas as pd
8
+ from playwright.async_api import async_playwright
9
+ from openai import OpenAI
10
+ from PIL import Image
11
+ from tabulate import tabulate
12
+ from IPython.display import display, HTML, Markdown
13
+ from pydantic import BaseModel
14
+ import streamlit as st
15
+ from helper import get_openai_api_key, visualizeCourses
16
+
17
# Apply nested asyncio support for Jupyter / Streamlit environments
# (lets asyncio.run() be called even when an event loop is already running).
nest_asyncio.apply()

# Init OpenAI client securely — the API key comes from the helper, not hard-coded.
# NOTE: this is the *synchronous* OpenAI client (OpenAI, not AsyncOpenAI).
client = OpenAI(api_key=get_openai_api_key())
22
+
23
class WebScraperAgent:
    """Async wrapper around a headless Chromium browser driven by Playwright.

    The browser is launched lazily on first use (see scrape_content), and
    helpers are provided for fetching page HTML and taking screenshots.
    """

    def __init__(self):
        # Browser resources are created lazily by init_browser().
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright and open a fresh headless Chromium page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to *url* and return the page HTML after load.

        Lazily (re)initializes the browser if no open page exists.
        """
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        await self.page.wait_for_timeout(2000)  # Wait for dynamic content
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page PNG screenshot to *path* and return the path."""
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport as raw bytes."""
        screenshot_bytes = await self.page.screenshot(type="png", full_page=False)
        return screenshot_bytes

    async def close(self):
        """Release browser resources.

        Guarded so it is safe to call even when init_browser() never ran
        (the unguarded version raised AttributeError on a None browser)
        and idempotent, so calling it twice is harmless.
        """
        if self.browser is not None:
            await self.browser.close()
            self.browser = None
        if self.playwright is not None:
            await self.playwright.stop()
            self.playwright = None
        self.page = None
52
+
53
+
54
# Pydantic models for structured output
class DeeplearningCourse(BaseModel):
    """Structured record for a single course extracted from deeplearning.ai."""
    title: str  # course title
    description: str  # short course description
    presenter: list[str]  # instructor name(s)
    imageUrl: str  # URL of the course thumbnail image
    courseURL: str  # link to the course page
61
+
62
class DeeplearningCourseList(BaseModel):
    """Top-level container for all courses extracted by the LLM."""
    courses: list[DeeplearningCourse]  # one entry per scraped course
+
65
+ # LLM interaction
66
+ async def process_with_llm(html, instructions):
67
+ response = await client.chat.completions.create(
68
+ model="gpt-4o",
69
+ messages=[
70
+ {
71
+ "role": "system",
72
+ "content": f"""
73
+ You are an expert web scraping agent. Your task is to:
74
+ Extract relevant information from this HTML to JSON
75
+ following these instructions:
76
+ {instructions}
77
+
78
+ Extract the title, description, presenter,
79
+ the image URL and course URL for each course from deeplearning.ai
80
+
81
+ Return ONLY valid JSON.
82
+ """
83
+ },
84
+ {
85
+ "role": "user",
86
+ "content": html[:150000]
87
+ }
88
+ ],
89
+ temperature=0.1
90
+ )
91
+
92
+ content = response.choices[0].message.content
93
+ try:
94
+ json_obj = json.loads(content)
95
+ return DeeplearningCourseList(**json_obj)
96
+ except Exception as e:
97
+ raise ValueError(f"Parsing failed: {e}\nRaw output:\n{content}")
98
+
99
# Scraper workflow
async def webscraper(target_url, instructions):
    """Run the full scrape pipeline: fetch HTML, screenshot, LLM extraction.

    Parameters:
        target_url: page to scrape.
        instructions: extraction instructions forwarded to the LLM.

    Returns:
        (result, screenshot) on success — the parsed course list and PNG
        bytes — or (None, None) if any step failed (the error is surfaced
        in the Streamlit UI rather than raised).
    """
    scraper = WebScraperAgent()
    try:
        st.info("Extracting HTML Content...")
        html_content = await scraper.scrape_content(target_url)

        st.info("Taking Screenshot...")
        screenshot = await scraper.screenshot_buffer()

        st.info("Processing with LLM...")
        result = await process_with_llm(html_content, instructions)
        return result, screenshot
    except Exception as e:
        st.error(f"Error: {str(e)}")
        return None, None
    finally:
        # Best-effort cleanup: an unguarded close() raises AttributeError when
        # the browser never launched (e.g. scrape_content failed before
        # init_browser), which would mask the real error or crash the
        # success path from inside `finally`.
        try:
            await scraper.close()
        except Exception:
            pass
117
+
118
+
119
# Streamlit entrypoint
def main():
    """Render the Streamlit UI and launch the scrape when the button is pressed."""
    scrape_url = "https://www.deeplearning.ai/courses"
    scrape_instructions = "Get all the courses."

    st.title("AI Web Browser Agent (Hugging Face + Streamlit)")

    # Guard clause: nothing to do until the user clicks the button.
    if not st.button("Start Scraping"):
        return

    # nest_asyncio (applied at import time) makes asyncio.run safe here.
    courses, shot = asyncio.run(webscraper(scrape_url, scrape_instructions))

    if not courses:
        st.error("Failed to extract course data.")
        return

    st.success("Successfully extracted course data!")
    visualizeCourses(
        result=courses,
        screenshot=shot,
        target_url=scrape_url,
        instructions=scrape_instructions,
        base_url="https://deeplearning.ai",
    )

if __name__ == "__main__":
    main()