tdurzynski committed on
Commit
d4f1db3
·
verified ·
1 Parent(s): 5d94755

Update app.py

Browse files

✅ First, attempts requests with proper headers
✅ If blocked (403 Forbidden), falls back to Selenium for JavaScript-heavy sites
✅ Uses gpt-4o-mini with openai.chat.completions.create()
✅ Extracts response via response.choices[0].message.content
✅ Handles secure API key storage with os.getenv("OPENAI_API_KEY")
✅ Runs seamlessly on Hugging Face Spaces

Files changed (1) hide show
  1. app.py +75 -45
app.py CHANGED
@@ -3,60 +3,90 @@ from bs4 import BeautifulSoup
3
  import gradio as gr
4
  import os
5
  from openai import OpenAI
 
 
6
 
7
- # Initialize OpenAI client with secure API key handling
8
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
def scrape_and_summarize(url):
    """
    Scrape the given website URL and summarize its content using GPT-4o-mini.

    Fetches the page with browser-like headers, extracts all <p> text,
    truncates it to 4000 characters, and asks the model for a summary.

    Args:
        url: Absolute URL of the page to summarize.

    Returns:
        The model-generated summary string, or a human-readable error
        message (this function never raises — errors are returned as text
        so the Gradio UI can display them).
    """
    try:
        # Browser-like headers reduce the chance of bot-blocking (403s).
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/",
            "DNT": "1",  # Do Not Track request
            "Connection": "keep-alive"
        }
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()

        # Extract only paragraph text; skip whitespace-only paragraphs.
        soup = BeautifulSoup(page.text, "html.parser")
        paragraphs = soup.find_all("p")
        text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

        if not text_content:
            return "No readable content found on this page."

        # Limit text to 4000 characters for better summarization
        text_content = text_content[:4000]

        # Call OpenAI GPT-4o-mini for summarization.
        # Named `completion` (not `response`) so it does not shadow the
        # HTTP response above.
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
                {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
            ],
            response_format={"type": "text"},
            temperature=1,
            max_completion_tokens=2048,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )

        return completion.choices[0].message.content  # Extract response content

    except requests.exceptions.RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An error occurred: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # Gradio UI
62
  with gr.Blocks() as demo:
 
3
  import gradio as gr
4
  import os
5
  from openai import OpenAI
6
+ from selenium import webdriver
7
+ from selenium.webdriver.chrome.options import Options
8
 
9
+ # Initialize OpenAI client securely
10
  client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11
 
12
def fetch_with_requests(url):
    """
    Fetch webpage content using requests with browser-like headers.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        Extracted paragraph text, or the sentinel string
        "No readable content found." when the page has no <p> text.

    Raises:
        Exception: on a 403 response (explicit signal for the Selenium
            fallback in the caller).
        requests.exceptions.HTTPError: on any other non-2xx status, so
            error pages (404, 500, ...) are never summarized as content.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive"
    }

    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 403:
        raise Exception("403 Forbidden - Switching to Selenium")
    # Fix: also reject other HTTP errors (404, 500, ...) instead of
    # silently parsing an error page; the caller's fallback handles it.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found."
35
+
36
def fetch_with_selenium(url):
    """
    Scrape JavaScript-heavy pages with headless Chrome when requests fails.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        Extracted paragraph text, or the sentinel string
        "No readable content found (even with Selenium)." when empty.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # Fix: always quit the driver, even if driver.get() raises —
        # otherwise each failure leaks a headless Chrome process.
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    paragraphs = soup.find_all("p")
    text_content = "\n".join(p.get_text() for p in paragraphs if p.get_text().strip())

    return text_content if text_content else "No readable content found (even with Selenium)."
54
+
55
def scrape_and_summarize(url):
    """
    Scrape the given website URL and summarize its content using GPT-4o-mini.
    Tries `requests` first, falls back to Selenium if needed.

    Args:
        url: Absolute URL of the page to summarize.

    Returns:
        The model-generated summary string, or a human-readable error
        message (this function never raises — errors are returned as text
        so the Gradio UI can display them).
    """
    try:
        # Attempt with requests first
        text_content = fetch_with_requests(url)
    except Exception:
        # If blocked (or any HTTP error), fall back to Selenium
        try:
            text_content = fetch_with_selenium(url)
        except Exception as selenium_error:
            return f"Failed both requests and Selenium: {selenium_error}"

    # Limit content to 4000 characters for better summarization
    text_content = text_content[:4000]

    # Call OpenAI GPT-4o-mini for summarization.
    # Fix: wrap the API call so a model/network failure returns an error
    # string instead of crashing the Gradio handler (the pre-refactor
    # version had this protection; it was lost in the fallback rewrite).
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that summarizes webpage content."},
                {"role": "user", "content": f"Summarize the following webpage content:\n\n{text_content}"}
            ],
            response_format={"type": "text"},
            temperature=1,
            max_completion_tokens=2048,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        summary = response.choices[0].message.content  # Extract response content
        return summary
    except Exception as e:
        return f"An error occurred: {str(e)}"
90
 
91
  # Gradio UI
92
  with gr.Blocks() as demo: