jackculpan committed on
Commit
db62874
·
1 Parent(s): 5d2229e
Files changed (1) hide show
  1. conversation.py +32 -1
conversation.py CHANGED
@@ -4,6 +4,8 @@ import gradio as gr
4
  import requests
5
  from bs4 import BeautifulSoup
6
  import urllib.parse
 
 
7
 
8
  try:
9
  from dotenv import load_dotenv
@@ -43,9 +45,36 @@ class Conversation:
43
  # ... your existing get_data implementation ...
44
  # Replace `messages` with `self.messages`
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  url = self.to_valid_url(old_url)
47
  self.messages
48
- html = requests.get(url).text
49
  doc = BeautifulSoup(html, 'html.parser')
50
  if not doc:
51
  raise ValueError("Please try again")
@@ -53,6 +82,7 @@ class Conversation:
53
  headings_1 = [e.text for e in doc.find_all('h1')]
54
  headings_2 = [e.text for e in doc.find_all('h2')]
55
  # headings_3 = [e.text for e in doc.find_all('h3')]
 
56
  paragraphs = [e.text for e in doc.find_all('p')]
57
  # spans = [e.text for e in doc.find_all('span')]
58
  joined_paragraphs = (' '.join(paragraphs))
@@ -64,6 +94,7 @@ class Conversation:
64
  self.messages.append({'role': 'system', 'content': "You are a helpful assistant that must answer questions about a website."})
65
  self.messages.append({'role': 'system', 'content': f"here are the h1s - {headings_1}"})
66
  self.messages.append({'role': 'system', 'content': f"here are the h2s - {headings_2}"})
 
67
  # messages.append({'role': 'system', 'content': f"here are the h3s - {headings_3}"})
68
  self.messages.append({'role': 'system', 'content': f"here are the paragraphs - {paragraphs}"})
69
  # messages.append({'role': 'system', 'content': f"here are the spans - {spans}"})
 
4
  import requests
5
  from bs4 import BeautifulSoup
6
  import urllib.parse
7
+ from selenium import webdriver
8
+ from webdriver_manager.chrome import ChromeDriverManager
9
 
10
  try:
11
  from dotenv import load_dotenv
 
45
  # ... your existing get_data implementation ...
46
  # Replace `messages` with `self.messages`
47
 
48
def extract_html_content(url, timeout=10):
    """Fetch *url* with a plain HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            caller on an unresponsive host.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
51
+
52
def extract_js_content(url):
    """Load *url* in headless Chrome and return the rendered page source.

    Args:
        url: Address of the page to open in the browser.

    Returns:
        ``driver.page_source`` as reported by Selenium after navigation.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # NOTE(review): passing the driver path positionally is the older
    # Selenium call form; newer releases expect a ``Service`` object instead.
    # Confirm against the Selenium version this project pins.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even when navigation fails, so a
        # failed scrape does not leave an orphaned Chrome process behind.
        driver.quit()
60
+
61
def smart_scraper(url):
    """Return the HTML for *url*, falling back to Selenium when needed.

    The page is first fetched with ``extract_html_content``. If that result
    is empty or contains no ``<body`` tag, the page is fetched again with
    ``extract_js_content`` and that result is returned instead.

    Args:
        url: Address of the page to scrape.

    Returns:
        The HTML text from whichever of the two fetches was used.
    """
    html_content = extract_html_content(url)
    tag_marker = "<body"

    # ``html_content`` is a plain string, and ``str.find`` returns -1 (truthy)
    # when the text is missing and 0 (falsy) when it matches at index 0, so
    # ``not html_content.find(...)`` tested the opposite of "tag is missing".
    # A substring membership test expresses the intended check directly;
    # lower-casing makes it match ``<BODY`` as well.
    if html_content and tag_marker in html_content.lower():
        return html_content

    # Incomplete static HTML: render the page in a browser instead.
    print("Using Selenium for JavaScript rendering...")
    return extract_js_content(url)
74
+
75
  url = self.to_valid_url(old_url)
76
  self.messages
77
+ html = smart_scraper(url)
78
  doc = BeautifulSoup(html, 'html.parser')
79
  if not doc:
80
  raise ValueError("Please try again")
 
82
  headings_1 = [e.text for e in doc.find_all('h1')]
83
  headings_2 = [e.text for e in doc.find_all('h2')]
84
  # headings_3 = [e.text for e in doc.find_all('h3')]
85
+ links = [e.text for e in doc.find_all('a')]
86
  paragraphs = [e.text for e in doc.find_all('p')]
87
  # spans = [e.text for e in doc.find_all('span')]
88
  joined_paragraphs = (' '.join(paragraphs))
 
94
  self.messages.append({'role': 'system', 'content': "You are a helpful assistant that must answer questions about a website."})
95
  self.messages.append({'role': 'system', 'content': f"here are the h1s - {headings_1}"})
96
  self.messages.append({'role': 'system', 'content': f"here are the h2s - {headings_2}"})
97
+ self.messages.append({'role': 'system', 'content': f"here are the links - {links}"})
98
  # messages.append({'role': 'system', 'content': f"here are the h3s - {headings_3}"})
99
  self.messages.append({'role': 'system', 'content': f"here are the paragraphs - {paragraphs}"})
100
  # messages.append({'role': 'system', 'content': f"here are the spans - {spans}"})