Spaces:
Running
Running
Commit ·
db62874
1
Parent(s): 5d2229e
links
Browse files- conversation.py +32 -1
conversation.py
CHANGED
|
@@ -4,6 +4,8 @@ import gradio as gr
|
|
| 4 |
import requests
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import urllib.parse
|
|
|
|
|
|
|
| 7 |
|
| 8 |
try:
|
| 9 |
from dotenv import load_dotenv
|
|
@@ -43,9 +45,36 @@ class Conversation:
|
|
| 43 |
# ... your existing get_data implementation ...
|
| 44 |
# Replace `messages` with `self.messages`
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
url = self.to_valid_url(old_url)
|
| 47 |
self.messages
|
| 48 |
-
html =
|
| 49 |
doc = BeautifulSoup(html, 'html.parser')
|
| 50 |
if not doc:
|
| 51 |
raise ValueError("Please try again")
|
|
@@ -53,6 +82,7 @@ class Conversation:
|
|
| 53 |
headings_1 = [e.text for e in doc.find_all('h1')]
|
| 54 |
headings_2 = [e.text for e in doc.find_all('h2')]
|
| 55 |
# headings_3 = [e.text for e in doc.find_all('h3')]
|
|
|
|
| 56 |
paragraphs = [e.text for e in doc.find_all('p')]
|
| 57 |
# spans = [e.text for e in doc.find_all('span')]
|
| 58 |
joined_paragraphs = (' '.join(paragraphs))
|
|
@@ -64,6 +94,7 @@ class Conversation:
|
|
| 64 |
self.messages.append({'role': 'system', 'content': "You are a helpful assistant that must answer questions about a website."})
|
| 65 |
self.messages.append({'role': 'system', 'content': f"here are the h1s - {headings_1}"})
|
| 66 |
self.messages.append({'role': 'system', 'content': f"here are the h2s - {headings_2}"})
|
|
|
|
| 67 |
# messages.append({'role': 'system', 'content': f"here are the h3s - {headings_3}"})
|
| 68 |
self.messages.append({'role': 'system', 'content': f"here are the paragraphs - {paragraphs}"})
|
| 69 |
# messages.append({'role': 'system', 'content': f"here are the spans - {spans}"})
|
|
|
|
| 4 |
import requests
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
import urllib.parse
|
| 7 |
+
from selenium import webdriver
|
| 8 |
+
from webdriver_manager.chrome import ChromeDriverManager
|
| 9 |
|
| 10 |
try:
|
| 11 |
from dotenv import load_dotenv
|
|
|
|
| 45 |
# ... your existing get_data implementation ...
|
| 46 |
# Replace `messages` with `self.messages`
|
| 47 |
|
| 48 |
+
def extract_html_content(url, timeout=10):
    """Fetch ``url`` with a plain HTTP GET and return the response body as text.

    No JavaScript is executed; the result is the markup exactly as the
    server sent it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            caller on an unresponsive host.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    return response.text
|
| 51 |
+
|
| 52 |
+
def extract_js_content(url):
    """Load ``url`` in headless Chrome and return the rendered page source.

    Used for pages whose content is produced by JavaScript and is therefore
    missing from the raw HTTP response.

    Args:
        url: Address of the page to render.

    Returns:
        The browser's ``page_source`` after navigating to ``url``.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # NOTE(review): passing the driver path positionally is the older
    # Selenium calling convention; newer releases expect a Service object
    # instead -- confirm against the Selenium version this project pins.
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even when navigation fails, so an
        # error does not leave a headless Chrome process running.
        driver.quit()
|
| 60 |
+
|
| 61 |
+
def smart_scraper(url):
    """Return the HTML for ``url``, rendering JavaScript only when needed.

    The page is first fetched with a plain HTTP request. If that response is
    empty or contains no opening ``<body`` tag, the page is loaded again in
    a headless browser and the rendered source is returned instead.

    Args:
        url: Address of the page to scrape.

    Returns:
        The page markup as a string, either the raw response or the
        browser-rendered source.
    """
    html_content = extract_html_content(url)
    selector_to_find = "body"

    # ``str.find`` returns -1 (truthy) when the substring is missing, so it
    # cannot be used as a boolean; test for the opening tag with ``in``.
    # Lower-casing makes the check match ``<BODY>`` as well.
    tag_missing = (
        not html_content
        or f"<{selector_to_find}" not in html_content.lower()
    )
    if tag_missing:
        # Static markup looks incomplete: fall back to a real browser.
        print("Using Selenium for JavaScript rendering...")
        return extract_js_content(url)
    return html_content
|
| 74 |
+
|
| 75 |
url = self.to_valid_url(old_url)
|
| 76 |
self.messages
|
| 77 |
+
html = smart_scraper(url)
|
| 78 |
doc = BeautifulSoup(html, 'html.parser')
|
| 79 |
if not doc:
|
| 80 |
raise ValueError("Please try again")
|
|
|
|
| 82 |
headings_1 = [e.text for e in doc.find_all('h1')]
|
| 83 |
headings_2 = [e.text for e in doc.find_all('h2')]
|
| 84 |
# headings_3 = [e.text for e in doc.find_all('h3')]
|
| 85 |
+
links = [e.text for e in doc.find_all('a')]
|
| 86 |
paragraphs = [e.text for e in doc.find_all('p')]
|
| 87 |
# spans = [e.text for e in doc.find_all('span')]
|
| 88 |
joined_paragraphs = (' '.join(paragraphs))
|
|
|
|
| 94 |
self.messages.append({'role': 'system', 'content': "You are a helpful assistant that must answer questions about a website."})
|
| 95 |
self.messages.append({'role': 'system', 'content': f"here are the h1s - {headings_1}"})
|
| 96 |
self.messages.append({'role': 'system', 'content': f"here are the h2s - {headings_2}"})
|
| 97 |
+
self.messages.append({'role': 'system', 'content': f"here are the links - {links}"})
|
| 98 |
# messages.append({'role': 'system', 'content': f"here are the h3s - {headings_3}"})
|
| 99 |
self.messages.append({'role': 'system', 'content': f"here are the paragraphs - {paragraphs}"})
|
| 100 |
# messages.append({'role': 'system', 'content': f"here are the spans - {spans}"})
|