thehexatechcb / scrapWebpage.py
rohanshaw's picture
Upload 8 files
2b11763 verified
raw
history blame contribute delete
930 Bytes
import requests
from bs4 import BeautifulSoup
# def load_from_website(url):
# response = requests.get(url)
# soup = BeautifulSoup(response.content, 'html.parser')
# text = soup.get_text(separator="\n")
# return [text]
# print(load_from_website("https://thehexatech.com"))
# print()
# print(load_from_website("https://thehexatech.com/about/index.html"))
# print()
# print(load_from_website("https://thehexatech.com/quote/index.html"))
# print()
import asyncio
from langchain_unstructured import UnstructuredLoader
# URL of the single page whose content this script loads.
page_url = "https://thehexatech.com/about"
# Loader configured for page_url; get_data() iterates it via alazy_load().
loader = UnstructuredLoader(web_url=page_url)
# Module-level accumulator: get_data() appends to it and main() reads it.
docs: list = []
async def get_data():
    """Append every document yielded by ``loader.alazy_load()`` to ``docs``.

    The module-level ``docs`` list is mutated in place (never rebound), so
    repeated calls keep adding to whatever it already holds. Returns None.
    """
    async for loaded_doc in loader.alazy_load():
        docs.append(loaded_doc)
async def main():
    """Populate ``docs`` via ``get_data()``, then print each document's text."""
    await get_data()
    for document in docs:
        print(document.page_content)
# Only start the event loop when this file is executed as a script, so that
# importing the module does not by itself run main().
if __name__ == "__main__":
    asyncio.run(main())