Spaces:

rohanshaw
/

thehexatechcb

Running

thehexatechcb / scrapWebpage.py

Upload 8 files

2b11763 verified 6 months ago

930 Bytes

	import requests
	from bs4 import BeautifulSoup

	# def load_from_website(url):
	# response = requests.get(url)
	# soup = BeautifulSoup(response.content, 'html.parser')
	# text = soup.get_text(separator="\n")
	# return [text]

	# print(load_from_website("https://thehexatech.com"))
	# print()
	# print(load_from_website("https://thehexatech.com/about/index.html"))
	# print()
	# print(load_from_website("https://thehexatech.com/quote/index.html"))
	# print()


	import asyncio
	from langchain_unstructured import UnstructuredLoader

	# URL of the page whose content this script loads.
	page_url = "https://thehexatech.com/about"
	# Loader from langchain_unstructured pointed at that URL; get_data() iterates it.
	loader = UnstructuredLoader(web_url=page_url)

	# Module-level accumulator: get_data() appends each loaded document here and
	# main() reads from it afterwards.
	docs = []

	async def get_data():
	    """Collect every document produced by the module-level ``loader``.

	    Each item yielded by ``loader.alazy_load()`` is appended, in order, to
	    the module-level ``docs`` list. Nothing is returned; callers read
	    ``docs`` once this coroutine has finished.
	    """
	    # ``docs`` is only mutated in place, never rebound, so the module-level
	    # list is reachable without a ``global`` declaration.
	    stream = loader.alazy_load()
	    async for item in stream:
	        docs.append(item)

	async def main():
	    """Load the page through ``get_data`` and print each document's text.

	    Waits for ``get_data()`` to fill the module-level ``docs`` list, then
	    writes the ``page_content`` of every collected document to standard
	    output, one ``print`` call per document.
	    """
	    await get_data()
	    # Debug aid: uncomment to dump the raw document objects instead.
	    # print(docs)
	    for document in docs:
	        text = document.page_content
	        print(text)

	# Script entry point: start an event loop and run main() to completion.
	# NOTE(review): there is no `if __name__ == "__main__":` guard here, so this
	# also runs whenever the module is imported — confirm that is intended.
	asyncio.run(main())