Spaces:

sreedeepEK
/

Torchie

Sleeping

App Files Files Community

Torchie / scraper.py

sreedeepEK

Initial commit with Git LFS tracking large files

5ca0311 over 1 year ago

raw

history blame contribute delete

1.78 kB

	import os
	import requests
	from bs4 import BeautifulSoup

	def extract_documentation(url):
	    """Fetch a documentation page and pull out its text, title and links.

	    Args:
	        url: Address of the page to download.

	    Returns:
	        A ``(documentation, title, internal_links)`` tuple.

	        * ``documentation`` is the stripped text of the page's
	          ``<article class="pytorch-article">`` element, or
	          ``"No documentation found."`` when that element is absent.
	        * ``title`` is the text of the page's ``<title>``; if the page has
	          none, the last path segment of ``url`` is used, then
	          ``"untitled"``.
	        * ``internal_links`` is a list of absolute URLs, one per distinct
	          page referenced by an ``<a class="reference internal">`` element,
	          in document order. Fragments are dropped and links back to the
	          page itself are omitted.

	    Raises:
	        requests.RequestException: If the download fails or times out.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import urldefrag, urljoin, urlsplit

	    # Without a timeout a stalled server would hang the crawl forever.
	    response = requests.get(url, timeout=30)
	    soup = BeautifulSoup(response.content, 'html.parser')

	    # Extract title. A page without a usable <title> must not crash the
	    # run, so fall back to something derived from the URL.
	    title = soup.title.get_text(strip=True) if soup.title else ""
	    if not title:
	        last_segment = urlsplit(url).path.rstrip("/").rsplit("/", 1)[-1]
	        title = last_segment or "untitled"

	    # Scrape main content
	    documentation_section = soup.find('article', class_='pytorch-article')
	    if documentation_section is not None:
	        documentation = documentation_section.get_text().strip()
	    else:
	        documentation = "No documentation found."

	    # Extract internal links. Resolving against the page's own URL handles
	    # absolute hrefs and hrefs relative to a sub-directory; dropping the
	    # fragment collapses "#section" variants of one page into one entry.
	    internal_links = []
	    seen = {urldefrag(url).url}
	    for link in soup.find_all('a', class_='reference internal', href=True):
	        full_url = urldefrag(urljoin(url, link['href'])).url
	        if full_url in seen:
	            continue
	        seen.add(full_url)
	        internal_links.append(full_url)

	    return documentation, title, internal_links


	# Save extracted content to a folder
	def save_text_to_folder(documentation, title, folder_name='docs'):
	    """Write ``documentation`` to ``<folder_name>/<title>.txt`` as UTF-8.

	    The folder (including missing parents) is created when needed. Because
	    ``title`` comes from scraped pages, characters that are path separators
	    or are not allowed in file names on common platforms are replaced with
	    ``_``; a title that ends up empty becomes ``untitled``.

	    Args:
	        documentation: Text to write.
	        title: Name of the file, without extension.
	        folder_name: Destination directory.

	    Returns:
	        None. The outcome is reported on standard output; a failed write is
	        printed rather than raised.
	    """
	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs(folder_name, exist_ok=True)

	    # Neutralise separators, reserved punctuation and control characters so
	    # the title cannot escape folder_name or produce an invalid file name.
	    forbidden = '\\/:*?"<>|'
	    safe_title = "".join(
	        "_" if ch in forbidden or ord(ch) < 32 else ch
	        for ch in str(title)
	    ).strip() or "untitled"

	    file_path = os.path.join(folder_name, f"{safe_title}.txt")

	    try:
	        with open(file_path, 'w', encoding='utf-8') as text_file:
	            text_file.write(documentation)
	    except (OSError, UnicodeError) as e:
	        print(f"Error saving file: {str(e)}")
	    else:
	        print(f"Successfully saved to {file_path}")



	# Seed pages. Each seed is saved, then every internal link it lists is
	# fetched and saved as well (one level deep).
	# NOTE(review): this runs at import time; consider a __main__ guard.
	urls = [
	    "https://pytorch.org/docs/stable/torch_environment_variables.html",
	    "https://pytorch.org/docs/stable/index.html",
	]

	# Links that are themselves seeds, or were already saved via an earlier
	# page, are skipped so the same page is not downloaded repeatedly.
	seen_urls = set(urls)

	for url in urls:
	    try:
	        documentation, title, internal_links = extract_documentation(url)
	    except requests.RequestException as e:
	        # One unreachable page should not abort the rest of the crawl.
	        print(f"Error fetching {url}: {e}")
	        continue
	    save_text_to_folder(documentation, title)

	    for link in internal_links:
	        if link in seen_urls:
	            continue
	        seen_urls.add(link)
	        try:
	            documentation, title, _ = extract_documentation(link)
	        except requests.RequestException as e:
	            print(f"Error fetching {link}: {e}")
	            continue
	        save_text_to_folder(documentation, title)