Spaces:
Sleeping
Sleeping
File size: 1,779 Bytes
import os
import requests
from bs4 import BeautifulSoup
def extract_documentation(url, timeout=30):
    """Download a documentation page and return its text, title and links.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        A ``(documentation, title, internal_links)`` tuple.

        * ``documentation`` - stripped text of the first
          ``<article class="pytorch-article">`` element, or the string
          ``"No documentation found."`` when the page has none.
        * ``title`` - stripped text of the page ``<title>``, or
          ``"Untitled"`` when the page has no usable title.
        * ``internal_links`` - absolute URLs (fragment removed) of the
          ``<a>`` elements found with ``class_='reference internal'``, in
          page order, without duplicates and without the page itself.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Local import: keeps this function self-contained with respect to the
    # file's import block.
    from urllib.parse import urldefrag, urljoin

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract title. The page may have no <title>, or a <title> whose
    # ``.string`` is None, so read the text defensively instead of chaining
    # ``soup.title.string.strip()``.
    title_tag = soup.title
    title_text = title_tag.get_text() if title_tag is not None else ""
    title = title_text.strip() or "Untitled"

    # Scrape main content
    documentation_section = soup.find('article', class_='pytorch-article')
    if documentation_section:
        documentation = documentation_section.get_text().strip()
    else:
        documentation = "No documentation found."

    # Extract internal links. Relative hrefs are resolved against the URL the
    # response actually came from (after redirects), so links found on pages
    # in sub-directories, absolute hrefs and "../" hrefs all resolve correctly.
    base_url = response.url or url
    seen = {urldefrag(base_url).url}  # pre-seeded so "#section" self-links are skipped
    internal_links = []
    for link in soup.find_all('a', class_='reference internal', href=True):
        target = urldefrag(urljoin(base_url, link['href'])).url
        if target in seen:
            continue
        seen.add(target)
        internal_links.append(target)

    return documentation, title, internal_links
# Save extracted content to a folder
def save_text_to_folder(documentation, title, folder_name='docs'):
    """Write ``documentation`` to ``<folder_name>/<title>.txt`` as UTF-8.

    The title is turned into a safe file name first: path separators,
    characters that are invalid in file names on common platforms and
    control characters become ``_``; trailing spaces and dots are dropped;
    an empty result falls back to ``untitled``. This keeps a title such as
    ``"a/b"`` or ``".."`` from producing a path outside ``folder_name``.

    Args:
        documentation: Text to write.
        title: Page title used as the base of the file name.
        folder_name: Destination directory; created (with parents) if needed.

    Returns:
        None. Success or failure is reported with ``print``; file-system and
        encoding errors are not raised to the caller.
    """
    forbidden = '<>:"/\\|?*'
    raw_title = "" if title is None else str(title)
    safe_title = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in raw_title
    ).rstrip(" .")
    if not safe_title:
        safe_title = "untitled"

    file_path = os.path.join(folder_name, f"{safe_title}.txt")
    try:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(folder_name, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(documentation)
    except (OSError, UnicodeError) as e:
        print(f"Error saving file: {str(e)}")
    else:
        print(f"Successfully saved to {file_path}")
# Seed pages to scrape. Every internal link found on a seed page is scraped
# as well, one level deep: links found on those linked pages are not followed.
# NOTE(review): the file's indentation was lost; the link loop is assumed to
# be nested inside the seed loop - confirm against the original script.
urls = [
    "https://pytorch.org/docs/stable/torch_environment_variables.html",
    "https://pytorch.org/docs/stable/index.html",
]

# Links already downloaded during this run, so a link that appears on several
# seed pages (or several times on one page) is fetched only once.
seen_links = set()


def _scrape_page(page_url):
    """Fetch and save one page; return its internal links, or [] on a failed request."""
    try:
        documentation, title, internal_links = extract_documentation(page_url)
    except requests.RequestException as e:
        # One unreachable page must not abort the rest of the crawl.
        print(f"Error fetching {page_url}: {e}")
        return []
    save_text_to_folder(documentation, title)
    return internal_links


for url in urls:
    for link in _scrape_page(url):
        if link in seen_links:
            continue
        seen_links.add(link)
        _scrape_page(link)
|