"""Scrape PyTorch documentation pages and save their text to local files."""

import os
import re
from typing import Dict, List, Tuple
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 30

# Path separators, characters reserved on Windows, and control characters.
# Any of these in a page title would make it unusable as a file name.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def extract_documentation(url: str) -> Tuple[str, str, List[str]]:
    """Download a documentation page and pull out its text, title and links.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(documentation, title, internal_links)`` tuple:

        * ``documentation`` -- stripped text of the
          ``<article class="pytorch-article">`` element, or
          ``"No documentation found."`` when the page has no such element.
        * ``title`` -- stripped text of the page ``<title>``, or
          ``"untitled"`` when the page has no usable title.
        * ``internal_links`` -- absolute URLs of every
          ``<a class="reference internal">`` link, resolved against the page
          URL, with fragments removed, de-duplicated in document order, and
          excluding links that point back at the page itself.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of saving an error page as documentation.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract title; a page without a <title> must not crash the scraper.
    title = soup.title.get_text().strip() if soup.title is not None else ""
    if not title:
        title = "untitled"

    # Scrape main content
    documentation_section = soup.find('article', class_='pytorch-article')
    if documentation_section is not None:
        documentation = documentation_section.get_text().strip()
    else:
        documentation = "No documentation found."

    # Extract internal links. Resolve each href against the URL the response
    # actually came from so that relative paths ("../x.html"), links on pages
    # in sub-directories, and already-absolute hrefs all come out right.
    page_url = urldefrag(response.url).url
    seen = {page_url}
    internal_links = []
    for link in soup.find_all('a', class_='reference internal', href=True):
        # "#section" anchors address the same document, so drop the fragment
        # and keep only one entry per distinct page.
        full_url = urldefrag(urljoin(page_url, link['href'])).url
        if full_url not in seen:
            seen.add(full_url)
            internal_links.append(full_url)

    return documentation, title, internal_links


def save_text_to_folder(documentation: str, title: str,
                        folder_name: str = 'docs') -> None:
    """Write ``documentation`` to ``<folder_name>/<title>.txt`` as UTF-8.

    The folder is created if it does not exist. Characters in ``title`` that
    cannot appear in a file name are replaced with ``_``. Failures while
    writing the file are reported on stdout rather than raised.

    Args:
        documentation: Text to write.
        title: Page title, used as the file name stem.
        folder_name: Directory that receives the file.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists.
    os.makedirs(folder_name, exist_ok=True)

    safe_title = _INVALID_FILENAME_CHARS.sub('_', title).strip() or "untitled"
    file_path = os.path.join(folder_name, f"{safe_title}.txt")

    try:
        with open(file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(documentation)
    except OSError as e:
        print(f"Error saving file: {e}")
    else:
        print(f"Successfully saved to {file_path}")


# specified url
urls = [
    "https://pytorch.org/docs/stable/torch_environment_variables.html",
    "https://pytorch.org/docs/stable/index.html",
]


def _scrape_once(url: str, links_by_url: Dict[str, List[str]]) -> List[str]:
    """Fetch and save ``url`` unless already done; return its internal links.

    ``links_by_url`` remembers every page handled so far, so each page is
    downloaded and written at most once per run.
    """
    if url not in links_by_url:
        try:
            documentation, title, internal_links = extract_documentation(url)
        except requests.RequestException as e:
            # One unreachable page should not abort the rest of the crawl.
            print(f"Error fetching {url}: {e}")
            links_by_url[url] = []
        else:
            save_text_to_folder(documentation, title)
            links_by_url[url] = internal_links
    return links_by_url[url]


def main() -> None:
    """Save every page in ``urls`` plus the pages they link to directly."""
    links_by_url: Dict[str, List[str]] = {}
    for url in urls:
        for link in _scrape_once(url, links_by_url):
            _scrape_once(link, links_by_url)


if __name__ == "__main__":
    main()