# File size: 1,779 Bytes
# 5ca0311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def extract_documentation(url):
    """Download a documentation page and extract its text, title and links.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``(documentation, title, internal_links)`` tuple where
        ``documentation`` is the stripped text of the page's
        ``<article class="pytorch-article">`` element (or the string
        ``"No documentation found."`` when that element is absent),
        ``title`` is the stripped text of the page's ``<title>`` (falling
        back to the last path segment of ``url`` when the page has no usable
        title), and ``internal_links`` is a list of absolute URLs for every
        ``<a class="reference internal">`` that carries an ``href``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps a stalled server from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of saving an error page as documentation.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract title. The <title> tag can be missing, and ``.string`` is None
    # when the tag has nested markup, so use get_text() behind a guard.
    title_tag = soup.title
    title = title_tag.get_text().strip() if title_tag is not None else ""
    if not title:
        title = url.rstrip('/').rsplit('/', 1)[-1] or "untitled"

    # Scrape main content
    documentation_section = soup.find('article', class_='pytorch-article')
    if documentation_section:
        documentation = documentation_section.get_text().strip()
    else:
        documentation = "No documentation found."

    # Extract internal links. Resolve each href against the page it was found
    # on so relative, root-relative and absolute hrefs all yield a valid URL.
    internal_links = [
        urljoin(url, link['href'])
        for link in soup.find_all('a', class_='reference internal', href=True)
    ]

    return documentation, title, internal_links


def _safe_filename(title):
    """Return *title* with characters that are unsafe in file names replaced."""
    # Path separators and characters reserved on common file systems, plus
    # ASCII control characters (e.g. a newline inside a page title).
    replacements = {ord(ch): '_' for ch in '<>:"/\\|?*'}
    replacements.update({code: '_' for code in range(32)})
    # Leading/trailing dots and spaces are stripped so a title can neither
    # become "." / ".." nor end in characters some file systems reject.
    cleaned = str(title).translate(replacements).strip(' .')
    return cleaned or 'untitled'


# Save extracted content to a folder
def save_text_to_folder(documentation, title, folder_name='docs'):
    """Write *documentation* to ``<folder_name>/<title>.txt`` as UTF-8.

    The folder is created if it does not exist. The title is sanitised before
    being used as a file name, so a title containing ``/`` or other reserved
    characters cannot point outside ``folder_name`` or make the write fail.

    Args:
        documentation: Text to write to the file.
        title: Page title used as the base of the file name.
        folder_name: Destination directory; defaults to ``'docs'``.

    Returns:
        None. Success or failure of the write is reported on stdout.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)

    file_path = os.path.join(folder_name, f"{_safe_filename(title)}.txt")

    try:
        with open(file_path, 'w', encoding='utf-8') as text_file:
            text_file.write(documentation)
    except (OSError, ValueError) as e:
        # Best effort: report the failure and let the caller carry on.
        print(f"Error saving file: {str(e)}")
    else:
        print(f"Successfully saved to {file_path}")



# Seed pages to download. Links found on these pages are followed one level
# deep; links found on the linked pages are not followed further.
urls = ["https://pytorch.org/docs/stable/torch_environment_variables.html",
        "https://pytorch.org/docs/stable/index.html"
       ]

# Pages already fetched, keyed by URL without its #fragment, so anchors that
# point into the same page do not trigger repeated downloads.
visited = set()
# Links gathered from every seed page, not only the last one processed.
internal_links = []

for url in urls:
    try:
        documentation, title, page_links = extract_documentation(url)
    except requests.RequestException as e:
        # One unreachable page should not abort the rest of the crawl.
        print(f"Error fetching {url}: {e}")
        continue
    visited.add(url.split('#', 1)[0])
    save_text_to_folder(documentation, title)
    internal_links.extend(page_links)


for link in internal_links:
    page = link.split('#', 1)[0]
    if page in visited:
        continue
    visited.add(page)
    try:
        documentation, title, _ = extract_documentation(link)
    except requests.RequestException as e:
        print(f"Error fetching {link}: {e}")
        continue
    save_text_to_folder(documentation, title)