File size: 3,664 Bytes
de39e8f
 
264afdc
de39e8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264afdc
 
de39e8f
264afdc
de39e8f
 
 
 
 
 
 
 
 
 
 
 
 
264afdc
 
de39e8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264afdc
 
de39e8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264afdc
 
de39e8f
 
 
264afdc
de39e8f
 
 
 
 
264afdc
de39e8f
264afdc
de39e8f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import requests
from lxml import html
from html_to_markdown import convert_to_markdown

# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page

class MyLibreTextsAPI:
    """Minimal HTML scraper for the LibreTexts Chemistry site.

    Each ``get_*`` listing method downloads one page, selects ``<a>`` elements
    by the exact value of their ``class`` attribute and returns a list of
    ``{"title": ..., "url": ...}`` dictionaries built from the anchors'
    ``title`` and ``href`` attributes. ``get_paragraph_contents`` downloads a
    page and converts its HTML to Markdown.

    HTTP error statuses are not raised: the body of whatever response comes
    back is processed as-is.
    """

    LIBRETEXTS_BASE_URL = "https://chem.libretexts.org"
    user_agent_headers = {"user-Agent":
        "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    # Seconds to wait for the server before giving up. Without a timeout
    # ``requests`` waits indefinitely, so one stalled connection would hang
    # the caller forever.
    REQUEST_TIMEOUT = 30

    # The XPath used below compares the whole ``class`` attribute string, so
    # these values must match the page markup exactly (order and spacing).
    _LISTING_LINK_CLASS = "mt-sortable-listing-link mt-edit-section internal"
    _INTERNAL_LINK_CLASS = "internal"

    def __init__(self):
        print("***KS*** Initializing LibreTexts API")

    def _fetch_html(self, url: str) -> str:
        """Download *url* with the class User-Agent header and return the body text."""
        response = requests.get(
            url,
            headers=self.user_agent_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.text

    @staticmethod
    def _links_from_elements(elements) -> list:
        """Turn anchor elements into ``{"title", "url"}`` dictionaries.

        Anchors lacking either a ``title`` or an ``href`` attribute are
        skipped rather than raising ``KeyError``; a bare ``class="internal"``
        selector in particular can match links that carry no title.
        """
        links = []
        for element in elements:
            title = element.attrib.get("title")
            href = element.attrib.get("href")
            if title is None or href is None:
                continue
            links.append({"title": title, "url": href})
        return links

    def _scrape_links(self, url: str, link_class: str) -> list:
        """Fetch *url* and return the links whose ``class`` equals *link_class*."""
        page = self._fetch_html(url)
        if not page.strip():
            # lxml refuses to parse an empty document; treat it as "no links".
            return []
        tree = html.fromstring(page)
        elements = tree.xpath(f"//a[@class='{link_class}']")
        return self._links_from_elements(elements)

    def get_bookshelves(self) -> list:
        """Return the listing links found on ``<base URL>/Bookshelves``."""
        return self._scrape_links(
            f"{self.LIBRETEXTS_BASE_URL}/Bookshelves",
            self._LISTING_LINK_CLASS,
        )

    def get_books(self, bookshelf_url: str) -> list:
        """Return the listing links found on the page at *bookshelf_url*."""
        return self._scrape_links(bookshelf_url, self._LISTING_LINK_CLASS)

    def get_book_sections(self, book_url: str) -> list:
        """Return the listing links found on the page at *book_url*."""
        return self._scrape_links(book_url, self._LISTING_LINK_CLASS)

    def get_book_section_paragraphs(self, section_url: str) -> list:
        """Return the links on *section_url* whose class is exactly ``internal``.

        This selector is broader than the one used for the listing pages, so
        anchors without a ``title`` or ``href`` are silently omitted.
        """
        return self._scrape_links(section_url, self._INTERNAL_LINK_CLASS)

    def get_paragraph_contents(self, paragraph_url: str):
        """Download *paragraph_url* and return its HTML converted to Markdown.

        The whole response body is passed to ``convert_to_markdown``; no
        element selection is done first.
        """
        return convert_to_markdown(self._fetch_html(paragraph_url))