import requests
from lxml import html
from html_to_markdown import convert_to_markdown
# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page
class MyLibreTextsAPI:
    """Scraper for the LibreTexts chemistry site's HTML listing pages.

    Each ``get_*`` listing method downloads one page, selects anchor tags by
    their exact ``class`` attribute value and returns them as a list of
    ``{"title": ..., "url": ...}`` dictionaries. The ``url`` value is the
    anchor's ``href`` attribute exactly as it appears in the page.
    """

    LIBRETEXTS_BASE_URL = "https://chem.libretexts.org"
    user_agent_headers = {
        "user-Agent":
            "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)",
    }
    # Seconds to wait for the server before giving up. Without a timeout
    # ``requests`` may block forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    # Exact ``class`` attribute value of the anchors on listing pages
    # (bookshelves, books, sections).
    _LISTING_LINK_CLASS = "mt-sortable-listing-link mt-edit-section internal"
    # Exact ``class`` attribute value of the anchors collected from a section page.
    _PARAGRAPH_LINK_CLASS = "internal"

    def __init__(self):
        print("***KS*** Initializing LibreTexts API")

    def _fetch_html(self, url):
        """Download ``url`` and return the response body as text.

        Raises:
            requests.exceptions.RequestException: on connection problems,
                timeouts, or a 4xx/5xx HTTP status.
        """
        response = requests.get(
            url,
            headers=self.user_agent_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly instead of scraping an error page as if it were content.
        response.raise_for_status()
        return response.text

    @staticmethod
    def _links_from_elements(elements):
        """Turn anchor-like objects into ``{"title", "url"}`` dictionaries.

        ``elements`` is any iterable of objects exposing ``.get(name)`` for
        attribute lookup. Entries missing either ``title`` or ``href`` are
        skipped rather than aborting the whole page with a ``KeyError``.
        """
        links = []
        for element in elements:
            title = element.get("title")
            url = element.get("href")
            if title is None or url is None:
                continue
            links.append({"title": title, "url": url})
        return links

    def _get_links(self, page_url, link_class):
        """Fetch ``page_url`` and return its anchors whose class equals ``link_class``."""
        page = self._fetch_html(page_url)
        if not page.strip():
            # lxml refuses to parse an empty document; an empty page has no links.
            return []
        tree = html.fromstring(page)
        # The XPath compares the whole ``class`` attribute string, so only
        # anchors with exactly this value are matched.
        elements = tree.xpath(f"//a[@class='{link_class}']")
        return self._links_from_elements(elements)

    def get_bookshelves(self):
        """Return the links listed on the site's ``/Bookshelves`` page.

        Returns:
            list[dict]: one ``{"title": ..., "url": ...}`` entry per matching anchor.

        Raises:
            requests.exceptions.RequestException: if the page cannot be fetched.
        """
        return self._get_links(
            f"{self.LIBRETEXTS_BASE_URL}/Bookshelves",
            self._LISTING_LINK_CLASS,
        )

    def get_books(self, bookshelf_url):
        """Return the links listed on the page at ``bookshelf_url``.

        Args:
            bookshelf_url: URL of the page to scrape (e.g. a ``url`` value
                returned by ``get_bookshelves``).

        Returns:
            list[dict]: one ``{"title": ..., "url": ...}`` entry per matching anchor.

        Raises:
            requests.exceptions.RequestException: if the page cannot be fetched.
        """
        return self._get_links(bookshelf_url, self._LISTING_LINK_CLASS)

    def get_book_sections(self, book_url):
        """Return the links listed on the page at ``book_url``.

        Args:
            book_url: URL of the page to scrape (e.g. a ``url`` value
                returned by ``get_books``).

        Returns:
            list[dict]: one ``{"title": ..., "url": ...}`` entry per matching anchor.

        Raises:
            requests.exceptions.RequestException: if the page cannot be fetched.
        """
        return self._get_links(book_url, self._LISTING_LINK_CLASS)

    def get_book_section_paragraphs(self, section_url):
        """Return the ``class="internal"`` links found on the page at ``section_url``.

        Args:
            section_url: URL of the page to scrape (e.g. a ``url`` value
                returned by ``get_book_sections``).

        Returns:
            list[dict]: one ``{"title": ..., "url": ...}`` entry per matching
            anchor that carries both a ``title`` and an ``href``.

        Raises:
            requests.exceptions.RequestException: if the page cannot be fetched.
        """
        return self._get_links(section_url, self._PARAGRAPH_LINK_CLASS)

    def get_paragraph_contents(self, paragraph_url):
        """Download the page at ``paragraph_url`` and return it converted to Markdown.

        Args:
            paragraph_url: URL of the page to download.

        Returns:
            The result of ``convert_to_markdown`` applied to the page's HTML.

        Raises:
            requests.exceptions.RequestException: if the page cannot be fetched.
        """
        return convert_to_markdown(self._fetch_html(paragraph_url))