import requests
from lxml import html
from html_to_markdown import convert_to_markdown

# References:
# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page


class MyLibreTextsAPI:
    """Scrape link listings and page contents from the LibreTexts site.

    Each ``get_*`` listing method downloads one page, selects anchors by
    their exact ``class`` attribute and returns them as a list of
    ``{"title": ..., "url": ...}`` dictionaries.
    """

    LIBRETEXTS_BASE_URL = "https://chem.libretexts.org"
    user_agent_headers = {"user-Agent": "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    # Upper bound (seconds) for each HTTP request; without a timeout
    # ``requests`` waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT_SECONDS = 30

    # ``class`` attribute values matched exactly (not token-wise) by the XPath.
    _LISTING_LINK_CLASS = "mt-sortable-listing-link mt-edit-section internal"
    _PARAGRAPH_LINK_CLASS = "internal"

    def __init__(self):
        print("***KS*** Initializing LibreTexts API")

    def _fetch_html(self, url):
        """Download ``url`` with the class User-Agent and return the body text."""
        response = requests.get(
            url,
            headers=self.user_agent_headers,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        return response.text

    def _extract_links(self, url, link_class):
        """Return title/url dicts for anchors on ``url`` whose class is ``link_class``.

        Anchors that lack a ``title`` or an ``href`` attribute are skipped
        instead of raising ``KeyError``. An empty response body yields an
        empty list.
        """
        html_content = self._fetch_html(url)
        if not html_content.strip():
            # lxml refuses to parse an empty document; treat it as "no links".
            return []

        tree = html.fromstring(html_content)
        anchors = tree.xpath(f"//a[@class='{link_class}']")

        links = []
        for anchor in anchors:
            title = anchor.get("title")
            href = anchor.get("href")
            if title is None or href is None:
                continue
            links.append({"title": title, "url": href})
        return links

    def get_bookshelves(self):
        """Return the links listed on the site's ``/Bookshelves`` page."""
        return self._extract_links(
            f"{self.LIBRETEXTS_BASE_URL}/Bookshelves",
            self._LISTING_LINK_CLASS,
        )

    def get_books(self, bookshelf_url):
        """Return the listing links found on the page at ``bookshelf_url``."""
        return self._extract_links(bookshelf_url, self._LISTING_LINK_CLASS)

    def get_book_sections(self, book_url):
        """Return the listing links found on the page at ``book_url``."""
        return self._extract_links(book_url, self._LISTING_LINK_CLASS)

    def get_book_section_paragraphs(self, section_url):
        """Return links on ``section_url`` whose class is exactly ``internal``."""
        return self._extract_links(section_url, self._PARAGRAPH_LINK_CLASS)

    def get_paragraph_contents(self, paragraph_url):
        """Download ``paragraph_url`` and return its HTML converted to Markdown."""
        html_content = self._fetch_html(paragraph_url)
        return convert_to_markdown(html_content)