# Agents-Course-Assignment / my_base_libretexts_api.py
# Hugging Face file-view header: user krzsam, commit de39e8f, 3.71 kB
import requests
import json
from lxml import etree
from lxml import html
# https://gist.github.com/scionoftech/0f35d5e231be2cf46823d774023268b6
# https://www.mediawiki.org/wiki/API:Main_page
class MyLibreTextsAPI:
    """Scrape link listings and page text from the LibreTexts Chemistry site.

    Every public ``get_*`` listing method downloads one HTML page, selects
    ``<a>`` elements by an exact ``class`` attribute value and returns them as
    a list of ``(title, href)`` tuples. The result of one call (an ``href``)
    is meant to be passed as the URL argument of the next one.

    All requests send ``user_agent_headers``, use ``REQUEST_TIMEOUT_SECONDS``
    as timeout and raise ``requests.HTTPError`` on a 4xx/5xx response instead
    of parsing the error page as if it were content.
    """

    LIBRETEXTS_BASE_URL = "https://chem.libretexts.org"
    user_agent_headers = {"user-Agent":
                          "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}
    # Without a timeout, requests waits forever on an unresponsive server.
    REQUEST_TIMEOUT_SECONDS = 30
    # Exact ``class`` attribute values matched by the XPath queries below.
    _LISTING_LINK_CLASS = "mt-sortable-listing-link mt-edit-section internal"
    _INTERNAL_LINK_CLASS = "internal"

    def __init__(self):
        print("***KS*** Initializing LibreTexts API")

    def _fetch_tree(self, url):
        """Download *url* and return its parsed lxml HTML tree.

        Raises:
            requests.RequestException: on connection failure, timeout or an
                HTTP error status.
        """
        response = requests.get(
            url,
            headers=self.user_agent_headers,
            timeout=self.REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        return html.fromstring(response.text)

    @staticmethod
    def _collect_links(elements):
        """Turn anchor elements into a list of ``(title, href)`` tuples.

        Anchors without an ``href`` are skipped because they cannot be
        followed. Anchors without a ``title`` attribute fall back to their
        stripped link text instead of raising ``KeyError``.
        """
        links = []
        for element in elements:
            href = element.get("href")
            if href is None:
                continue
            title = element.get("title")
            if title is None:
                title = element.text_content().strip()
            links.append((title, href))
        return links

    def _scrape_links(self, url, link_class):
        """Return ``(title, href)`` for every ``<a>`` on *url* whose ``class``
        attribute equals *link_class* exactly."""
        tree = self._fetch_tree(url)
        return self._collect_links(tree.xpath(f"//a[@class='{link_class}']"))

    def get_bookshelves(self):
        """Return the listing links found on the site's ``/Bookshelves`` page.

        Returns:
            list of ``(title, href)`` tuples.
        """
        bookshelf = self._scrape_links(
            f"{self.LIBRETEXTS_BASE_URL}/Bookshelves",
            self._LISTING_LINK_CLASS,
        )
        print(f"Bookshelf:\n{bookshelf}\n\n")
        return bookshelf

    def get_books(self, bookshelf_url):
        """Return the listing links found on the page at *bookshelf_url*.

        Returns:
            list of ``(title, href)`` tuples.
        """
        books = self._scrape_links(bookshelf_url, self._LISTING_LINK_CLASS)
        print(f"Books:\n{books}\n\n")
        return books

    def get_book_sections(self, book_url):
        """Return the listing links found on the page at *book_url*.

        Returns:
            list of ``(title, href)`` tuples.
        """
        sections = self._scrape_links(book_url, self._LISTING_LINK_CLASS)
        print(f"Sections:\n{sections}\n\n")
        return sections

    def get_book_section_paragraphs(self, section_url):
        """Return the links with ``class="internal"`` on the page at *section_url*.

        Returns:
            list of ``(title, href)`` tuples.
        """
        paragraphs = self._scrape_links(section_url, self._INTERNAL_LINK_CLASS)
        print(f"Paragraphs:\n{paragraphs}\n\n")
        return paragraphs

    def get_section_contents(self, paragraph_url):
        """Return the text of the page at *paragraph_url* as a plain string.

        Script and style elements are removed first; the remaining text is
        returned with each line stripped and blank lines dropped.

        Returns:
            str: the extracted text, lines joined with ``"\\n"``.
        """
        tree = self._fetch_tree(paragraph_url)
        # Script/style bodies are code, not page content.
        etree.strip_elements(tree, "script", "style", with_tail=False)
        stripped = (line.strip() for line in tree.text_content().splitlines())
        contents = "\n".join(line for line in stripped if line)
        # TODO get contents and convert to MD format
        return contents