|
|
import requests |
|
|
from lxml import html |
|
|
from html_to_markdown import convert_to_markdown |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class MyLibreTextsAPI:
    """Small scraping client for the LibreTexts chemistry site.

    Each ``get_*`` listing method downloads one page, selects ``<a>`` elements
    by their exact ``class`` attribute value and returns them as a list of
    ``{"title": ..., "url": ...}`` dictionaries.  ``get_paragraph_contents``
    downloads a page and returns it converted to Markdown.

    All methods perform a blocking HTTP GET and may raise
    ``requests.RequestException`` (including ``requests.HTTPError`` for 4xx/5xx
    responses and ``requests.Timeout`` when the server does not answer within
    ``REQUEST_TIMEOUT`` seconds).
    """

    LIBRETEXTS_BASE_URL = "https://chem.libretexts.org"

    # Sent with every request.  HTTP header names are case-insensitive, so the
    # original spelling of the key is kept for backward compatibility.
    user_agent_headers = {"user-Agent":
                          "AgentsCourseAssignment/1.0 (https://huggingface.co/spaces/krzsam/Agents-Course-Assignment)"}

    # Seconds to wait for the server; without a timeout ``requests`` can block
    # forever on an unresponsive host.
    REQUEST_TIMEOUT = 30

    # ``class`` attribute value of the anchors collected by the bookshelf,
    # book and section listings.
    _LISTING_LINK_CLASS = "mt-sortable-listing-link mt-edit-section internal"

    # ``class`` attribute value of the anchors collected inside a section page.
    _PARAGRAPH_LINK_CLASS = "internal"

    def __init__(self):
        print("***KS*** Initializing LibreTexts API")

    def _fetch_html(self, url):
        """Download *url* and return the response body as text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status,
                so that error pages are not mistaken for real content.
            requests.RequestException: for connection problems and timeouts.
        """
        response = requests.get(
            url,
            headers=self.user_agent_headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.text

    @staticmethod
    def _links_from_elements(elements):
        """Convert anchor elements into ``{"title", "url"}`` dictionaries.

        Anchors without a non-empty ``href`` are skipped because there is no
        URL to return.  When ``title`` is absent the anchor's text content is
        used instead of raising ``KeyError``.
        """
        links = []
        for element in elements:
            url = element.get("href")
            if not url:
                continue
            title = element.get("title")
            if title is None:
                title = element.text_content().strip()
            links.append({"title": title, "url": url})
        return links

    def _parse_links(self, html_content, link_class):
        """Return the links in *html_content* whose class equals *link_class*.

        The XPath compares the whole ``class`` attribute, so only anchors
        whose attribute is exactly *link_class* match.
        """
        # lxml refuses to parse an empty document; treat it as "no links".
        if not html_content or not html_content.strip():
            return []
        tree = html.fromstring(html_content)
        elements = tree.xpath(f"//a[@class='{link_class}']")
        return self._links_from_elements(elements)

    def get_bookshelves(self):
        """Return the links listed on ``<LIBRETEXTS_BASE_URL>/Bookshelves``.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.
        """
        html_content = self._fetch_html(f"{self.LIBRETEXTS_BASE_URL}/Bookshelves")
        return self._parse_links(html_content, self._LISTING_LINK_CLASS)

    def get_books(self, bookshelf_url):
        """Return the links listed on the page at *bookshelf_url*.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.
        """
        html_content = self._fetch_html(bookshelf_url)
        return self._parse_links(html_content, self._LISTING_LINK_CLASS)

    def get_book_sections(self, book_url):
        """Return the links listed on the page at *book_url*.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.
        """
        html_content = self._fetch_html(book_url)
        return self._parse_links(html_content, self._LISTING_LINK_CLASS)

    def get_book_section_paragraphs(self, section_url):
        """Return the ``class="internal"`` links on the page at *section_url*.

        Returns:
            A list of ``{"title": str, "url": str}`` dictionaries.
        """
        html_content = self._fetch_html(section_url)
        return self._parse_links(html_content, self._PARAGRAPH_LINK_CLASS)

    def get_paragraph_contents(self, paragraph_url):
        """Download the page at *paragraph_url* and return it as Markdown.

        The whole response body is passed to ``convert_to_markdown``.
        """
        html_content = self._fetch_html(paragraph_url)
        return convert_to_markdown(html_content)
|
|
|
|
|
|