Spaces:
Runtime error
Runtime error
| import requests | |
| import re | |
| from bs4 import BeautifulSoup | |
class PhysicsAQA:
    """Scrape the Save My Exams AQA A-level Physics topic-question index.

    The page is downloaded and parsed once, at construction time;
    ``collectdata`` then turns the parsed table cells into a nested mapping.
    """

    def __init__(self) -> None:
        """Download the index page and parse it into ``self.soup``.

        Network failures (including the timeout below) propagate from
        ``requests.get`` unchanged.
        """
        self.url = "https://www.savemyexams.co.uk/a-level/physics/aqa/-/pages/topic-questions-pdf/"
        # A timeout keeps a stalled connection from hanging the caller forever.
        response = requests.get(self.url, timeout=30).text
        self.soup = BeautifulSoup(response, features='lxml')

    def collectdata(self):
        """Group the page's link cells under the heading cell preceding them.

        Every ``<td>`` after the first two is scanned in document order. A
        cell whose text contains "." or "Section" starts a new group; the
        group runs up to (but not including) the next such cell, and the
        last group runs to the end of the cell list.

        Returns:
            dict: heading text -> {link text -> href}. Newlines and
            non-breaking spaces are stripped from both texts; zero-width
            spaces are additionally stripped from link text. The result is
            empty when no heading cell is found.
        """
        cells = self.soup.find_all(["td"])[2:]
        heading_positions = [
            pos
            for pos, cell in enumerate(cells)
            if "." in cell.text or "Section" in cell.text
        ]
        # Appending the end of the list as a final boundary makes the last
        # heading's group get collected too (pairing headings only with the
        # *next* heading would silently drop it), and leaves nothing to pair
        # when no heading was found at all.
        boundaries = heading_positions + [len(cells)]

        physicsaqa = {}
        for start, stop in zip(boundaries, boundaries[1:]):
            group = cells[start:stop]
            heading = group[0].text.replace('\n', '').replace('\xa0', '')
            links = {}
            for cell in group:
                # Look the anchor up once; cells without a link are skipped.
                anchor = cell.find("a", href=True)
                if anchor is None:
                    continue
                label = (
                    anchor.text
                    .replace('\xa0', '')
                    .replace('\n', '')
                    .replace("\u200b", "")
                )
                links[label] = anchor["href"]
            # A repeated heading replaces the earlier group of the same name.
            physicsaqa[heading] = links
        return physicsaqa
if __name__ == "__main__":
    # Script entry point: scrape the index page and print the resulting
    # heading -> {link text -> href} mapping.
    scraper = PhysicsAQA()
    data = scraper.collectdata()
    print(data)