Spaces:
Sleeping
Sleeping
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Scrape the course catalogue pages, visit every course linked from them,
# extract title / description / curriculum, and save the result to Excel.
base_url = "https://courses.analyticsvidhya.com/collections?page="
course_url_base = "https://courses.analyticsvidhya.com"

# Seconds to wait for the server before giving up on a request; without a
# timeout a stalled connection would block the whole scrape indefinitely.
_REQUEST_TIMEOUT = 30

# Values recorded for a course whose page could not be downloaded.
_MISSING_COURSE = {
    "Course Title": "N/A",
    "Course Description": "N/A",
    "Course Curriculum": "N/A",
}


def _fetch_soup(session, url):
    """Download *url* and return it parsed, or None if the request failed.

    Network errors and non-2xx responses are reported on stdout instead of
    being raised, so one bad page does not abort the run and discard the
    rows collected so far.
    """
    try:
        response = session.get(url, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def _parse_course(course_soup):
    """Return the title, description and curriculum text of a course page.

    Every field falls back to "N/A" when nothing could be extracted for it.
    """
    title_tag = course_soup.find('h1', class_="section__heading")
    course_title = title_tag.get_text(strip=True) if title_tag else "N/A"

    paragraphs = [
        p.get_text(strip=True)
        for container in course_soup.find_all('div', class_="rich-text__container")
        for p in container.find_all('p')
    ]
    # `or` also covers containers that exist but hold no <p> tags, keeping
    # the fallback consistent with the curriculum field below.
    course_description = " ".join(paragraphs) or "N/A"

    curriculum_content = []
    curriculum_section = course_soup.find('div', class_="course-curriculum__container")
    if curriculum_section:
        for chapter in curriculum_section.find_all('li', class_="course-curriculum__chapter"):
            title = chapter.find('h5', class_="course-curriculum__chapter-title")
            if title:
                curriculum_content.append(title.get_text(strip=True))
            chapter_content = chapter.find('ul', class_="course-curriculum__chapter-content")
            if chapter_content:
                # Lessons are listed beneath their chapter title as " - " items.
                curriculum_content.extend(
                    f" - {item.get_text(strip=True)}"
                    for item in chapter_content.find_all('li')
                )
    course_curriculum = "\n".join(curriculum_content) if curriculum_content else "N/A"

    return {
        "Course Title": course_title,
        "Course Description": course_description,
        "Course Curriculum": course_curriculum,
    }


course_data = []

# A single session reuses the underlying connection across all requests.
with requests.Session() as session:
    # Catalogue pages 1 through 8 are scraped.
    for page in range(1, 9):
        print(f"Scraping page {page}...")
        soup = _fetch_soup(session, base_url + str(page))
        if soup is None:
            continue

        course_section = soup.find_all(
            'div',
            class_="collections__product-cards collections__product-cards___0b9ab",
        )
        if not course_section:
            print("No course section found, skipping this page.")
            continue

        for course in course_section[0].find_all('li'):
            link_tag = course.find('a', href=True)
            if not link_tag:
                continue

            # urljoin copes with absolute hrefs and hrefs lacking a leading
            # slash, both of which plain string concatenation would mangle.
            course_link = urljoin(course_url_base, link_tag['href'])

            course_soup = _fetch_soup(session, course_link)
            if course_soup is None:
                # Keep the row (with its link) so the failure stays visible
                # in the exported sheet.
                course_fields = dict(_MISSING_COURSE)
            else:
                course_fields = _parse_course(course_soup)

            course_data.append({**course_fields, "Link": course_link})

            # Pause between course requests to avoid hammering the server.
            # NOTE(review): the original indentation was lost; the delay is
            # assumed to be per course rather than per catalogue page.
            time.sleep(1)

df = pd.DataFrame(course_data)
# NOTE(review): absolute path on the original author's machine; adjust it
# before running the script anywhere else.
file_path = r"C:\Users\rachi\OneDrive\Desktop\Analytics VIdya - Gen AI\analytics_vidhya_courses.xlsx"
df.to_excel(file_path, index=False)
print(f"Data saved to {file_path}")