# Source: uploaded by AmritSbisht ("Upload 3701 files", commit d31253d, verified).
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
from urllib.parse import urljoin
# --- Configuration -----------------------------------------------------------
# Root of the collections area; also the base that relative course links are
# resolved against.
BASE_URL = 'https://courses.analyticsvidhya.com/collections/'
# Course listing page; the scraper appends "?page=N" to walk through it.
COURSE_LISTING_URL = BASE_URL + 'courses'
# Default output file for the scraped course table.
CSV_FILE = 'detailed_courses.csv'
# Function to fetch and parse HTML content
def fetch_html(url, timeout=10):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host, which would stall the whole scrape.

    Returns:
        The parsed ``BeautifulSoup`` tree, or ``None`` if the request failed.
        Failures are reported with ``print`` and never raised, so callers
        only need to check for a falsy result.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")
    else:
        return BeautifulSoup(response.content, 'html.parser')
    # Every handled failure falls through to here.
    return None
# Function to scrape course listing pages
def scrape_course_listings(max_pages=9):
    """Collect the free courses shown on the paginated course listing.

    Pages ``1 .. max_pages`` of ``COURSE_LISTING_URL`` are fetched in order.
    Scraping stops early at the first page that cannot be fetched or that
    has no course container.

    Args:
        max_pages: Highest listing page number to visit. Values below 1
            result in no requests being made.

    Returns:
        A list of dicts with the keys ``'Title'``, ``'Link'`` (absolute URL)
        and ``'Page'`` (listing page the course was found on), one per card
        whose price label is exactly ``'Free'``.
    """
    courses = []
    for page_num in range(1, max_pages + 1):
        print(f"\nProcessing page {page_num}")
        page_url = f"{COURSE_LISTING_URL}?page={page_num}"
        soup = fetch_html(page_url)
        if not soup:
            print(f"Failed to fetch or parse page {page_num}")
            break

        course_container = soup.find(
            'div',
            class_='collections__product-cards collections__product-cards___0b9ab',
        )
        if not course_container:
            print(f"No course container found on page {page_num}")
            break

        for card in course_container.find_all('li', class_='products__list-item'):
            price = card.find('span', class_='course-card__price')
            if not price or price.text.strip() != 'Free':
                continue  # only free courses are collected

            link = card.find('a')
            title = card.find('h3')
            # .get() avoids a KeyError on an anchor without an href attribute.
            href = link.get('href') if link else None
            if not href or not title:
                continue

            courses.append({
                'Title': title.text.strip(),
                'Link': urljoin(BASE_URL, href),
                'Page': page_num,  # kept so results can be traced to a page
            })

        time.sleep(1)  # polite delay between listing requests
    return courses
#Function to scrape detailed course information
def _populate_course_details(course, soup):
    """Fill *course* in place with the fields parsed from its detail page."""
    # Brief: text of the first <h2> on the page.
    h2_elements = soup.find_all('h2')
    course['Brief'] = h2_elements[0].text if h2_elements else 'No brief available'

    # Duration, Rating, Level: taken positionally from the first three <h4>
    # elements that carry no class attribute.
    h4_elements = soup.find_all('h4', class_=None)
    if len(h4_elements) >= 3:
        course['Duration'] = h4_elements[0].text
        course['Rating'] = h4_elements[1].text
        course['Level'] = h4_elements[2].text
    else:
        course['Duration'] = 'No duration available'
        course['Rating'] = 'No rating available'
        course['Level'] = 'No level available'

    # Trainer information: pair each "section__subheading*" heading with the
    # section body at the same position.
    headings = soup.find_all(
        'h4', class_=lambda x: x and x.startswith("section__subheading")
    )
    trainer_names = [heading.text for heading in headings]
    bodies = soup.find_all('div', class_='section__body')
    # A leading body starting with this text is not a trainer bio; drop it so
    # headings and bodies line up.
    if bodies and bodies[0].get_text(strip=True).startswith("Unlock a lifetime-valid"):
        bodies = bodies[1:]
    # zip stops at the shorter sequence, so surplus headings are ignored.
    trainer_dict = {
        name: body.get_text(strip=True)
        for name, body in zip(trainer_names, bodies)
    }
    course['Trainer'] = trainer_dict if trainer_dict else 'No trainer available'

    # Description: text of the first "custom-theme" block.
    description_elements = soup.find_all('div', class_='custom-theme')
    course['Description'] = (
        description_elements[0].text
        if description_elements
        else 'No description available'
    )

    # Curriculum: one entry per lesson span.
    spans = soup.find_all('span', class_='course-curriculum__chapter-lesson')
    curriculum = [span.get_text(strip=True) for span in spans]
    course['Curriculum'] = curriculum if curriculum else 'No curriculum available'

    # Who should enroll & takeaways: both come from the same checklist items.
    checklist_items = soup.find_all('li', class_='checklist__list-item')
    checklist = [item.get_text(strip=True) for item in checklist_items]
    course['What should enroll & takeaway'] = (
        checklist if checklist else 'No what should enroll & takeaway available'
    )

    # FAQ: keep only items that have both a <strong> question and a <p> answer.
    faq_data = []
    for item in soup.find_all('li', class_='faq__list-item'):
        question = item.find('strong')
        answer = item.find('p')
        if question and answer:
            faq_data.append({
                'Question': question.text,
                'Answer': answer.text,
            })
    course['FAQ'] = faq_data if faq_data else 'No FAQ available'


def scrape_course_details(courses):
    """Add detail-page fields to every course dict in *courses*.

    Each course's ``'Link'`` is fetched and the dict is extended in place
    with ``'Brief'``, ``'Duration'``, ``'Rating'``, ``'Level'``,
    ``'Trainer'``, ``'Description'``, ``'Curriculum'``,
    ``'What should enroll & takeaway'`` and ``'FAQ'``. Missing page sections
    are replaced with a "No ... available" placeholder string.

    A course whose page cannot be fetched is left unchanged. If parsing
    fails part-way, the error is printed and the fields set so far are kept.

    Args:
        courses: List of dicts, each with at least ``'Link'`` and ``'Title'``.

    Returns:
        The same list object that was passed in, with its dicts updated.
    """
    for course in courses:
        soup = fetch_html(course['Link'])
        if soup:
            try:
                _populate_course_details(course, soup)
            except Exception as e:
                # Best-effort boundary: one malformed page must not abort
                # the whole crawl.
                print(f"Error processing {course['Title']}: {e}")
        # Respectful delay after every request, including failed ones, so
        # the server is not hit back-to-back when it is already erroring.
        time.sleep(1)
    return courses
# Run the scrape at module level. Both calls perform network requests.
courses = scrape_course_listings() # Free courses found on the listing pages (Title, Link, Page)
detailed_courses = scrape_course_details(courses) # Same list, with detail-page fields added in place
# Function to save data to CSV
def save_to_csv(courses, csv_file=None):
    """Write the scraped courses to a CSV file.

    Args:
        courses: List of course dicts; each dict becomes one row and the
            dict keys become the column headers.
        csv_file: Destination path. Defaults to the module-level
            ``CSV_FILE`` when omitted.
    """
    if csv_file is None:
        csv_file = CSV_FILE
    df = pd.DataFrame(courses)
    # index=False keeps the DataFrame's positional index out of the file.
    df.to_csv(csv_file, index=False)
    print(f"Data saved to {csv_file}")
# Reuse the listings and details scraped above rather than crawling the whole
# site a second time. The course_list / cr names are kept so any later code
# that refers to them still works.
course_list = courses
cr = detailed_courses
save_to_csv(cr)