File size: 6,089 Bytes
d31253d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
from urllib.parse import urljoin

# Constants
BASE_URL = 'https://courses.analyticsvidhya.com/collections/'  # site root; also the base for urljoin on relative course links
COURSE_LISTING_URL = f'{BASE_URL}courses'  # paginated listing page ("?page=N" is appended per page)
CSV_FILE = 'detailed_courses.csv'  # output path written by save_to_csv

# Function to fetch and parse HTML content
def fetch_html(url, timeout=10):
    """Fetch *url* and return its parsed BeautifulSoup tree.

    Args:
        url: Absolute URL to download.
        timeout: Seconds to wait for the server. Without a timeout,
            requests.get can block forever on an unresponsive host, which
            would stall the whole scrape.

    Returns:
        BeautifulSoup tree on success; implicitly None on any request
        failure (the error is printed, callers check for a falsy result).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into HTTPError so they land in the handlers below.
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except requests.exceptions.HTTPError as errh:
        print(f"HTTP Error: {errh}")
    except requests.exceptions.ConnectionError as errc:
        print(f"Error Connecting: {errc}")
    except requests.exceptions.Timeout as errt:
        print(f"Timeout Error: {errt}")
    except requests.exceptions.RequestException as err:
        print(f"Error: {err}")

# Function to scrape course listing pages
def scrape_course_listings(max_pages=9):
    """Collect free courses from the paginated course listing.

    Args:
        max_pages: Highest listing page to visit. Previously hard-coded at 9
            inside the function body; parameterized (same default) so the
            crawl depth can be adjusted without editing the function.

    Returns:
        List of dicts, one per free course, with keys 'Title', 'Link'
        (absolute URL) and 'Page' (listing page it was found on).
    """
    courses = []
    page_num = 1

    while page_num <= max_pages:
        print(f"\nProcessing page {page_num}")
        page_url = f"{COURSE_LISTING_URL}?page={page_num}"
        soup = fetch_html(page_url)

        # Stop (not skip) on failure: a dead page usually means the site is
        # unreachable or pagination has ended.
        if not soup:
            print(f"Failed to fetch or parse page {page_num}")
            break

        course_container = soup.find('div', class_='collections__product-cards collections__product-cards___0b9ab')
        if not course_container:
            print(f"No course container found on page {page_num}")
            break

        course_cards = course_container.find_all('li', class_='products__list-item')

        for card in course_cards:
            # Keep only courses whose price badge reads exactly 'Free'.
            price = card.find('span', class_='course-card__price')
            if price and price.text.strip() == 'Free':
                link = card.find('a')
                title = card.find('h3')

                if link and title:
                    full_link = urljoin(BASE_URL, link['href'])
                    courses.append({
                        'Title': title.text.strip(),
                        'Link': full_link,
                        'Page': page_num,  # kept for verification of pagination
                    })

        page_num += 1
        time.sleep(1)  # polite delay between listing pages

    return courses

# Function to scrape detailed course information
def scrape_course_details(courses):
    """Visit each course page and enrich the course dicts in place.

    For every entry in *courses*, fetches course['Link'] and adds the keys
    Brief, Duration, Rating, Level, Trainer, Description, Curriculum,
    'What should enroll & takeaway' and FAQ (each with a fallback string
    when the page lacks the element). Returns the same list.
    """
    for course in courses:
        soup = fetch_html(course['Link'])
        if not soup:
            continue  # fetch failed; leave this course without details

        try:
            # Brief: first <h2> on the page.
            briefs = soup.find_all('h2')
            course['Brief'] = briefs[0].text if briefs else 'No brief available'

            # Duration / Rating / Level: the first three class-less <h4>s,
            # in that order.
            stats = soup.find_all('h4', class_=None)
            if len(stats) >= 3:
                course['Duration'] = stats[0].text
                course['Rating'] = stats[1].text
                course['Level'] = stats[2].text
            else:
                course['Duration'] = 'No duration available'
                course['Rating'] = 'No rating available'
                course['Level'] = 'No level available'

            # Trainer info: pair each section subheading with its body text.
            trainer_names = [h.text for h in soup.find_all(
                'h4', class_=lambda x: x and x.startswith("section__subheading"))]

            bodies = soup.find_all('div', class_='section__body')
            # Skip a leading promotional section when present.
            if bodies and bodies[0].get_text(strip=True).startswith("Unlock a lifetime-valid"):
                bodies = bodies[1:]

            # zip stops at the shorter list, matching the old bounds check.
            trainer_dict = {
                name: body.get_text(strip=True)
                for name, body in zip(trainer_names, bodies)
            }
            course['Trainer'] = trainer_dict if trainer_dict else 'No trainer available'

            # Description: first themed content div.
            descriptions = soup.find_all('div', class_='custom-theme')
            course['Description'] = descriptions[0].text if descriptions else 'No description available'

            # Curriculum: one entry per lesson span.
            lessons = [s.get_text(strip=True) for s in
                       soup.find_all('span', class_='course-curriculum__chapter-lesson')]
            course['Curriculum'] = lessons if lessons else 'No curriculum available'

            # Who-should-enroll / takeaway checklist items.
            checklist = [li.get_text(strip=True) for li in
                         soup.find_all('li', class_='checklist__list-item')]
            course['What should enroll & takeaway'] = checklist if checklist else 'No what should enroll & takeaway available'

            # FAQ: each list item carries a <strong> question and <p> answer.
            faq_entries = []
            for item in soup.find_all('li', class_='faq__list-item'):
                question = item.find('strong')
                answer = item.find('p')
                if question and answer:
                    faq_entries.append({
                        'Question': question.text,
                        'Answer': answer.text
                    })
            course['FAQ'] = faq_entries if faq_entries else 'No FAQ available'

        except Exception as e:
            print(f"Error processing {course['Title']}: {str(e)}")
            continue

        time.sleep(1)  # Respectful delay between requests

    return courses

# NOTE(review): an "example usage" block here previously ran
# scrape_course_listings() and scrape_course_details() at import time, in
# addition to the identical run at the bottom of the file — scraping every
# page twice per execution. The duplicate run has been removed; the call
# sequence at the end of the file is the single entry point.


# Function to save data to CSV
def save_to_csv(courses):
    """Persist the scraped course records to CSV_FILE and report the path."""
    pd.DataFrame(courses).to_csv(CSV_FILE, index=False)
    print(f"Data saved to {CSV_FILE}")


# Script entry point: scrape the listings, enrich each course with details,
# then write everything to CSV. Guarded so importing this module for its
# functions does not trigger a full (slow, network-heavy) scrape.
if __name__ == "__main__":
    course_list = scrape_course_listings()
    cr = scrape_course_details(course_list)
    save_to_csv(cr)