CourseSearch / preprocess.py
Pontonkid's picture
Create preprocess.py
4e214e0 verified
raw
history blame
4.64 kB
import time
import numpy as np
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
# Listing pages are paginated through the ``page`` query parameter.
_BASE_URL = 'https://courses.analyticsvidhya.com/collections/courses?page='
# Root against which the course links found on listing pages are resolved.
_COURSE_LIST_URL = "https://courses.analyticsvidhya.com/"
# Seconds to wait for a single HTTP response before giving up on it.
_REQUEST_TIMEOUT = 30
# Text columns concatenated (in this order) to build the embedding input.
_TEXT_COLUMNS = ['title', 'description', 'Course curriculum', 'About the Instructor']


def _get(url):
    """GET *url* with a timeout.

    Returns the response object, or ``None`` when the request could not be
    completed (connection error, timeout, ...). The HTTP status code is NOT
    checked here; callers decide how to report a non-200 response.
    """
    try:
        return requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Request to {url} failed: {exc}")
        return None


def _text_or_default(tag, default):
    """Return the stripped text of *tag*, or *default* when *tag* is ``None``."""
    return tag.text.strip() if tag is not None else default


def _scrape_course(title, course_url):
    """Fetch one course page and build its record.

    Returns a dict with the keys ``title``, ``description``,
    ``Course curriculum`` and ``About the Instructor``, or ``None`` when the
    page could not be fetched.
    """
    response = _get(course_url)
    if response is None or response.status_code != 200:
        print(f"Failed to fetch course page: {course_url}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # The tag/class selectors below are tied to the site's current markup and
    # may need adjusting if the page layout changes.
    description = _text_or_default(
        soup.find('div', class_='fr-view'),
        'No description available',
    )
    curriculum = _text_or_default(
        soup.find('ul', class_='course-curriculum__chapter-content'),
        'No curriculum available',
    )
    # NOTE: the "enroll" section of the page is intentionally not collected.
    instructor = _text_or_default(
        soup.find(
            'section',
            class_='text-image section-height__medium section__content-alignment--left text-image___07200',
        ),
        'No instructor available',
    )
    return {
        'title': title,
        'description': description,
        'Course curriculum': curriculum,
        'About the Instructor': instructor,
    }


def _scrape_courses():
    """Walk the paginated course listing and return one record per course.

    Pagination stops at the first listing page that cannot be fetched, does
    not answer with status 200, or contains no course cards.
    """
    from urllib.parse import urljoin

    courses = []
    page_number = 1  # Start with the first page
    while True:
        print(f"Processing page {page_number}...")
        response = _get(_BASE_URL + str(page_number))
        if response is None:
            break
        if response.status_code != 200:
            print(f"Failed to fetch page {page_number}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        course_cards = soup.find_all('li', class_='products__list-item')
        if not course_cards:
            print("No more courses found. Ending extraction.")
            break

        for course_card in course_cards:
            title_tag = course_card.find('h3')
            link_tag = course_card.find('a')
            # .get() instead of ['href']: an anchor without an href must be
            # skipped, not abort the whole scrape with a KeyError.
            href = link_tag.get('href') if link_tag is not None else None
            if title_tag is None or not href:
                print("Skipped a course card due to missing title or link.")
                continue

            # urljoin handles relative links (with or without a leading
            # slash) as well as links that are already absolute.
            course_url = urljoin(_COURSE_LIST_URL, href)
            record = _scrape_course(title_tag.text.strip(), course_url)
            if record is not None:
                courses.append(record)
            # Pause between course-page requests to avoid overwhelming the server.
            time.sleep(1)

        page_number += 1
    return courses


def preprocess():
    """Scrape the course catalogue, embed every course, and save the results.

    Three files are written to the current working directory:

    * ``courses.csv`` - the scraped records.
    * ``course_embeddings.npy`` - one ``all-MiniLM-L6-v2`` embedding per row.
    * ``courses_with_embeddings.csv`` - the records plus the ``combined_text``
      column that was fed to the encoder.

    Raises:
        pandas.errors.EmptyDataError: if no course could be scraped. Nothing
            is written in that case, so files from an earlier successful run
            are left intact.
    """
    courses = _scrape_courses()
    if not courses:
        raise pd.errors.EmptyDataError(
            "No courses were scraped; courses.csv was not written."
        )

    # Save the collected data to a CSV file
    pd.DataFrame(courses).to_csv('courses.csv', index=False)
    print("Data collection complete. Saved to courses.csv.")

    # Load the data
    df = pd.read_csv('courses.csv')

    # Empty fields come back from the CSV as NaN, and ``str + NaN`` is NaN, so
    # one empty field would turn the whole combined text into a float that the
    # encoder cannot tokenize. Build the text from a NaN-free string view.
    text_fields = df[_TEXT_COLUMNS].fillna('').astype(str)
    df['combined_text'] = text_fields.agg(' '.join, axis=1)

    # Load a pre-trained model for embeddings
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Ask for a NumPy array directly: a torch tensor living on a CUDA device
    # cannot be handed to np.save.
    embeddings = model.encode(df['combined_text'].tolist(), convert_to_numpy=True)

    # Save embeddings and DataFrame for later use
    np.save('course_embeddings.npy', embeddings)
    df.to_csv('courses_with_embeddings.csv', index=False)

    # Read both artifacts back so an unreadable file is detected here.
    # NOTE(review): the reloaded objects are not returned to the caller.
    np.load('course_embeddings.npy')
    pd.read_csv('courses_with_embeddings.csv')