import html
import os
from urllib.parse import urljoin

import faiss
import numpy as np
import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Root of the course site; relative course links are resolved against it.
SITE_ROOT = "https://courses.analyticsvidhya.com"
# Seconds to wait for a page before giving up, so a stalled server cannot
# hang the app forever.
REQUEST_TIMEOUT = 30
# Local cache of the scraped catalogue; scraping is skipped when it exists.
SCRAPED_CSV = "scraped.csv"


def scrape_courses_from_page(url):
    """Scrape the course cards found on a single listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``title``, ``image_url`` and
        ``course_link``, one per course card that has both an image and a
        link. Cards missing either are skipped.

    Raises:
        requests.RequestException: If the page cannot be downloaded
            (including a timeout after ``REQUEST_TIMEOUT`` seconds).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.content, 'html.parser')

    courses = []
    # Each course is represented by a header holding its thumbnail image.
    for course_card in soup.find_all('header', class_='course-card__img-container'):
        img_tag = course_card.find('img', class_='course-card__img')
        if not img_tag:
            continue

        # The course URL comes from the closest preceding 'a' tag.
        link_tag = course_card.find_previous('a')
        course_link = link_tag.get('href') if link_tag else None
        if not course_link:
            # No anchor (or no href): there is nothing to link to.
            continue

        courses.append({
            'title': img_tag.get('alt'),
            'image_url': img_tag.get('src'),
            # Absolute links pass through unchanged; relative ones are
            # resolved against the site root.
            'course_link': urljoin(SITE_ROOT + '/', course_link),
        })

    return courses


def scrape_courses_from_all_pages(base_url, total_pages):
    """Scrape pages ``1..total_pages`` of a paginated course listing.

    Args:
        base_url: Listing URL without the ``?page=`` query parameter.
        total_pages: Number of pages to fetch, starting at page 1.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped course.
    """
    all_courses = []
    for page_num in range(1, total_pages + 1):
        all_courses.extend(scrape_courses_from_page(f"{base_url}?page={page_num}"))
    return pd.DataFrame(all_courses)


# Define base URL and total pages
base_url = "https://courses.analyticsvidhya.com/collections/courses"
total_pages = 8  # Assuming there are 8 pages of courses

# Scrape only when no cached CSV is present.
if not os.path.exists(SCRAPED_CSV):
    courses_df = scrape_courses_from_all_pages(base_url, total_pages)
    courses_df.to_csv(SCRAPED_CSV, index=False)

# Load scraped courses data
df = pd.read_csv(SCRAPED_CSV)

# NOTE(review): the model load, embedding and index build below run at script
# level; if the app re-runs the script on every interaction, consider caching
# them.

# Load the Hugging Face embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for course titles. Missing titles become empty strings
# so every row keeps its position and the encoder only ever receives text.
course_embeddings = np.asarray(
    model.encode(df['title'].fillna('').astype(str).tolist()),
    dtype='float32',
)

# Create FAISS index for similarity search
dimension = course_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance
index.add(course_embeddings)


def search_courses(query):
    """Return the courses whose titles are closest to ``query``.

    Args:
        query: Free-text search string.

    Returns:
        Up to six dicts with the keys ``title``, ``image_url``,
        ``course_link`` and ``score``, sorted by descending score. An empty
        list is returned when the index is empty or when an error occurs
        (the error is reported through ``st.error``).
    """
    try:
        # Never ask for more neighbours than the index holds.
        top_k = min(6, index.ntotal)
        if top_k == 0:
            return []

        # Generate the embedding for the query
        query_embedding = np.asarray(model.encode([query]), dtype='float32')

        # Search for the closest courses using FAISS
        distances, indices = index.search(query_embedding, top_k)

        results = []
        for idx, distance in zip(indices[0], distances[0]):
            # FAISS marks a missing neighbour with index -1; without this
            # check df.iloc[-1] would silently return the last course.
            if idx < 0:
                continue
            course = df.iloc[idx]
            results.append({
                'title': course['title'],
                'image_url': course['image_url'],
                'course_link': course['course_link'],
                # Smaller distance -> larger score. This is only a ranking
                # value; it is not bounded and can be negative.
                'score': 1 - float(distance),
            })

        return sorted(results, key=lambda x: x['score'], reverse=True)
    except Exception as e:
        st.error(f"An error occurred in search_courses: {str(e)}")
        return []


def display_search_results(result_list):
    """Render search results (image, title and course link) in Streamlit.

    Args:
        result_list: Result dicts with the keys ``title``, ``image_url`` and
            ``course_link``.
    """
    if not result_list:
        st.write("No results found. Please try a different query.")
        return

    for item in result_list:
        course_title = item['title']
        course_image = item['image_url']
        course_link = item['course_link']

        st.image(course_image, use_column_width=True)
        st.write(f"### {course_title}")

        # NOTE(review): the button template here was empty, so the course
        # link was never shown. A plain anchor is rendered instead; restore
        # the styled button markup if one was intended.
        # The link comes from scraped data, so it is escaped before being
        # placed into raw HTML.
        safe_link = html.escape(str(course_link), quote=True)
        button_html = f'<a href="{safe_link}" target="_blank">View Course</a>'
        st.markdown(button_html, unsafe_allow_html=True)


# Streamlit UI
# NOTE(review): the headings carried mis-encoded emoji bytes, which were
# removed; re-add the intended emoji if desired.
st.title("Analytics Vidhya Free Courses")
st.image("cc.jpg")
st.markdown("#### Get the most appropriate course as per your learning requirement.")
st.markdown("\nMade by @metechmohit\n", unsafe_allow_html=True)