Spaces:
Build error
Build error
File size: 5,514 Bytes
b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 39cd134 06fbf75 b974700 06fbf75 d245f1a 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 06fbf75 b974700 fd87433 b974700 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
# Function to scrape courses from a single page using BeautifulSoup
def scrape_courses_from_page(url):
    """Scrape the course cards found on a single listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``'title'``, ``'image_url'`` and
        ``'course_link'``, one per course card that has both an image tag
        and a usable link.

    Raises:
        requests.RequestException: If the page cannot be fetched within the
            timeout or the server answers with an HTTP error status.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as "no courses".
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    courses = []
    # Extract course title, image, and course link
    course_cards = soup.find_all('header', class_='course-card__img-container')
    for course_card in course_cards:
        img_tag = course_card.find('img', class_='course-card__img')
        if img_tag is None:
            continue

        # Find the course link using the previous 'a' tag
        link_tag = course_card.find_previous('a')
        course_link = link_tag.get('href') if link_tag is not None else None
        if not course_link:
            # No preceding anchor, or an anchor without href: there is
            # nothing to link to, so skip the card rather than crash.
            continue
        if not course_link.startswith('http'):
            course_link = 'https://courses.analyticsvidhya.com' + course_link

        courses.append({
            'title': img_tag.get('alt'),
            'image_url': img_tag.get('src'),
            'course_link': course_link,
        })
    return courses
# Function to scrape across multiple pages using BeautifulSoup
def scrape_courses_from_all_pages(base_url, total_pages):
    """Collect the courses from pages 1..total_pages into one DataFrame.

    Args:
        base_url: Listing URL; ``?page=<n>`` is appended for each page.
        total_pages: Number of pages to visit, starting at page 1.

    Returns:
        A pandas DataFrame built from the course dicts of every page, in
        page order.
    """
    page_urls = [f"{base_url}?page={number}" for number in range(1, total_pages + 1)]

    records = []
    for page_url in page_urls:
        records += scrape_courses_from_page(page_url)

    return pd.DataFrame(records)
# Define base URL and total pages
base_url = "https://courses.analyticsvidhya.com/collections/courses"
total_pages = 8  # Assuming there are 8 pages of courses

# Scrape only when no cached CSV exists yet; later runs reuse the file.
if not os.path.exists("scraped.csv"):
    courses_df = scrape_courses_from_all_pages(base_url, total_pages)
    courses_df.to_csv("scraped.csv", index=False)


@st.cache_resource
def _load_search_resources():
    """Load the course table, embedding model, title embeddings and index.

    Streamlit re-executes the whole script on every widget interaction.
    Caching this loader keeps the model load, the title encoding and the
    index build from being repeated on each rerun.

    Returns:
        A tuple ``(courses, encoder, embeddings, faiss_index)``: the scraped
        courses DataFrame, the SentenceTransformer model, the embeddings of
        the course titles, and a FAISS L2 index holding those embeddings.
    """
    # Load Scraped courses data
    courses = pd.read_csv("scraped.csv")
    # Load the Hugging Face embedding model
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    # Generate embeddings for course titles
    embeddings = encoder.encode(courses['title'].tolist())
    # Create FAISS index for similarity search (L2 distance)
    faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
    faiss_index.add(np.array(embeddings))
    return courses, encoder, embeddings, faiss_index


# Same module-level names as before, so the rest of the script is unaffected.
df, model, course_embeddings, index = _load_search_resources()
dimension = course_embeddings.shape[1]
# Function to search courses using embeddings
def search_courses(query, top_k=6):
    """Return the courses whose titles are closest to *query*.

    Args:
        query: Free-text search string.
        top_k: Maximum number of matches to return (defaults to 6).

    Returns:
        A list of dicts with the keys ``'title'``, ``'image_url'``,
        ``'course_link'`` and ``'score'``, ordered by descending score.
        An empty list is returned when the index is empty or when an error
        occurs; errors are reported to the user through ``st.error``.
    """
    try:
        # Never ask for more neighbours than the index holds: FAISS pads
        # the missing slots with label -1, and df.iloc[-1] would then
        # silently return the last course as a bogus match.
        limit = min(top_k, index.ntotal)
        if limit <= 0:
            return []

        # Generate the embedding for the query
        query_embedding = model.encode([query])
        # Search for the closest courses using FAISS
        distances, indices = index.search(np.array(query_embedding), limit)

        # Collect the results based on the indices returned by FAISS
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx < 0:
                # Defensive: -1 marks "no neighbour found" in FAISS output.
                continue
            course = df.iloc[idx]
            results.append({
                'title': course['title'],
                'image_url': course['image_url'],
                'course_link': course['course_link'],
                # Higher means closer. This is a ranking value derived from
                # the L2 distance, not a bounded similarity.
                'score': 1 - distance,
            })
        return sorted(results, key=lambda x: x['score'], reverse=True)
    except Exception as e:
        # UI boundary: show the failure in the app instead of a traceback.
        st.error(f"An error occurred in search_courses: {str(e)}")
        return []
# Function to display search results in Streamlit
def display_search_results(result_list):
    """Render each search hit as an image, a heading and a link button.

    Args:
        result_list: Sequence of dicts with the keys ``'title'``,
            ``'image_url'`` and ``'course_link'``. When it is empty a
            "no results" message is written instead.
    """
    # Imported locally so the function carries its own dependency.
    from html import escape

    if not result_list:
        st.write("No results found. Please try a different query.")
        return

    for item in result_list:
        course_title = item['title']
        course_image = item['image_url']
        course_link = item['course_link']

        st.image(course_image, use_column_width=True)
        st.write(f"### {course_title}")

        # The link comes from scraped pages and is rendered as raw HTML, so
        # escape it to keep quotes or angle brackets inside the attribute.
        safe_link = escape(str(course_link), quote=True)
        button_html = (
            "\n"
            f'<a href="{safe_link}" target="_blank">\n'
            '<button style="background-color:#4CAF50; border:none; color:white; padding:10px 20px; text-align:center; text-decoration:none; display:inline-block; font-size:16px; margin:4px 2px; cursor:pointer; border-radius:5px;">\n'
            "View Course\n"
            "</button>\n"
            "</a>\n"
        )
        st.markdown(button_html, unsafe_allow_html=True)
# Streamlit UI: page header
st.title("Analytics Vidhya Free Courses🔍")
st.image("cc.jpg")
st.markdown("#### 🔍🌐 Get the most appropriate course as per your learning requirement.")
st.markdown("<hr style='border:1px solid #eee;'>", unsafe_allow_html=True)

# Search controls
query = st.text_input(
    "Enter course related keywords...",
    placeholder="e.g., machine learning, data science, python",
    help="Type in a keyword to find related free courses",
)
search_button = st.button("Search")

# Search results section: runs only when the button was pressed with a
# non-empty query; otherwise the user is prompted for input.
if not (search_button and query):
    st.write("Please enter a search query to find relevant courses.")
else:
    st.write("Collecting courses......")
    result_list = search_courses(query)
    if not result_list:
        st.write("No results found. Please try a different keyword.")
    else:
        st.markdown(f"### Top results for: {query}")
        display_search_results(result_list)

# Footer with subtle text
st.markdown(
    "<p style='text-align:center; color:grey;'>Made by @metechmohit </p>",
    unsafe_allow_html=True,
)
|