File size: 5,514 Bytes
b974700
06fbf75
b974700
06fbf75
 
 
b974700
06fbf75
 
b974700
06fbf75
 
 
 
b974700
06fbf75
 
 
 
 
 
b974700
06fbf75
 
 
 
 
 
b974700
06fbf75
b974700
 
 
 
 
 
 
 
06fbf75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b974700
 
06fbf75
 
 
 
 
 
b974700
06fbf75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b974700
 
06fbf75
 
 
 
39cd134
06fbf75
 
 
b974700
06fbf75
d245f1a
 
 
 
 
 
 
06fbf75
 
b974700
 
 
 
 
06fbf75
b974700
 
 
 
 
 
 
 
 
06fbf75
 
 
 
 
 
 
 
 
 
b974700
 
 
06fbf75
b974700
fd87433
b974700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import requests
from bs4 import BeautifulSoup

# Function to scrape courses from a single page using BeautifulSoup
def scrape_courses_from_page(url):
    """Scrape the course cards found on a single listing page.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``'title'``, ``'image_url'`` and
        ``'course_link'``. Cards that lack an image tag or a usable link
        are skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    courses = []

    # Each card header carries the course image; title and image URL are
    # read from that <img> tag.
    course_cards = soup.find_all('header', class_='course-card__img-container')
    for course_card in course_cards:
        img_tag = course_card.find('img', class_='course-card__img')
        if not img_tag:
            continue

        # The course link is taken from the closest <a> that precedes the
        # card header in the document (presumably the anchor wrapping the
        # card -- verify against the page markup).
        link_tag = course_card.find_previous('a')
        course_link = link_tag.get('href') if link_tag else None
        if not course_link:
            # No anchor, or an anchor without href: nothing to link to.
            continue

        # Relative links are resolved against the site root.
        if not course_link.startswith('http'):
            course_link = 'https://courses.analyticsvidhya.com' + course_link

        courses.append({
            'title': img_tag.get('alt'),
            'image_url': img_tag.get('src'),
            'course_link': course_link
        })

    return courses

# Function to scrape across multiple pages using BeautifulSoup
def scrape_courses_from_all_pages(base_url, total_pages):
    """Scrape pages ``1..total_pages`` of a listing and combine the results.

    Args:
        base_url: Listing URL; ``?page=<n>`` is appended for each page.
        total_pages: Number of pages to request, starting at page 1.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``image_url`` and
        ``course_link``, one row per scraped course. The columns are present
        even when no course was found, so the frame can be written to CSV
        and read back without an empty-file error.
    """
    all_courses = []

    for page_num in range(1, total_pages + 1):
        url = f"{base_url}?page={page_num}"
        all_courses.extend(scrape_courses_from_page(url))

    # Pin the schema explicitly: a frame built from an empty list would
    # otherwise have no columns at all.
    return pd.DataFrame(all_courses, columns=['title', 'image_url', 'course_link'])

# Listing URL that is paginated through, and how many pages to request
base_url = "https://courses.analyticsvidhya.com/collections/courses"
total_pages = 8  # Assumed page count of the course listing


# Scrape only when no cached CSV exists; an existing file is reused as-is
if not os.path.exists("scraped.csv"):
    courses_df = scrape_courses_from_all_pages(base_url, total_pages)
    courses_df.to_csv("scraped.csv", index=False)


# Load scraped courses data
df = pd.read_csv("scraped.csv")

# A missing title is read back from the CSV as NaN (a float), which is not
# text the embedding model can encode, so such rows are dropped. The index is
# reset to keep row positions aligned with the FAISS ids assigned below.
df = df.dropna(subset=['title']).reset_index(drop=True)

# Load the Hugging Face embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate one embedding per course title (row i of the matrix <-> df row i)
course_embeddings = model.encode(df['title'].astype(str).tolist())

# Create FAISS index for similarity search
dimension = course_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance
# FAISS expects a C-contiguous float32 matrix; make that explicit.
index.add(np.ascontiguousarray(course_embeddings, dtype=np.float32))

# Function to search courses using embeddings
def search_courses(query):
    """Return the courses whose titles are closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the module-level FAISS ``index``; the matching rows are read from ``df``.

    Args:
        query: Free-text search string.

    Returns:
        A list of at most six dicts with the keys ``'title'``,
        ``'image_url'``, ``'course_link'`` and ``'score'``, sorted by
        descending score. An empty list is returned when the index holds no
        courses or when an error occurs (the error is shown via ``st.error``).
    """
    try:
        # Never ask for more neighbours than the index holds: FAISS pads the
        # missing slots with id -1, which ``df.iloc`` would silently resolve
        # to the last row and report as a match.
        top_k = min(6, index.ntotal)
        if top_k == 0:
            return []

        # Generate the embedding for the query
        query_embedding = np.asarray(model.encode([query]), dtype=np.float32)

        # Search for the closest courses using FAISS
        distances, indices = index.search(query_embedding, top_k)

        # Collect the results based on the indices returned by FAISS
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx < 0:
                # Padding entry, not a real course.
                continue
            course = df.iloc[idx]
            results.append({
                'title': course['title'],
                'image_url': course['image_url'],
                'course_link': course['course_link'],
                # Smaller distance -> larger score; the value is only used
                # for ordering and may be negative.
                'score': 1 - distance
            })

        return sorted(results, key=lambda x: x['score'], reverse=True)

    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing
        # the Streamlit script.
        st.error(f"An error occurred in search_courses: {str(e)}")
        return []

# Function to display search results in Streamlit
def display_search_results(result_list):
    """Render each search hit as an image, a heading and a link button.

    Args:
        result_list: Iterable of dicts providing the keys ``'title'``,
            ``'image_url'`` and ``'course_link'``. When it is empty, a
            "no results" message is written instead.
    """
    import html  # local import: only needed for escaping the link below

    if not result_list:
        st.write("No results found. Please try a different query.")
        return

    for item in result_list:
        course_title = item['title']
        course_image = item['image_url']
        # The link comes from scraped page content and is injected into raw
        # HTML below, so escape it to keep it inside the href attribute.
        course_link = html.escape(str(item['course_link']), quote=True)

        st.image(course_image, use_column_width=True)
        st.write(f"### {course_title}")

        button_html = f"""
            <a href="{course_link}" target="_blank">
                <button style="background-color:#4CAF50; border:none; color:white; padding:10px 20px; text-align:center; text-decoration:none; display:inline-block; font-size:16px; margin:4px 2px; cursor:pointer; border-radius:5px;">
                    View Course
                </button>
            </a>
            """
        st.markdown(button_html, unsafe_allow_html=True)

# Streamlit UI: page header
st.title("Analytics Vidhya Free Courses🔍")
st.image("cc.jpg")
st.markdown("#### 🔍🌐 Get the most appropriate course as per your learning requirement.")
st.markdown("<hr style='border:1px solid #eee;'>", unsafe_allow_html=True)

# Search controls
query = st.text_input(
    label="Enter course related keywords...",
    placeholder="e.g., machine learning, data science, python",
    help="Type in a keyword to find related free courses",
)
search_button = st.button("Search")

# Search results section: a search runs only when the button was pressed
# and the query is non-empty; otherwise the user is prompted for input.
if not (search_button and query):
    st.write("Please enter a search query to find relevant courses.")
else:
    st.write("Collecting courses......")
    result_list = search_courses(query)

    if not result_list:
        st.write("No results found. Please try a different keyword.")
    else:
        st.markdown(f"### Top results for: {query}")
        display_search_results(result_list)

# Footer with subtle text
footer_html = "<p style='text-align:center; color:grey;'>Made by @metechmohit </p>"
st.markdown(footer_html, unsafe_allow_html=True)