# Source: Hugging Face Space metechmohit/Smart_Search_LLM
# Space status when this copy was captured: "Build error"
# File size: 5,514 bytes

import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os
import requests
from bs4 import BeautifulSoup

# Function to scrape courses from a single page using BeautifulSoup
def scrape_courses_from_page(url):
    """Scrape the course cards found on a single listing page.

    Downloads ``url``, parses it with BeautifulSoup and builds one record per
    ``<header class="course-card__img-container">`` element that contains an
    ``<img class="course-card__img">`` tag.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts with the keys ``'title'`` (the image's ``alt``
        attribute, ``None`` when absent), ``'image_url'`` (the image's ``src``
        attribute) and ``'course_link'`` (absolute URL of the course page).
        Cards without an image, or without a preceding ``<a>`` that carries an
        ``href``, are skipped. An empty list is returned when the server
        answers with an HTTP error status.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    # Bound the wait so a stalled server cannot hang the app at start-up.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # An error page carries no course cards; say so explicitly instead of
        # parsing it.
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    courses = []

    # Extract course title, image, and course link
    for course_card in soup.find_all('header', class_='course-card__img-container'):
        img_tag = course_card.find('img', class_='course-card__img')
        if not img_tag:
            continue

        # The course link is the closest 'a' tag that precedes the card header.
        link_tag = course_card.find_previous('a')
        course_link = link_tag.get('href') if link_tag else None
        if not course_link:
            # No usable link: a record without a destination is useless.
            continue

        # Resolve relative hrefs against the site root; absolute URLs pass
        # through unchanged.
        course_link = urljoin('https://courses.analyticsvidhya.com/', course_link)

        courses.append({
            'title': img_tag.get('alt'),
            'image_url': img_tag.get('src'),
            'course_link': course_link,
        })

    return courses

# Function to scrape across multiple pages using BeautifulSoup
def scrape_courses_from_all_pages(base_url, total_pages):
    """Scrape every listing page and combine the results into one table.

    Pages are requested as ``{base_url}?page=1`` through
    ``{base_url}?page={total_pages}`` via ``scrape_courses_from_page``.

    Args:
        base_url: Listing URL without the ``page`` query parameter.
        total_pages: Number of pages to fetch; values below 1 fetch nothing.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``image_url`` and
        ``course_link``, one row per scraped course. The columns are present
        even when no course was found.
    """
    all_courses = []

    for page_num in range(1, total_pages + 1):
        all_courses.extend(scrape_courses_from_page(f"{base_url}?page={page_num}"))

    # Pin the schema: without explicit columns an empty scrape produces a
    # column-less frame, which is written as an empty CSV that pandas cannot
    # read back.
    return pd.DataFrame(all_courses, columns=['title', 'image_url', 'course_link'])

# Scraping target: listing URL and the number of pages to walk
base_url = "https://courses.analyticsvidhya.com/collections/courses"
total_pages = 8  # Assuming there are 8 pages of courses


# Scrape only when "scraped.csv" is missing; an existing file is reused as-is
if not os.path.exists("scraped.csv"):
    courses_df = scrape_courses_from_all_pages(base_url, total_pages)
    courses_df.to_csv("scraped.csv", index=False)


# Load scraped course data.
# A course scraped without a title is stored as an empty CSV cell and read
# back as NaN, which the embedding model cannot encode. Drop those rows and
# renumber the frame so row positions stay equal to the FAISS vector ids.
df = pd.read_csv("scraped.csv")
df = df.dropna(subset=['title']).reset_index(drop=True)

# Load the Hugging Face embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for course titles (cast to str so a purely numeric
# title parsed as a number by pandas is still embedded as text)
course_embeddings = model.encode(df['title'].astype(str).tolist())

# Create FAISS index for similarity search
dimension = course_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance
index.add(np.array(course_embeddings))

# Function to search courses using embeddings
def search_courses(query, top_k=6):
    """Return the courses whose titles are closest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in the
    module-level FAISS ``index``; matching rows are read from ``df``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of courses to return (default 6).

    Returns:
        A list of dicts with the keys ``'title'``, ``'image_url'``,
        ``'course_link'`` and ``'score'``, ordered by descending score.
        ``score`` is ``1 - distance`` as reported by the index, so higher
        means closer; it is not bounded to ``[0, 1]``. An empty list is
        returned when the index is empty, when ``top_k`` is not positive, or
        when an error occurs (the error is shown through ``st.error``).
    """
    try:
        # FAISS pads its answer with id -1 when asked for more neighbours than
        # it stores, so never request more than are available.
        k = min(top_k, index.ntotal)
        if k <= 0:
            return []

        # Generate the embedding for the query
        query_embedding = model.encode([query])

        # Search for the closest courses using FAISS
        distances, indices = index.search(np.array(query_embedding), k)

        # Collect the results based on the indices returned by FAISS
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx < 0:
                # Padding entry, not a real course; df.iloc[-1] would silently
                # return the last row instead.
                continue
            course = df.iloc[idx]
            results.append({
                'title': course['title'],
                'image_url': course['image_url'],
                'course_link': course['course_link'],
                'score': 1 - distance  # Convert distance to similarity score
            })

        return sorted(results, key=lambda x: x['score'], reverse=True)

    except Exception as e:
        # UI boundary: report the failure in the page rather than crash the app.
        st.error(f"An error occurred in search_courses: {str(e)}")
        return []

# Function to display search results in Streamlit
def display_search_results(result_list):
    """Render each search result as an image, a heading and a link button.

    Args:
        result_list: Sequence of dicts with the keys ``'title'``,
            ``'image_url'`` and ``'course_link'``. When it is empty (or
            otherwise falsy) a "no results" message is written instead.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Local import keeps this block self-contained.
    import html

    if result_list:
        for item in result_list:
            course_title = item['title']
            course_image = item['image_url']
            # The link comes from scraped page content and is injected into raw
            # HTML below, so escape it (quotes included) to keep it confined to
            # the href attribute.
            course_link = html.escape(str(item['course_link']), quote=True)

            st.image(course_image, use_column_width=True)
            st.write(f"### {course_title}")
            
            
            button_html = f"""
            <a href="{course_link}" target="_blank">
                <button style="background-color:#4CAF50; border:none; color:white; padding:10px 20px; text-align:center; text-decoration:none; display:inline-block; font-size:16px; margin:4px 2px; cursor:pointer; border-radius:5px;">
                    View Course
                </button>
            </a>
            """
            st.markdown(button_html, unsafe_allow_html=True)
    else:
        st.write("No results found. Please try a different query.")

# Streamlit UI: page title, banner image, tagline and a divider
st.title("Analytics Vidhya Free Courses🔍")
st.image("cc.jpg")
st.markdown("#### 🔍🌐 Get the most appropriate course as per your learning requirement.")
st.markdown("<hr style='border:1px solid #eee;'>", unsafe_allow_html=True)

# Keyword input plus an explicit button that triggers the search
query = st.text_input(
    label="Enter course related keywords...",
    placeholder="e.g., machine learning, data science, python",
    help="Type in a keyword to find related free courses",
)

search_button = st.button("Search")

# Search results section: prompt for input until the button is pressed with a
# non-empty query, then run the search and render the outcome
if not (search_button and query):
    st.write("Please enter a search query to find relevant courses.")
else:
    st.write("Collecting courses......")
    result_list = search_courses(query)

    if not result_list:
        st.write("No results found. Please try a different keyword.")
    else:
        st.markdown(f"### Top results for: {query}")
        display_search_results(result_list)

# Footer with subtle text
st.markdown(
    "<p style='text-align:center; color:grey;'>Made by @metechmohit </p>",
    unsafe_allow_html=True,
)