Spaces:

metechmohit
/

Smart_Search_LLM

Build error

App Files Files Community

Smart_Search_LLM / app.py

metechmohit

Update app.py

fd87433 verified about 1 year ago

raw

history blame contribute delete

5.51 kB

	import pandas as pd
	import requests
	import streamlit as st
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np
	import os
	import requests
	from bs4 import BeautifulSoup

	# Function to scrape courses from a single page using BeautifulSoup
	def scrape_courses_from_page(url, timeout=30):
	    """Scrape the course cards listed on a single catalogue page.

	    Args:
	        url: Address of the listing page to download.
	        timeout: Seconds ``requests`` waits for the server before giving
	            up; without a timeout a stalled connection blocks forever.

	    Returns:
	        A list of dicts with the keys ``title``, ``image_url`` and
	        ``course_link``, one per card that has both an image and a link.
	    """
	    response = requests.get(url, timeout=timeout)
	    soup = BeautifulSoup(response.content, 'html.parser')

	    courses = []

	    # Each card header wraps the thumbnail; its alt text is used as the title.
	    for course_card in soup.find_all('header', class_='course-card__img-container'):
	        img_tag = course_card.find('img', class_='course-card__img')
	        if not img_tag:
	            continue

	        # The course URL is taken from the nearest <a> preceding the header.
	        link_tag = course_card.find_previous('a')
	        course_link = link_tag.get('href') if link_tag else None
	        if not course_link:
	            # No anchor or no href: skip the card instead of failing on None.
	            continue

	        # Relative links are resolved against the site root.
	        if not course_link.startswith('http'):
	            course_link = 'https://courses.analyticsvidhya.com' + course_link

	        courses.append({
	            'title': img_tag.get('alt'),
	            'image_url': img_tag.get('src'),
	            'course_link': course_link,
	        })

	    return courses

	# Function to scrape across multiple pages using BeautifulSoup
	def scrape_courses_from_all_pages(base_url, total_pages):
	    """Collect the courses of pages 1..total_pages into one DataFrame.

	    Every page is requested as ``{base_url}?page=<n>`` and parsed with
	    ``scrape_courses_from_page``; the per-page lists are concatenated in
	    page order.
	    """
	    collected = [
	        course
	        for number in range(1, total_pages + 1)
	        for course in scrape_courses_from_page(f"{base_url}?page={number}")
	    ]
	    return pd.DataFrame(collected)

	# Define base URL and total pages
	base_url = "https://courses.analyticsvidhya.com/collections/courses"
	total_pages = 8 # Assuming there are 8 pages of courses


	# Scrape only when no CSV is on disk yet; later runs reuse the saved file.
	if not os.path.exists("scraped.csv"):
	    courses_df = scrape_courses_from_all_pages(base_url, total_pages)
	    courses_df.to_csv("scraped.csv", index=False)


	# Load Scraped courses data
	df = pd.read_csv("scraped.csv")

	# A card without alt text is written as an empty cell and read back as NaN,
	# which is not a string the embedding model can encode. Drop such rows and
	# renumber so that FAISS row positions keep matching df.iloc positions.
	df = df.dropna(subset=['title']).reset_index(drop=True)

	# Load the Hugging Face embedding model
	model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

	# Generate embeddings for course titles (astype(str) guards against titles
	# that pandas parsed as numbers).
	course_embeddings = model.encode(df['title'].astype(str).tolist())

	# Create FAISS index for similarity search
	dimension = course_embeddings.shape[1]
	index = faiss.IndexFlatL2(dimension) # Using L2 distance
	index.add(np.array(course_embeddings))

	# Function to search courses using embeddings
	def search_courses(query, top_k=6):
	    """Return the catalogue entries whose titles are closest to *query*.

	    Args:
	        query: Free-text search string.
	        top_k: Maximum number of matches to return (defaults to 6).

	    Returns:
	        A list of dicts with the keys ``title``, ``image_url``,
	        ``course_link`` and ``score``, highest score first. An empty list
	        is returned when nothing is indexed, or, after reporting the
	        problem through ``st.error``, when any step raises.
	    """
	    try:
	        # Asking FAISS for more neighbours than it stores pads the answer
	        # with -1 ids, which df.iloc would silently map to the last row.
	        limit = min(top_k, index.ntotal)
	        if limit <= 0:
	            return []

	        # Generate the embedding for the query
	        query_embedding = model.encode([query])

	        # Search for the closest courses using FAISS
	        distances, indices = index.search(np.array(query_embedding), limit)

	        # Collect the results based on the indices returned by FAISS
	        results = []
	        for idx, distance in zip(indices[0], distances[0]):
	            if idx < 0:
	                # Padding entry, not a real neighbour.
	                continue
	            course = df.iloc[idx]
	            results.append({
	                'title': course['title'],
	                'image_url': course['image_url'],
	                'course_link': course['course_link'],
	                # Smaller distance -> larger score; only used for ranking.
	                'score': 1 - distance,
	            })

	        return sorted(results, key=lambda x: x['score'], reverse=True)

	    except Exception as e:
	        # UI boundary: surface the failure in the page instead of crashing.
	        st.error(f"An error occurred in search_courses: {str(e)}")
	        return []

	# Function to display search results in Streamlit
	def display_search_results(result_list):
	    """Render each search hit as an image, a heading and a link button.

	    Args:
	        result_list: Sequence of dicts providing the keys ``title``,
	            ``image_url`` and ``course_link``. When it is empty a
	            "no results" message is written instead.
	    """
	    import html  # local import: only needed to escape the link below

	    if not result_list:
	        st.write("No results found. Please try a different query.")
	        return

	    for item in result_list:
	        course_title = item['title']
	        course_image = item['image_url']
	        course_link = item['course_link']

	        # NOTE(review): recent Streamlit releases deprecate use_column_width
	        # in favour of use_container_width - confirm against the pinned version.
	        st.image(course_image, use_column_width=True)
	        st.write(f"### {course_title}")

	        # The link comes from scraped pages and is injected into raw HTML, so
	        # escape it to keep quotes or angle brackets from breaking the markup.
	        safe_link = html.escape(str(course_link), quote=True)

	        button_html = f"""
	<a href="{safe_link}" target="_blank">
	<button style="background-color:#4CAF50; border:none; color:white; padding:10px 20px; text-align:center; text-decoration:none; display:inline-block; font-size:16px; margin:4px 2px; cursor:pointer; border-radius:5px;">
	View Course
	</button>
	</a>
	"""
	        st.markdown(button_html, unsafe_allow_html=True)

	# Streamlit UI
	# Page header: title, banner image, tagline and a thin separator.
	st.title("Analytics Vidhya Free Courses🔍")
	st.image("cc.jpg")
	st.markdown("#### 🔍🌐 Get the most appropriate course as per your learning requirement.")
	st.markdown("<hr style='border:1px solid #eee;'>", unsafe_allow_html=True)

	# Search controls.
	query = st.text_input(
	    "Enter course related keywords...",
	    help="Type in a keyword to find related free courses",
	    placeholder="e.g., machine learning, data science, python",
	)
	search_button = st.button("Search")

	# Results are shown only after the button was pressed with a non-empty query.
	if not (search_button and query):
	    st.write("Please enter a search query to find relevant courses.")
	else:
	    st.write("Collecting courses......")
	    result_list = search_courses(query)

	    if not result_list:
	        st.write("No results found. Please try a different keyword.")
	    else:
	        st.markdown(f"### Top results for: {query}")
	        display_search_results(result_list)

	# Footer with subtle text
	footer_html = "<p style='text-align:center; color:grey;'>Made by @metechmohit </p>"
	st.markdown(footer_html, unsafe_allow_html=True)