Spaces:
Sleeping
Sleeping
done
Browse files- course_titles.csv +72 -0
- docx.txt +2 -0
- embeddings_data.json +0 -0
- main.py +61 -0
- requirements.txt +8 -0
- transf.py +33 -0
- webScp.py +60 -0
- work.py +54 -0
course_titles.csv
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
title
|
| 2 |
+
Coding a ChatGPT-style Language Model from Scratch in PyTorch
|
| 3 |
+
Mastering Multilingual GenAI Open-Weights for Indic Languages
|
| 4 |
+
Learning Autonomous Driving Behaviors with LLMs & RL
|
| 5 |
+
GenAI Applied to Quantitative Finance: For Control Implementation
|
| 6 |
+
"Navigating LLM Tradeoffs: Techniques for Speed, Cost, Scale & Accuracy"
|
| 7 |
+
Creating Problem-Solving Agents using GenAI for Action Composition
|
| 8 |
+
Improving Real World RAG Systems: Key Challenges & Practical Solutions
|
| 9 |
+
Framework to Choose the Right LLM for your Business
|
| 10 |
+
Building Smarter LLMs with Mamba and State Space Model
|
| 11 |
+
Generative AI - A Way of Life - Free Course
|
| 12 |
+
Building LLM Applications using Prompt Engineering - Free Course
|
| 13 |
+
Building Your First Computer Vision Model - Free Course
|
| 14 |
+
Bagging and Boosting ML Algorithms - Free Course
|
| 15 |
+
MidJourney: From Inspiration to Implementation - Free Course
|
| 16 |
+
Understanding Linear Regression - Free Course
|
| 17 |
+
The Working of Neural Networks - Free Course
|
| 18 |
+
The A to Z of Unsupervised ML - Free Course
|
| 19 |
+
Building Your first RAG System using LlamaIndex - Free Course
|
| 20 |
+
Data Preprocessing on a Real-World Problem Statement - Free Course
|
| 21 |
+
Exploring Stability.AI - Free Course
|
| 22 |
+
Building a Text Classification Model with Natural Language Processing - Free Course
|
| 23 |
+
Getting Started with Large Language Models
|
| 24 |
+
Introduction to Generative AI
|
| 25 |
+
Nano Course: Dreambooth-Stable Diffusion for Custom Images
|
| 26 |
+
A Comprehensive Learning Path for Deep Learning in 2023
|
| 27 |
+
A Comprehensive Learning Path to Become a Data Scientist in 2024
|
| 28 |
+
Nano Course: Building Large Language Models for Code
|
| 29 |
+
Certified AI & ML BlackBelt+ Program
|
| 30 |
+
Machine Learning Summer Training
|
| 31 |
+
AI Ethics by Fractal
|
| 32 |
+
A Comprehensive Learning Path to Become a Data Engineer in 2022
|
| 33 |
+
Certified Business Analytics Program
|
| 34 |
+
Certified Machine Learning Master's Program (MLMP)
|
| 35 |
+
Certified Natural Language Processing Master’s Program
|
| 36 |
+
Certified Computer Vision Master's Program
|
| 37 |
+
Applied Machine Learning - Beginner to Professional
|
| 38 |
+
Ace Data Science Interviews
|
| 39 |
+
Writing Powerful Data Science Articles
|
| 40 |
+
Machine Learning Certification Course for Beginners
|
| 41 |
+
Data Science Career Conclave
|
| 42 |
+
Top Data Science Projects for Analysts and Data Scientists
|
| 43 |
+
Getting Started with Git and GitHub for Data Science Professionals
|
| 44 |
+
Machine Learning Starter Program
|
| 45 |
+
"Data Science Hacks, Tips and Tricks"
|
| 46 |
+
Introduction to Business Analytics
|
| 47 |
+
Introduction to PyTorch for Deep Learning
|
| 48 |
+
Introductory Data Science for Business Managers
|
| 49 |
+
Introduction to Natural Language Processing
|
| 50 |
+
Getting started with Decision Trees
|
| 51 |
+
Introduction to Python
|
| 52 |
+
Loan Prediction Practice Problem (Using Python)
|
| 53 |
+
Big Mart Sales Prediction Using R
|
| 54 |
+
Twitter Sentiment Analysis
|
| 55 |
+
Pandas for Data Analysis in Python
|
| 56 |
+
Support Vector Machine (SVM) in Python and R
|
| 57 |
+
Evaluation Metrics for Machine Learning Models
|
| 58 |
+
Fundamentals of Regression Analysis
|
| 59 |
+
Getting Started with scikit-learn (sklearn) for Machine Learning
|
| 60 |
+
Convolutional Neural Networks (CNN) from Scratch
|
| 61 |
+
Dimensionality Reduction for Machine Learning
|
| 62 |
+
K-Nearest Neighbors (KNN) Algorithm in Python and R
|
| 63 |
+
Ensemble Learning and Ensemble Learning Techniques
|
| 64 |
+
Linear Programming for Data Science Professionals
|
| 65 |
+
Naive Bayes from Scratch
|
| 66 |
+
Learn Swift for Data Science
|
| 67 |
+
Introduction to Web Scraping using Python
|
| 68 |
+
Tableau for Beginners
|
| 69 |
+
Getting Started with Neural Networks
|
| 70 |
+
Introduction to AI & ML
|
| 71 |
+
Winning Data Science Hackathons - Learn from Elite Data Scientists
|
| 72 |
+
Hypothesis Testing for Data Science and Analytics
|
docx.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1. webscapping is done using bs4
|
| 2 |
+
2.
|
embeddings_data.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
main.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Pre-trained sentence encoder, shared by course and query embedding.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the precomputed course records (title + embedding) from JSON.
with open('embeddings_data.json', 'r') as file:
    courses_data = json.load(file)

# Split the records into two parallel lists: one of embedding vectors,
# one of course titles, aligned by index.
course_embeddings = []
course_titles = []
for record in courses_data:
    course_embeddings.append(np.array(record['embedding']))
    course_titles.append(record['title'])
# Function to get query embedding
def get_query_embedding(query):
    """Encode a free-text query with the shared sentence-transformer model."""
    encoded = model.encode(query, convert_to_tensor=True)
    return encoded
| 21 |
+
|
# Function to add relevance factors
def add_relevance_factors(similarities, indices, keyword="deep learning",
                          weight=0.2, titles=None):
    """Re-rank candidate course indices, boosting titles matching *keyword*.

    Args:
        similarities: 1-D sequence of similarity scores, indexed by course.
        indices: iterable of candidate course indices to re-rank.
        keyword: lowercase substring whose presence in a title earns a boost
            (default "deep learning", preserving the original behavior).
        weight: additive boost applied on a keyword match (default 0.2).
        titles: list of course titles; defaults to the module-level
            ``course_titles`` so existing two-argument calls keep working.

    Returns:
        The candidate indices sorted by boosted score, highest first.
    """
    if titles is None:
        titles = course_titles  # fall back to the module-level title list
    enhanced_scores = []
    for idx in indices:
        # Binary curriculum match: 1 if the keyword appears in the title.
        curriculum_match = 1 if keyword in titles[idx].lower() else 0
        enhanced_scores.append((idx, similarities[idx] + weight * curriculum_match))

    # Highest boosted score first; ties keep candidate order (stable sort).
    enhanced_scores.sort(key=lambda pair: pair[1], reverse=True)
    return [idx for idx, _ in enhanced_scores]
# Streamlit UI for input
st.title('Course Recommendation System')
st.write('Enter a search query to find relevant courses')

# Input box for the search query
user_query = st.text_input('Search Query')

if user_query:
    # Embed the query in the same vector space as the courses.
    query_embedding = get_query_embedding(user_query)

    # Similarity of the query against every course embedding.
    scores = cosine_similarity([np.array(query_embedding)], course_embeddings)

    # Indices of the five highest-scoring courses, best first.
    top_k = 5
    candidates = scores[0].argsort()[-top_k:][::-1]

    # Re-rank the candidates with the curriculum-keyword boost (optional).
    ranked = add_relevance_factors(scores[0], candidates)

    # Render the recommendations.
    st.write("### Top Course Recommendations")
    for idx in ranked:
        st.write(f"**Title**: {course_titles[idx]}")
        st.write(f"**Cosine Similarity**: {scores[0][idx]:.4f}")
        st.write("-" * 50)
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
transformers
|
| 2 |
+
tf-keras
|
| 3 |
+
numpy
|
| 4 |
+
scikit-learn
|
| 5 |
+
pandas
|
| 6 |
+
tensorflow
|
| 7 |
+
streamlit
|
| 8 |
+
sentence-transformers
|
transf.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Build sentence embeddings for the scraped course titles and save to JSON."""
import json

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the CSV of scraped course titles (one 'title' column expected).
df = pd.read_csv('course_titles.csv')
sentences = df['title'].tolist()

# Load the pre-trained sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert all titles to embeddings in one batch.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Pair each title with its embedding. Using `sentence` directly avoids a
# second, redundant lookup through df['title'][i].
data = [
    {
        'title': sentence,
        'embedding': embeddings[i].tolist(),  # tensor -> JSON-serializable list
    }
    for i, sentence in enumerate(sentences)
]

# Save the records so the recommender scripts can load them without
# re-encoding the corpus.
with open('embeddings_data.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)
webScp.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import csv
|
| 4 |
+
|
## function for csv
def save_to_csv(course_titles, filename='course_titles.csv'):
    """Write the scraped course titles to *filename*, one title per row.

    A 'title' header row is written first so readers that address the
    column by name (e.g. pandas ``df['title']``) can load the file.

    Args:
        course_titles: iterable of course-title strings.
        filename: destination CSV path.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_w = csv.writer(csvfile)
        csv_w.writerow(['title'])  # header row for downstream column access
        for title in course_titles:
            csv_w.writerow([title])
    # Bug fix: the original f-string had no placeholder and printed a
    # literal instead of the actual destination path.
    print(f"Data saved to {filename}.")
| 15 |
+
|
| 16 |
+
|
| 17 |
+
|
## function for web scraping
def scrape_courses(base_url, max_pages=8):
    """Scrape course titles from paginated listing pages, then save to CSV.

    Requests pages ``base_url1`` .. ``base_url{max_pages}`` in order,
    stopping early at the first non-200 response, and collects the text
    of every <h3> inside the 'products__list' container on each page.
    """
    # Accumulator for every course title found across all pages.
    all_course_titles = []

    for page in range(1, max_pages + 1):
        # Build the URL of the current listing page.
        page_url = f"{base_url}{page}"
        print(f"Scraping page {page}: {page_url}")

        # Fetch the page.
        response = requests.get(page_url)

        # A non-200 status means we've run out of pages (or hit an error).
        if response.status_code != 200:
            print(f"Page {page} does not exist or cannot be accessed.")
            break  # stop at the first missing/unreachable page

        soup = BeautifulSoup(response.text, 'html.parser')

        # Course cards live inside the 'products__list' container.
        products_list = soup.find(class_='products__list')

        if products_list:
            # Each course title is rendered as an <h3> heading.
            for heading in products_list.find_all('h3'):
                text = heading.get_text(strip=True)
                print(f"Course Title: {text}")
                all_course_titles.append(text)
        else:
            print(f"No 'products__list' container found on page {page}.")

    # Persist everything that was collected.
    save_to_csv(all_course_titles)
| 56 |
+
|
| 57 |
+
|
## function calling
if __name__ == "__main__":
    # Run the scraper only when executed as a script, not when imported.
    base_url = 'https://courses.analyticsvidhya.com/collections?page='
    scrape_courses(base_url)
work.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Pre-trained sentence encoder, shared by course and query embedding.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the precomputed course records (title + embedding) from JSON.
with open('embeddings_data.json', 'r') as file:
    courses_data = json.load(file)

# Split the records into two parallel lists: one of embedding vectors,
# one of course titles, aligned by index.
course_embeddings = []
course_titles = []
for record in courses_data:
    course_embeddings.append(np.array(record['embedding']))
    course_titles.append(record['title'])
| 16 |
+
|
# Function to get query embedding
def get_query_embedding(query):
    """Encode a free-text query with the shared sentence-transformer model."""
    encoded = model.encode(query, convert_to_tensor=True)
    return encoded
| 20 |
+
|
# Function to add relevance factors
def add_relevance_factors(similarities, indices, keyword="deep learning",
                          weight=0.2, titles=None):
    """Re-rank candidate course indices, boosting titles matching *keyword*.

    Args:
        similarities: 1-D sequence of similarity scores, indexed by course.
        indices: iterable of candidate course indices to re-rank.
        keyword: lowercase substring whose presence in a title earns a boost
            (default "deep learning", preserving the original behavior).
        weight: additive boost applied on a keyword match (default 0.2).
        titles: list of course titles; defaults to the module-level
            ``course_titles`` so existing two-argument calls keep working.

    Returns:
        The candidate indices sorted by boosted score, highest first.
    """
    if titles is None:
        titles = course_titles  # fall back to the module-level title list
    enhanced_scores = []
    for idx in indices:
        # Binary curriculum match: 1 if the keyword appears in the title.
        curriculum_match = 1 if keyword in titles[idx].lower() else 0
        enhanced_scores.append((idx, similarities[idx] + weight * curriculum_match))

    # Highest boosted score first; ties keep candidate order (stable sort).
    enhanced_scores.sort(key=lambda pair: pair[1], reverse=True)
    return [idx for idx, _ in enhanced_scores]
| 33 |
+
|
# Example user query
user_query = "machine learning courses with deep learning"

# Embed the query in the same vector space as the courses.
query_embedding = get_query_embedding(user_query)

# Similarity of the query against every stored course embedding.
scores = cosine_similarity([np.array(query_embedding)], course_embeddings)

# Indices of the five highest-scoring courses, best first.
top_k = 5
candidates = scores[0].argsort()[-top_k:][::-1]

# Re-rank the candidates with the curriculum-keyword boost (optional).
ranked = add_relevance_factors(scores[0], candidates)

# Display the top results
for idx in ranked:
    print(f"Title: {course_titles[idx]}")
    print(f"Cosine Similarity: {scores[0][idx]:.4f}")
    print("-" * 50)