RiH-137Rishi commited on
Commit
ab476b4
·
verified ·
1 Parent(s): f495de6
Files changed (8) hide show
  1. course_titles.csv +72 -0
  2. docx.txt +2 -0
  3. embeddings_data.json +0 -0
  4. main.py +61 -0
  5. requirements.txt +8 -0
  6. transf.py +33 -0
  7. webScp.py +60 -0
  8. work.py +54 -0
course_titles.csv ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title
2
+ Coding a ChatGPT-style Language Model from Scratch in PyTorch
3
+ Mastering Multilingual GenAI Open-Weights for Indic Languages
4
+ Learning Autonomous Driving Behaviors with LLMs & RL
5
+ GenAI Applied to Quantitative Finance: For Control Implementation
6
+ "Navigating LLM Tradeoffs: Techniques for Speed, Cost, Scale & Accuracy"
7
+ Creating Problem-Solving Agents using GenAI for Action Composition
8
+ Improving Real World RAG Systems: Key Challenges & Practical Solutions
9
+ Framework to Choose the Right LLM for your Business
10
+ Building Smarter LLMs with Mamba and State Space Model
11
+ Generative AI - A Way of Life - Free Course
12
+ Building LLM Applications using Prompt Engineering - Free Course
13
+ Building Your First Computer Vision Model - Free Course
14
+ Bagging and Boosting ML Algorithms - Free Course
15
+ MidJourney: From Inspiration to Implementation - Free Course
16
+ Understanding Linear Regression - Free Course
17
+ The Working of Neural Networks - Free Course
18
+ The A to Z of Unsupervised ML - Free Course
19
+ Building Your first RAG System using LlamaIndex - Free Course
20
+ Data Preprocessing on a Real-World Problem Statement - Free Course
21
+ Exploring Stability.AI - Free Course
22
+ Building a Text Classification Model with Natural Language Processing - Free Course
23
+ Getting Started with Large Language Models
24
+ Introduction to Generative AI
25
+ Nano Course: Dreambooth-Stable Diffusion for Custom Images
26
+ A Comprehensive Learning Path for Deep Learning in 2023
27
+ A Comprehensive Learning Path to Become a Data Scientist in 2024
28
+ Nano Course: Building Large Language Models for Code
29
+ Certified AI & ML BlackBelt+ Program
30
+ Machine Learning Summer Training
31
+ AI Ethics by Fractal
32
+ A Comprehensive Learning Path to Become a Data Engineer in 2022
33
+ Certified Business Analytics Program
34
+ Certified Machine Learning Master's Program (MLMP)
35
+ Certified Natural Language Processing Master’s Program
36
+ Certified Computer Vision Master's Program
37
+ Applied Machine Learning - Beginner to Professional
38
+ Ace Data Science Interviews
39
+ Writing Powerful Data Science Articles
40
+ Machine Learning Certification Course for Beginners
41
+ Data Science Career Conclave
42
+ Top Data Science Projects for Analysts and Data Scientists
43
+ Getting Started with Git and GitHub for Data Science Professionals
44
+ Machine Learning Starter Program
45
+ "Data Science Hacks, Tips and Tricks"
46
+ Introduction to Business Analytics
47
+ Introduction to PyTorch for Deep Learning
48
+ Introductory Data Science for Business Managers
49
+ Introduction to Natural Language Processing
50
+ Getting started with Decision Trees
51
+ Introduction to Python
52
+ Loan Prediction Practice Problem (Using Python)
53
+ Big Mart Sales Prediction Using R
54
+ Twitter Sentiment Analysis
55
+ Pandas for Data Analysis in Python
56
+ Support Vector Machine (SVM) in Python and R
57
+ Evaluation Metrics for Machine Learning Models
58
+ Fundamentals of Regression Analysis
59
+ Getting Started with scikit-learn (sklearn) for Machine Learning
60
+ Convolutional Neural Networks (CNN) from Scratch
61
+ Dimensionality Reduction for Machine Learning
62
+ K-Nearest Neighbors (KNN) Algorithm in Python and R
63
+ Ensemble Learning and Ensemble Learning Techniques
64
+ Linear Programming for Data Science Professionals
65
+ Naive Bayes from Scratch
66
+ Learn Swift for Data Science
67
+ Introduction to Web Scraping using Python
68
+ Tableau for Beginners
69
+ Getting Started with Neural Networks
70
+ Introduction to AI & ML
71
+ Winning Data Science Hackathons - Learn from Elite Data Scientists
72
+ Hypothesis Testing for Data Science and Analytics
docx.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ 1. web scraping is done using bs4
2
+ 2.
embeddings_data.json ADDED
The diff for this file is too large to render. See raw diff
 
main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-embedding model once at startup.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the pre-computed course embeddings (produced by transf.py).
with open('embeddings_data.json', 'r') as file:
    courses_data = json.load(file)

# Stack the embeddings into a single (n_courses, dim) matrix so the
# similarity computation below is one vectorized call instead of a
# list of per-row arrays.
course_embeddings = np.vstack([np.array(course['embedding']) for course in courses_data])
course_titles = [course['title'] for course in courses_data]


def get_query_embedding(query):
    """Encode *query* and return its embedding as a 1-D numpy array.

    convert_to_tensor=False makes encode() return numpy directly,
    avoiding the original torch-tensor -> np.array() round trip
    (which would fail outright on a GPU-resident tensor).
    """
    return model.encode(query, convert_to_tensor=False)


def add_relevance_factors(similarities, indices, *, keyword="deep learning",
                          relevance_factor=0.2):
    """Re-rank *indices* by cosine similarity plus a keyword-match bonus.

    Args:
        similarities: 1-D array of cosine similarities, indexed by course.
        indices: candidate course indices to re-rank.
        keyword: curriculum keyword that earns the bonus (case-insensitive).
        relevance_factor: weight added when *keyword* appears in the title.

    Returns:
        The candidate indices sorted by enhanced score, highest first.
    """
    enhanced_scores = []
    for idx in indices:
        curriculum_match = 1 if keyword in course_titles[idx].lower() else 0
        enhanced_score = similarities[idx] + relevance_factor * curriculum_match
        enhanced_scores.append((idx, enhanced_score))

    enhanced_scores.sort(key=lambda x: x[1], reverse=True)
    return [x[0] for x in enhanced_scores]


# Streamlit UI for input
st.title('Course Recommendation System')
st.write('Enter a search query to find relevant courses')

# Input box for the search query
user_query = st.text_input('Search Query')

if user_query:
    # Embed the query and compare it against every course embedding at once.
    query_embedding = get_query_embedding(user_query)
    cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), course_embeddings)

    # Top-5 most similar courses. argsort is ascending, so take the tail
    # and reverse it for highest-first order.
    top_k = 5
    top_indices = cosine_similarities[0].argsort()[-top_k:][::-1]

    # Optionally boost courses whose title matches the curriculum keyword.
    top_indices_with_relevance = add_relevance_factors(cosine_similarities[0], top_indices)

    # Display the top results.
    st.write("### Top Course Recommendations")
    for i in top_indices_with_relevance:
        st.write(f"**Title**: {course_titles[i]}")
        st.write(f"**Cosine Similarity**: {cosine_similarities[0][i]:.4f}")
        st.write("-" * 50)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ tf-keras
3
+ numpy
4
+ scikit-learn
5
+ pandas
6
+ tensorflow
7
+ streamlit
8
+ sentence-transformers
transf.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Encode every course title from course_titles.csv into a sentence
# embedding and persist (title, embedding) pairs as JSON for the
# recommendation scripts (main.py / work.py).
import json

import numpy as np  # NOTE(review): unused here; kept since removing a file-level import could break callers
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load the course titles scraped by webScp.py.
df = pd.read_csv('course_titles.csv')
sentences = df['title'].tolist()

# Load the pre-trained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all titles. convert_to_tensor=False returns numpy arrays whose
# .tolist() is exactly what we serialize below — no torch round trip needed.
embeddings = model.encode(sentences, convert_to_tensor=False)

# Pair each title with its embedding; JSON needs plain lists, so convert
# each embedding row via .tolist().
data = [
    {'title': title, 'embedding': embedding.tolist()}
    for title, embedding in zip(sentences, embeddings)
]

# Save data to JSON file.
with open('embeddings_data.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)
webScp.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import csv
4
+
5
## function for csv
def save_to_csv(course_titles, filename='course_titles.csv'):
    """Write *course_titles* to *filename* as a one-column CSV.

    A 'title' header row is written first: transf.py loads this file
    with pandas and indexes df['title'], so the header is required.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        csv_w = csv.writer(csvfile)
        # Header row expected by downstream consumers (df['title']).
        csv_w.writerow(['title'])
        for title in course_titles:
            csv_w.writerow([title])
    # Bug fix: the original f-string contained no placeholder, so it
    # printed a literal instead of the actual output path.
    print(f"Data saved to {filename}.")
15
+
16
+
17
+
## function for web scraping
def scrape_courses(base_url, max_pages=8):
    """Scrape course titles from the paginated collection at *base_url*.

    Visits base_url + "1" .. base_url + str(max_pages), pulls every <h3>
    inside the 'products__list' container on each page, and saves all
    collected titles to a CSV via save_to_csv().
    """
    ## list to store all course titles
    all_course_titles = []

    for i in range(1, max_pages + 1):
        # Construct the URL for the current page
        url = f"{base_url}{i}"
        print(f"Scraping page {i}: {url}")

        # timeout keeps a dead/slow server from hanging the whole scrape.
        response = requests.get(url, timeout=10)

        # A non-200 status means we ran past the last page (or the site
        # is refusing us) — stop paginating either way.
        if response.status_code != 200:
            print(f"Page {i} does not exist or cannot be accessed.")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        # The course cards live inside the 'products__list' container.
        products_list = soup.find(class_='products__list')

        if products_list:
            # Each course title is rendered as an <h3>.
            for title in products_list.find_all('h3'):
                title_text = title.get_text(strip=True)
                print(f"Course Title: {title_text}")
                all_course_titles.append(title_text)  # Add title to the list
        else:
            print(f"No 'products__list' container found on page {i}.")

    # saving the course titles (possibly empty) to a CSV file
    save_to_csv(all_course_titles)
56
+
57
+
58
## script entry point
base_url = 'https://courses.analyticsvidhya.com/collections?page='

# Guard the network call so importing this module (e.g. to reuse
# save_to_csv/scrape_courses) does not kick off a scrape as a side effect.
if __name__ == "__main__":
    scrape_courses(base_url)
work.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load the pre-computed course embeddings (produced by transf.py).
with open('embeddings_data.json', 'r') as file:
    courses_data = json.load(file)

# Stack the embeddings into one (n_courses, dim) matrix so the similarity
# computation below is a single vectorized call.
course_embeddings = np.vstack([np.array(course['embedding']) for course in courses_data])
course_titles = [course['title'] for course in courses_data]


def get_query_embedding(query):
    """Encode *query* and return its embedding as a 1-D numpy array.

    convert_to_tensor=False makes encode() return numpy directly, avoiding
    the original torch-tensor -> np.array() round trip (which would fail
    on a GPU-resident tensor).
    """
    return model.encode(query, convert_to_tensor=False)


def add_relevance_factors(similarities, indices, *, keyword="deep learning",
                          relevance_factor=0.2):
    """Re-rank *indices* by cosine similarity plus a keyword-match bonus.

    Args:
        similarities: 1-D array of cosine similarities, indexed by course.
        indices: candidate course indices to re-rank.
        keyword: curriculum keyword that earns the bonus (case-insensitive).
        relevance_factor: weight added when *keyword* appears in the title.

    Returns:
        The candidate indices sorted by enhanced score, highest first.
    """
    enhanced_scores = []
    for idx in indices:
        curriculum_match = 1 if keyword in course_titles[idx].lower() else 0
        enhanced_score = similarities[idx] + relevance_factor * curriculum_match
        enhanced_scores.append((idx, enhanced_score))

    enhanced_scores.sort(key=lambda x: x[1], reverse=True)
    return [x[0] for x in enhanced_scores]


# Example user query
user_query = "machine learning courses with deep learning"

# Embed the query and compare it against every course embedding at once.
query_embedding = get_query_embedding(user_query)
cosine_similarities = cosine_similarity(query_embedding.reshape(1, -1), course_embeddings)

# Top-5 most similar courses. argsort is ascending, so take the tail and
# reverse it for highest-first order.
top_k = 5
top_indices = cosine_similarities[0].argsort()[-top_k:][::-1]

# Apply relevance factors (optional)
top_indices_with_relevance = add_relevance_factors(cosine_similarities[0], top_indices)

# Display the top results
for i in top_indices_with_relevance:
    print(f"Title: {course_titles[i]}")
    print(f"Cosine Similarity: {cosine_similarities[0][i]:.4f}")
    print("-" * 50)