navneetsatyamkumar committed on
Commit
a9faa64
·
verified ·
1 Parent(s): f6475d0

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +80 -0
  2. course_details.json +181 -0
  3. course_faiss.index +0 -0
  4. index_courses.py +39 -0
  5. requirements.txt +90 -0
  6. scrape_courses.py +85 -0
app.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import faiss
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ import json
6
+
7
# Root of the course site. It ends with "/", so every entry in ``course_paths``
# is stored WITHOUT a leading slash; ``base_url + path`` then never produces a
# "//courses/..." URL.
base_url = "https://courses.analyticsvidhya.com/"

# Course page paths relative to ``base_url``. Search hits are mapped to these
# entries by position, so the order must stay aligned with the records in
# course_details.json and the vectors in the FAISS index.
course_paths = [
    "courses/frameworks-for-effective-problem-solving",
    "courses/your-ultimate-guide-to-becoming-an-agentic-ai-expert-by-2025",
    "courses/a-comprehensive-learning-path-to-become-a-data-analyst-in-2025",
    "courses/reimagining-genai-common-mistakes-and-best-practices-for-success",
    "courses/coding-a-chatgpt-style-language-model-from-scratch-in-pytorch",
    "courses/mastering-multilingual-genai-open-weights-for-indic-languages",
    "courses/learning-autonomous-driving-behaviors-with-llms-and-rl",
    "courses/genai-applied-to-quantitative-finance-for-control-implementation",
    "courses/navigating-llm-tradeoffs-techniques-for-speed-cost-scale-and-accuracy",
    "courses/applied-machine-learning-beginner-to-professional",
    "courses/ace-data-science-interviews",
    "courses/data-science-hacks-tips-and-tricks",
    "courses/getting-started-with-decision-trees",
    "courses/loan-prediction-practice-problem-using-python",
    "courses/big-mart-sales-prediction-using-r",
    "courses/twitter-sentiment-analysis",
    "courses/pandas-for-data-analysis-in-python",
    "courses/support-vector-machine-svm-in-python-and-r",
    "courses/nano-course-dreambooth-stable-diffusion-for-custom-images",
    "courses/building-large-language-models-for-code",
    "courses/cutting-edge-llm-tricks",
]
31
+
32
@st.cache_resource
def _load_search_resources():
    """Load the FAISS index, the course records and the embedding model once.

    Streamlit re-executes this script on every user interaction; caching the
    heavyweight objects avoids re-reading the index and re-initializing the
    sentence-transformer model on each rerun.

    Returns:
        A ``(faiss_index, course_records, encoder)`` tuple.
    """
    faiss_index = faiss.read_index("course_faiss.index")
    with open("course_details.json", "r", encoding="utf-8") as f:
        course_records = json.load(f)
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return faiss_index, course_records, encoder


# Module-level names used by the rest of the script.
index, course_details, model = _load_search_resources()
37
+
38
def search_courses(query, top_k=5):
    """Return the courses whose embeddings are closest to ``query``.

    The query is encoded with the module-level ``model`` and looked up in the
    module-level FAISS ``index``; each hit is resolved by position against
    ``course_details`` and ``course_paths``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return.

    Returns:
        A list of dicts, in the order returned by the FAISS search, with the
        keys ``title``, ``description``, ``curriculum``, ``additional_info``,
        ``link`` and ``distance``. May hold fewer than ``top_k`` entries.
    """
    # FAISS expects a 2-D float32 array of shape (n_queries, dim).
    query_embedding = np.asarray(model.encode([query]), dtype="float32")

    distances, indices = index.search(query_embedding, top_k)

    # Join base URL and path with exactly one slash, whichever side carries it.
    site_root = base_url.rstrip("/")

    results = []
    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads the result with -1 when fewer than top_k vectors are
        # available; a negative position would silently wrap around and show
        # the last course instead. Also skip positions the metadata lacks.
        if idx < 0 or idx >= len(course_details):
            continue
        idx = int(idx)
        course = course_details[idx]
        # NOTE(review): assumes course_details and course_paths are aligned by
        # position; the scraper drops pages it fails to fetch - confirm.
        results.append({
            "title": course["title"],
            "description": course["description"],
            "curriculum": course["curriculum"],
            "additional_info": course["additional_info"],
            "link": f"{site_root}/{course_paths[idx].lstrip('/')}",
            "distance": dist,
        })
    return results
57
+
58
# Streamlit page: a title, a query box and one rendered section per hit.


def _render_course(course):
    """Draw a single search result on the page.

    Shows the heading and description, then the curriculum and the
    additional-information sections when they are non-empty, and finally a
    link to the course page.
    """
    st.subheader(course['title'])
    st.write(course['description'])

    curriculum = course['curriculum']
    if curriculum:
        st.write("### Curriculum")
        for entry in curriculum:
            st.write(f"- {entry}")

    info = course['additional_info']
    if info:
        st.write("### Additional Information")
        st.write(f"**Duration:** {info.get('duration', 'N/A')}")
        st.write(f"**Rating:** {info.get('rating', 'N/A')}")
        st.write(f"**Difficulty:** {info.get('difficulty', 'N/A')}")

    st.markdown(f"[Learn More]({course['link']})")


st.title("Smart Search for Free Courses")
st.write("Search for free courses on Analytics Vidhya!")

query = st.text_input("Enter your query:")
if query:
    results = search_courses(query)
    for res in results:
        _render_course(res)
course_details.json ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "title": "Frameworks for Effective Problem Solving",
4
+ "description": "Learn structured thinking with SMART, MECE, and Issue Trees. Define clear problem statements, solve challenges systematically, and avoid common pitfalls.",
5
+ "curriculum": [],
6
+ "additional_info": {
7
+ "duration": "1 Hour",
8
+ "rating": "4.9/5",
9
+ "difficulty": "Beginner"
10
+ }
11
+ },
12
+ {
13
+ "title": "Anyone can Build AI Agents - Free Course",
14
+ "description": "Dive into the world of AI with ease! This short and engaging course introduces the exciting possibilities of creating AI agents using no-code platforms. Whether you're new to AI or just curious, we've designed this course to be beginner-friendly.",
15
+ "curriculum": [],
16
+ "additional_info": {}
17
+ },
18
+ {
19
+ "title": "A Comprehensive Learning Path to Become a Data Analyst in 2025",
20
+ "description": "Want to become a data analyst this year, but confused about where to start and what to follow? This comprehensive learning path from Analytics Vidhya should provide you with all the answers you need!",
21
+ "curriculum": [],
22
+ "additional_info": {
23
+ "duration": "2 Hours",
24
+ "rating": "4.8/5",
25
+ "difficulty": "Beginner"
26
+ }
27
+ },
28
+ {
29
+ "title": "Reimagining GenAI: Common Mistakes and Best Practices for Success",
30
+ "description": "Discover the Secrets to Implementing Generative AI Successfully",
31
+ "curriculum": [],
32
+ "additional_info": {
33
+ "duration": "1 Hour",
34
+ "rating": "4.8/5",
35
+ "difficulty": "Beginner"
36
+ }
37
+ },
38
+ {
39
+ "title": "Coding a ChatGPT-style Language Model from Scratch in PyTorch",
40
+ "description": "Learn to build your own language model with PyTorch step-by-step.",
41
+ "curriculum": [],
42
+ "additional_info": {
43
+ "duration": "1 Hour",
44
+ "rating": "4.8/5",
45
+ "difficulty": "Beginner"
46
+ }
47
+ },
48
+ {
49
+ "title": "Mastering Multilingual GenAI Open-Weights for Indic Languages",
50
+ "description": "Unlock the power of open-weight models to build cutting-edge multilingual AI solutions.",
51
+ "curriculum": [],
52
+ "additional_info": {
53
+ "duration": "1 Hour",
54
+ "rating": "4.7/5",
55
+ "difficulty": "Beginner"
56
+ }
57
+ },
58
+ {
59
+ "title": "Learning Autonomous Driving Behaviors with LLMs & RL",
60
+ "description": "Learn to train autonomous driving agents using Reinforcement Learning (RL) and Large Language Models (LLMs). Gain practical experience designing AI systems that simulate safe, human-like driving behavior.",
61
+ "curriculum": [],
62
+ "additional_info": {
63
+ "duration": "1 Hour",
64
+ "rating": "4.7/5",
65
+ "difficulty": "Intermediate"
66
+ }
67
+ },
68
+ {
69
+ "title": "GenAI Applied to Quantitative Finance: For Control Implementation",
70
+ "description": "Embark on the journey to understand quantitative finance with GenAI. Learn to implement AI-driven control systems for trading, risk management, and predictive modeling, optimizing financial decision-making and performance.",
71
+ "curriculum": [],
72
+ "additional_info": {
73
+ "duration": "1 Hour",
74
+ "rating": "4.7/5",
75
+ "difficulty": "Intermediate"
76
+ }
77
+ },
78
+ {
79
+ "title": "Navigating LLM Tradeoffs: Techniques for Speed, Cost, Scale & Accuracy",
80
+ "description": "Master the art of optimizing LLMs with practical techniques to achieve the best balance of performance and cost.",
81
+ "curriculum": [],
82
+ "additional_info": {
83
+ "duration": "1 Hour",
84
+ "rating": "4.8/5",
85
+ "difficulty": "Beginner"
86
+ }
87
+ },
88
+ {
89
+ "title": "Applied Machine Learning - Beginner to Professional",
90
+ "description": "This course provides you all the tools and techniques you need to apply machine learning to solve business problems. We will cover the basics of machine learning, how to build machine learning models, improve and deploy your machine learning models.",
91
+ "curriculum": [],
92
+ "additional_info": {}
93
+ },
94
+ {
95
+ "title": "Ace Data Science Interviews",
96
+ "description": "A comprehensive course covering different kinds of interviews in data science industry and how to ace these interviews. This includes technical interviews on data science / machine learning, case study interviews, guesstimate based interviews.",
97
+ "curriculum": [],
98
+ "additional_info": {}
99
+ },
100
+ {
101
+ "title": "Data Science Hacks, Tips and Tricks",
102
+ "description": "Become a better data scientist with crucial data science tips, tricks, python hacks, and efficient python code. Get python efficiency tips from industry experts at your finger tips.",
103
+ "curriculum": [],
104
+ "additional_info": {}
105
+ },
106
+ {
107
+ "title": "Getting started with Decision Trees",
108
+ "description": "Unleash the power of decision tree algorithm in machine learning with our free decision tree course and training designed for beginners to learn coding in python.",
109
+ "curriculum": [],
110
+ "additional_info": {}
111
+ },
112
+ {
113
+ "title": "Loan Prediction Practice Problem (Using Python)",
114
+ "description": "This course is aimed for people getting started into Data Science and Machine Learning while working on a real life practical problem.",
115
+ "curriculum": [],
116
+ "additional_info": {
117
+ "rating": "4.7/5",
118
+ "difficulty": "Intermediate"
119
+ }
120
+ },
121
+ {
122
+ "title": "Big Mart Sales Prediction Using R",
123
+ "description": "This course is aimed for people getting started into Data Science and Machine Learning while solving the Big Mart Sales Prediction problem.",
124
+ "curriculum": [],
125
+ "additional_info": {
126
+ "rating": "4.6/5",
127
+ "difficulty": "Intermediate"
128
+ }
129
+ },
130
+ {
131
+ "title": "Twitter Sentiment Analysis",
132
+ "description": "What is sentiment analysis? Why is sentiment analysis so popular in data science? And how can you perform sentiment analysis? Find the answers to all these questions in this free course on Sentiment Analysis using Python!",
133
+ "curriculum": [],
134
+ "additional_info": {
135
+ "rating": "4.7/5",
136
+ "difficulty": "Intermediate"
137
+ }
138
+ },
139
+ {
140
+ "title": "Pandas for Data Analysis in Python",
141
+ "description": "Learn high-performance pandas in python tutorial, pandas library in python for data analysis in data science. Explore python libraries for data science in this exemplary free course.",
142
+ "curriculum": [],
143
+ "additional_info": {}
144
+ },
145
+ {
146
+ "title": "Support Vector Machine (SVM) in Python and R",
147
+ "description": "Upskill with Support Vector Machine (SVM) in python, learn about SVM implementation in python from scratch in this free course for data scientists to ace in their data science career.",
148
+ "curriculum": [],
149
+ "additional_info": {}
150
+ },
151
+ {
152
+ "title": "Nano Course: Dreambooth-Stable Diffusion for Custom Images",
153
+ "description": "Theory to Practice: Dive into Stable Diffusion, its history, and significance, then master the Dreambooth process. Learn how to fine-tune Dreambooth model with your custom images discussing step by step in detail.",
154
+ "curriculum": [],
155
+ "additional_info": {
156
+ "duration": "1 Hour",
157
+ "rating": "4.6/5",
158
+ "difficulty": "Advanced"
159
+ }
160
+ },
161
+ {
162
+ "title": "Nano Course: Building Large Language Models for Code",
163
+ "description": "Learn how to train Large Language Models for Code from Scratch covering each step involved in detail from training data curation to model evaluation. Deep dive into the journey of creating Starcoder, a 15B parameter code generation model.",
164
+ "curriculum": [],
165
+ "additional_info": {
166
+ "duration": "38 Mins",
167
+ "rating": "4.7",
168
+ "difficulty": "Intermediate"
169
+ }
170
+ },
171
+ {
172
+ "title": "Nano Course: Cutting Edge LLM Tricks",
173
+ "description": "Learn cutting edge LLM tricks and techniques from top research papers including DeepMind and Meta AI and apply these tricks in building your own state of the art LLMs.",
174
+ "curriculum": [],
175
+ "additional_info": {
176
+ "duration": "38 Mins",
177
+ "rating": "4.7/5",
178
+ "difficulty": "Advanced"
179
+ }
180
+ }
181
+ ]
course_faiss.index ADDED
Binary file (32.3 kB). View file
 
index_courses.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import faiss
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ from scrape_courses import all_course_details # Import the scraped course data
6
+
7
# Sentence-embedding model used to turn course text into vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')


def store_in_faiss(course_details):
    """Build a FAISS L2 index over each course's title and description.

    Args:
        course_details: Non-empty sequence of dicts, each providing string
            values under ``"title"`` and ``"description"``.

    Returns:
        A ``faiss.IndexFlatL2`` with one vector added per course, in the
        order the courses were given.

    Raises:
        ValueError: If ``course_details`` is empty, since there would be no
            embedding to take the index dimensionality from.
    """
    if not course_details:
        raise ValueError("course_details is empty; nothing to index")

    # Title and description are embedded together so a query can match either.
    combined_texts = [
        course["title"] + " " + course["description"]
        for course in course_details
    ]

    # FAISS works on float32 matrices of shape (n_courses, dimension).
    embeddings = np.asarray(model.encode(combined_texts), dtype="float32")

    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)  # exact search, L2 distance
    index.add(embeddings)

    return index
32
+
33
# Embed every scraped course and collect the vectors in a FAISS index.
faiss_index = store_in_faiss(all_course_details)

# Persist the index to disk.
faiss.write_index(faiss_index, "course_faiss.index")

print("Indexing completed. FAISS index saved to 'course_faiss.index'.")
requirements.txt ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohappyeyeballs==2.4.4
2
+ aiohttp==3.11.11
3
+ aiosignal==1.3.2
4
+ altair==5.5.0
5
+ annotated-types==0.7.0
6
+ anyio==4.7.0
7
+ attrs==24.3.0
8
+ beautifulsoup4==4.12.3
9
+ blinker==1.9.0
10
+ bs4==0.0.2
11
+ cachetools==5.5.0
12
+ certifi==2024.12.14
13
+ charset-normalizer==3.4.1
14
+ click==8.1.8
15
+ faiss-cpu==1.9.0.post1
16
+ filelock==3.16.1
17
+ frozenlist==1.5.0
18
+ fsspec==2024.12.0
19
+ gitdb==4.0.12
20
+ GitPython==3.1.44
21
+ h11==0.14.0
22
+ httpcore==1.0.7
23
+ httpx==0.28.1
24
+ huggingface-hub==0.27.0
25
+ idna==3.10
26
+ Jinja2==3.1.5
27
+ joblib==1.4.2
28
+ jsonpatch==1.33
29
+ jsonpointer==3.0.0
30
+ jsonschema==4.23.0
31
+ jsonschema-specifications==2024.10.1
32
+ langchain==0.3.13
33
+ langchain-core==0.3.28
34
+ langchain-text-splitters==0.3.4
35
+ langsmith==0.2.7
36
+ markdown-it-py==3.0.0
37
+ MarkupSafe==3.0.2
38
+ mdurl==0.1.2
39
+ mpmath==1.3.0
40
+ multidict==6.1.0
41
+ narwhals==1.20.1
42
+ networkx==3.4.2
43
+ numpy==2.2.1
44
+ orjson==3.10.13
45
+ packaging==24.2
46
+ pandas==2.2.3
47
+ pillow==11.1.0
48
+ pinecone-client==5.0.1
49
+ pinecone-plugin-inference==1.1.0
50
+ pinecone-plugin-interface==0.0.7
51
+ propcache==0.2.1
52
+ protobuf==5.29.2
53
+ pyarrow==18.1.0
54
+ pydantic==2.10.4
55
+ pydantic_core==2.27.2
56
+ pydeck==0.9.1
57
+ Pygments==2.18.0
58
+ python-dateutil==2.9.0.post0
59
+ pytz==2024.2
60
+ PyYAML==6.0.2
61
+ referencing==0.35.1
62
+ regex==2024.11.6
63
+ requests==2.32.3
64
+ requests-toolbelt==1.0.0
65
+ rich==13.9.4
66
+ rpds-py==0.22.3
67
+ safetensors==0.5.0
68
+ scikit-learn==1.6.0
69
+ scipy==1.14.1
70
+ sentence-transformers==3.3.1
71
+ setuptools==75.6.0
72
+ six==1.17.0
73
+ smmap==5.0.2
74
+ sniffio==1.3.1
75
+ soupsieve==2.6
76
+ SQLAlchemy==2.0.36
77
+ streamlit==1.41.1
78
+ sympy==1.13.1
79
+ tenacity==9.0.0
80
+ threadpoolctl==3.5.0
81
+ tokenizers==0.21.0
82
+ toml==0.10.2
83
+ torch==2.5.1
84
+ tornado==6.4.2
85
+ tqdm==4.67.1
86
+ transformers==4.47.1
87
+ typing_extensions==4.12.2
88
+ tzdata==2024.2
89
+ urllib3==2.3.0
90
+ yarl==1.18.3
scrape_courses.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
# Root of the course site. It ends with "/", so every entry in ``course_paths``
# is stored WITHOUT a leading slash; ``base_url + path`` then never produces a
# "//courses/..." URL.
base_url = "https://courses.analyticsvidhya.com/"

# Course page paths, relative to ``base_url``, in the order they are scraped.
course_paths = [
    "courses/frameworks-for-effective-problem-solving",
    "courses/your-ultimate-guide-to-becoming-an-agentic-ai-expert-by-2025",
    "courses/a-comprehensive-learning-path-to-become-a-data-analyst-in-2025",
    "courses/reimagining-genai-common-mistakes-and-best-practices-for-success",
    "courses/coding-a-chatgpt-style-language-model-from-scratch-in-pytorch",
    "courses/mastering-multilingual-genai-open-weights-for-indic-languages",
    "courses/learning-autonomous-driving-behaviors-with-llms-and-rl",
    "courses/genai-applied-to-quantitative-finance-for-control-implementation",
    "courses/navigating-llm-tradeoffs-techniques-for-speed-cost-scale-and-accuracy",
    "courses/applied-machine-learning-beginner-to-professional",
    "courses/ace-data-science-interviews",
    "courses/data-science-hacks-tips-and-tricks",
    "courses/getting-started-with-decision-trees",
    "courses/loan-prediction-practice-problem-using-python",
    "courses/big-mart-sales-prediction-using-r",
    "courses/twitter-sentiment-analysis",
    "courses/pandas-for-data-analysis-in-python",
    "courses/support-vector-machine-svm-in-python-and-r",
    "courses/nano-course-dreambooth-stable-diffusion-for-custom-images",
    "courses/building-large-language-models-for-code",
    "courses/cutting-edge-llm-tricks",
]
30
+
31
def scrape_course_details(course_path):
    """Fetch one course page and extract its title, description and metadata.

    Args:
        course_path: Course path relative to ``base_url``; a leading slash is
            accepted but not required.

    Returns:
        A dict with the keys ``title`` (str), ``description`` (str),
        ``curriculum`` (list of str) and ``additional_info`` (dict that may
        contain ``duration``, ``rating`` and ``difficulty``), or ``None`` when
        the page could not be fetched.
    """
    # Join with exactly one slash regardless of which side already has it.
    url = base_url.rstrip("/") + "/" + course_path.lstrip("/")

    try:
        # Without a timeout one stalled connection would hang the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Treat network errors like a bad status: report and skip the course.
        print(f"Failed to fetch {url}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Title: text of the first <h1>.
    heading = soup.find("h1")
    title = heading.text.strip() if heading is not None else "No title found"

    # Description: content of <meta name="description">. Using .get() keeps a
    # tag that lacks the content attribute from raising KeyError.
    meta = soup.find("meta", {"name": "description"})
    content = meta.get("content") if meta is not None else None
    description = content.strip() if content is not None else "No description found"

    # Curriculum: <h4> texts of the list following the "Course curriculum" heading.
    curriculum = []
    curriculum_header = soup.find("h3", class_="section__heading", string="Course curriculum")
    if curriculum_header is not None:
        curriculum_list = curriculum_header.find_next("ul", class_="text-icon__list section__body")
        if curriculum_list is not None:
            curriculum = [item.get_text(strip=True) for item in curriculum_list.find_all("h4")]

    # Additional info: each list item is identified by its icon's CSS class.
    # Tuples are (icon class, result key, fallback when the <h4> is missing).
    icon_fields = (
        ("fa-clock-o", "duration", "No duration"),
        ("fa-star", "rating", "No rating"),
        ("fa-signal", "difficulty", "No difficulty level"),
    )
    additional_info = {}
    for item in soup.select(".text-icon__list-item"):
        icon = item.find("i")
        if icon is None:
            continue
        icon_classes = icon.get("class", [])
        for css_class, key, fallback in icon_fields:
            if css_class in icon_classes:
                value = item.find("h4")
                additional_info[key] = value.text.strip() if value is not None else fallback
                break  # first matching icon class wins, as in an if/elif chain

    return {
        "title": title,
        "description": description,
        "curriculum": curriculum,
        "additional_info": additional_info,
    }
74
+
75
import json

# Scrape every configured course in order; pages that could not be fetched
# come back as None and are dropped.
all_course_details = list(filter(None, map(scrape_course_details, course_paths)))

# Save the collected records as pretty-printed JSON.
with open("course_details.json", "w") as out:
    json.dump(all_course_details, out, indent=4)

print("Scraping completed. Details saved to 'course_details.json'.")