# Analytics Vidhya course scraper + semantic search (Streamlit app).
# NOTE(review): the original top lines were scraped web-page chrome
# ("sachit3071's picture / updated requirements.txt / b76b731"); converted
# to this comment so the file parses as Python.
import requests
from bs4 import BeautifulSoup
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
import json
import streamlit as st
def get_domain_link():
    """Return the base URL of the Analytics Vidhya site (no trailing slash)."""
    domain = "https://courses.analyticsvidhya.com"
    return domain
def clean_text(text):
    """Remove every newline and tab from *text* and trim surrounding whitespace.

    Single C-level pass via str.translate — equivalent to chaining
    replace("\n", "") and replace("\t", "") followed by strip().
    """
    stripped = text.translate(str.maketrans("", "", "\n\t"))
    return stripped.strip()
@st.cache_data(ttl=3600)  # Cache scrape results for 1 hour so reruns don't re-hit the site.
def get_course_details(url, max_pages=9):
    """Scrape course names, descriptions, and curriculum text from the
    Analytics Vidhya course catalogue.

    Parameters
    ----------
    url : str
        Catalogue listing URL; paginated via the ``page`` query parameter.
    max_pages : int, optional
        Number of listing pages to crawl. Defaults to 9, matching the
        original hard-coded ``range(1, 10)``.

    Returns
    -------
    list[dict]
        One dict per text fragment with keys ``text``, ``type``, ``link``
        and ``course_name``. Also written to ``content.json`` as a side
        effect so later runs can work offline.
    """
    course_texts = []
    progress_bar = st.progress(0)
    for page_no in range(1, max_pages + 1):
        print("page :", page_no)
        # Fix: the original created the progress bar but never advanced it.
        progress_bar.progress(page_no / max_pages)
        # Fix: timeout so a stalled connection cannot hang the app forever.
        response = requests.get(url, params={'page': page_no}, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")
        products_list = soup.find_all('a', class_='course-card__public')
        course_links = [course_link.get("href") for course_link in products_list]
        for course_link in course_links:
            course_url = get_domain_link() + course_link
            response = requests.get(course_url, timeout=30)
            course_soup = BeautifulSoup(response.content, "html.parser")
            heading = course_soup.find('h1', class_='section__heading')
            description = course_soup.find('div', class_='fr-view')
            if heading is None or description is None:
                # Fix: a malformed/missing course page previously raised
                # AttributeError on .get_text(); skip it instead.
                continue
            course_name = heading.get_text()
            course_description = description.get_text()
            course_curriculum_titles_raw = course_soup.find_all(
                'h5', class_='course-curriculum__chapter-title')
            course_curriculum_titles = [t.get_text() for t in course_curriculum_titles_raw]
            course_curriculum_lessons_raw = course_soup.find_all(
                'span', class_='course-curriculum__chapter-lesson')
            course_curriculum_lessons = [l.get_text() for l in course_curriculum_lessons_raw]
            course_texts.append({
                "text": course_name,
                "type": "course_name",
                "link": course_url,
                "course_name": course_name
            })
            course_texts.append({
                "text": course_description,
                "type": "course_description",
                "link": course_url,
                "course_name": course_name
            })
            for course_curriculum_title in course_curriculum_titles:
                course_texts.append({
                    "text": clean_text(course_curriculum_title),
                    "type": "title",
                    "link": course_url,
                    "course_name": course_name
                })
            for course_curriculum_lesson in course_curriculum_lessons:
                course_texts.append({
                    "text": clean_text(course_curriculum_lesson),
                    "type": "lesson",
                    "link": course_url,
                    "course_name": course_name
                })
    # Fix: explicit encoding — the platform default codepage (e.g. on
    # Windows) can fail on non-ASCII course text.
    with open('content.json', 'w', encoding='utf-8') as f:
        json.dump(course_texts, f, indent=4)
    return course_texts
def get_documents(course_texts: list):
    """Convert scraped text fragments into chunked LangChain documents.

    Parameters
    ----------
    course_texts : list[dict]
        Items with ``text``, ``type``, ``link`` and ``course_name`` keys,
        as produced by ``get_course_details`` / ``content.json``.

    Returns
    -------
    list
        ``Document`` objects split into ~1000-character chunks, each
        carrying its fragment's metadata.
    """
    # Fix: dropped the debug print that dumped the entire dataset to stdout
    # on every call.
    texts = [item["text"] for item in course_texts]
    metadatas = [
        {
            "type": item["type"],
            "link": item["link"],
            "course_name": item["course_name"],
        }
        for item in course_texts
    ]
    text_splitter = CharacterTextSplitter(chunk_size=1000)
    return text_splitter.create_documents(texts=texts, metadatas=metadatas)
def read_json_data(file_path):
    """Load and parse a JSON file.

    Parameters
    ----------
    file_path : str
        Path to the JSON file to read.

    Returns
    -------
    object | None
        The deserialized JSON data, or ``None`` if the file is missing or
        is not valid JSON (an error message is printed in either case).
    """
    try:
        # Fix: explicit encoding — JSON is UTF-8 by spec; the platform
        # default codepage can raise UnicodeDecodeError on Windows.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
        return None
def main():
    """Streamlit entry point: semantic search over cached course content.

    Reads ``content.json`` (produced by ``get_course_details``), embeds the
    fragments, and shows the courses most similar to the user's query.
    """
    st.title("Analytics Vidhya Course Scraper")
    url = get_domain_link() + "/collections/courses"
    # Live scraping is disabled; the app serves from the cached content.json.
    # courses_texts = get_course_details(url)
    query = st.text_input("What do you want to learn today", value="Large language models")
    if st.button("Fetch Courses"):
        st.info("Fetching courses please wait...")
        courses_texts = read_json_data("content.json")
        if courses_texts is None:
            # Fix: read_json_data returns None on a missing/corrupt file;
            # the original crashed inside get_documents in that case.
            st.error("Could not read content.json. Run the scraper first.")
            return
        documents = get_documents(courses_texts)
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma.from_documents(documents, embeddings)
        docs = db.similarity_search(query)
        if docs:
            st.success(f"Found {len(docs)} courses!")
            st.write("Course Names and Links:")
            for i, course in enumerate(docs):
                st.write(f"{i+1}. {course.metadata['course_name']}")
                st.write(f" -{course.metadata['link']}")
        else:
            st.warning("No courses found.")

if __name__ == "__main__":
    main()