File size: 5,225 Bytes
5dab304
 
 
 
 
 
df90043
5dab304
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b76b731
5dab304
 
b76b731
5dab304
 
 
 
 
 
 
 
 
 
 
39947ea
 
 
 
 
 
 
 
 
 
 
5dab304
 
 
2c9a929
b76b731
2c9a929
5dab304
 
2c9a929
b76b731
5dab304
 
 
 
 
 
39947ea
2c9a929
39947ea
2fb7a93
 
5dab304
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import requests
from bs4 import BeautifulSoup
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
import json
import streamlit as st



def get_domain_link():
    """Return the base URL of the Analytics Vidhya courses site."""
    return "https://courses.analyticsvidhya.com"

def clean_text(text):
    """Drop newlines and tabs from *text*, then trim surrounding whitespace."""
    for unwanted in ("\n", "\t"):
        text = text.replace(unwanted, "")
    return text.strip()

@st.cache_data(ttl=3600)  # Cache for 1 hour so repeated runs don't re-scrape
def get_course_details(url):
    """Scrape course names, descriptions, and curricula from the listing at *url*.

    Walks listing pages 1-9, follows every course link found on each page, and
    collects one record per text fragment:
    ``{"text", "type", "link", "course_name"}`` where ``type`` is one of
    ``course_name`` / ``course_description`` / ``title`` / ``lesson``.
    The full record list is also written to ``content.json``.

    Returns the list of record dicts (possibly empty if every request failed).
    """
    course_texts = []
    total_pages = 9  # pages 1..9, matching the original range(1, 10)
    progress_bar = st.progress(0)
    for page_no in range(1, total_pages + 1):
        print("page :", page_no)
        try:
            response = requests.get(url, params={'page': page_no}, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            # Best effort: a failed listing page should not abort the scrape.
            continue
        soup = BeautifulSoup(response.content, "html.parser")

        products_list = soup.find_all('a', class_='course-card__public')
        # Guard against anchors without an href (would have produced None links).
        course_links = [a.get("href") for a in products_list if a.get("href")]

        for course_link in course_links:
            course_url = get_domain_link() + course_link
            try:
                response = requests.get(course_url, timeout=30)
                response.raise_for_status()
            except requests.RequestException:
                continue
            course_soup = BeautifulSoup(response.content, "html.parser")

            name_tag = course_soup.find('h1', class_='section__heading')
            if name_tag is None:
                # Layout changed or non-course page; nothing usable to record.
                continue
            course_name = name_tag.get_text()
            desc_tag = course_soup.find('div', class_='fr-view')
            course_description = desc_tag.get_text() if desc_tag else ""

            title_tags = course_soup.find_all('h5', class_='course-curriculum__chapter-title')
            course_curriculum_titles = [t.get_text() for t in title_tags]

            lesson_tags = course_soup.find_all('span', class_='course-curriculum__chapter-lesson')
            course_curriculum_lessons = [s.get_text() for s in lesson_tags]

            course_texts.append({
                    "text": course_name,
                    "type": "course_name",
                    "link": course_url,
                    "course_name": course_name
                })
            course_texts.append({
                    "text": course_description,
                    "type": "course_description",
                    "link": course_url,
                    "course_name": course_name
                })

            for course_curriculum_title in course_curriculum_titles:
                course_texts.append({
                    "text": clean_text(course_curriculum_title),
                    "type": "title",
                    "link": course_url,
                    "course_name": course_name
                })

            for course_curriculum_lesson in course_curriculum_lessons:
                course_texts.append({
                    "text": clean_text(course_curriculum_lesson),
                    "type": "lesson",
                    "link": course_url,
                    "course_name": course_name
                })
        # BUG FIX: the progress bar was created but never advanced.
        progress_bar.progress(page_no / total_pages)
    with open('content.json', 'w') as f:
        json.dump(course_texts, f, indent=4)
    return course_texts

def get_documents(course_texts: list):
    """Convert scraped course records into LangChain documents.

    Each record contributes its ``"text"`` field as document content; the
    remaining fields (``type``, ``link``, ``course_name``) become per-document
    metadata. Texts are split into chunks of at most 1000 characters.

    Returns the list of `Document` objects produced by the splitter.
    """
    # NOTE: removed a leftover debug print that dumped the entire dataset
    # to stdout on every call.
    texts = [record["text"] for record in course_texts]
    metadatas = [
        {
            "type": record["type"],
            "link": record["link"],
            "course_name": record["course_name"],
        }
        for record in course_texts
    ]
    text_splitter = CharacterTextSplitter(chunk_size=1000)
    return text_splitter.create_documents(texts=texts, metadatas=metadatas)

def read_json_data(file_path):
    """Load and return the JSON content of *file_path*.

    Returns the parsed object, or None when the file is missing or does not
    contain valid JSON (an error message is printed in either case).
    """
    # Re-indented to 4 spaces for consistency with the rest of the file.
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
        return None

def main():
    """Streamlit entry point: semantic search over cached course data.

    Reads pre-scraped records from ``content.json``, embeds them into an
    in-memory Chroma store, and shows the courses most similar to the query.
    """
    st.title("Analytics Vidhya Course Scraper")
    url = get_domain_link() + "/collections/courses"
    # courses_texts = get_course_details(url)  # uncomment to re-scrape live
    query = st.text_input("What do you want to learn today", value="Large language models")

    if st.button("Fetch Courses"):
        st.info("Fetching courses please wait...")
        courses_texts = read_json_data("content.json")
        # BUG FIX: read_json_data returns None for a missing/corrupt file;
        # the original passed None straight into get_documents and crashed.
        if not courses_texts:
            st.error("No cached course data found. Run the scraper first to create content.json.")
            return
        documents = get_documents(courses_texts)
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma.from_documents(documents, embeddings)
        docs = db.similarity_search(query)

        if docs:
            st.success(f"Found {len(docs)} courses!")
            st.write("Course Names and Links:")
            for i, course in enumerate(docs):
                st.write(f"{i+1}. {course.metadata['course_name']}")
                st.write(f"   -{course.metadata['link']}")
        else:
            st.warning("No courses found.")

# Launch the Streamlit app when executed directly (e.g. `streamlit run <file>`).
if __name__ == "__main__":
    main()