sachit3071 committed
Commit 5dab304 · unverified · 1 Parent(s): b190e67

added content

Files changed (3)
  1. app.py +122 -0
  2. content.json +0 -0
  3. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,122 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from langchain_chroma import Chroma
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_text_splitters import CharacterTextSplitter
+ import streamlit as st
+ import sentence_transformers  # runtime dependency of HuggingFaceEmbeddings
+ import json
+
+
+ def get_domain_link():
+     return "https://courses.analyticsvidhya.com"
+
+
+ def clean_text(text):
+     text = text.replace("\n", "")
+     text = text.replace("\t", "")
+     return text.strip()
+
+
+ @st.cache_data(ttl=3600)  # Cache scraped course data for 1 hour
+ def get_course_details(url):
+     course_texts = []
+     progress_bar = st.progress(0)
+     for page_no in range(1, 10):
+         print("page :", page_no)
+         progress_bar.progress(page_no / 9)
+         response = requests.get(url, params={'page': page_no})
+         soup = BeautifulSoup(response.content, "html.parser")
+
+         # Collect links to the individual course pages from the catalog listing
+         products_list = soup.find_all('a', class_='course-card__public')
+         course_links = [course_link.get("href") for course_link in products_list]
+
+         for course_link in course_links:
+             course_url = get_domain_link() + course_link
+             response = requests.get(course_url)
+             course_soup = BeautifulSoup(response.content, "html.parser")
+
+             course_name = course_soup.find('h1', class_='section__heading').get_text()
+             course_description = course_soup.find('div', class_='fr-view').get_text()
+
+             course_curriculum_titles_raw = course_soup.find_all('h5', class_='course-curriculum__chapter-title')
+             course_curriculum_titles = [course_curriculum_title.get_text() for course_curriculum_title in course_curriculum_titles_raw]
+
+             course_curriculum_lessons_raw = course_soup.find_all('span', class_='course-curriculum__chapter-lesson')
+             course_curriculum_lessons = [course_curriculum_lesson.get_text() for course_curriculum_lesson in course_curriculum_lessons_raw]
+
+             course_texts.append({
+                 "text": course_name,
+                 "type": "course_name",
+                 "link": course_url,
+                 "course_name": course_name
+             })
+             course_texts.append({
+                 "text": course_description,
+                 "type": "course_description",
+                 "link": course_url,
+                 "course_name": course_name
+             })
+
+             for course_curriculum_title in course_curriculum_titles:
+                 title = clean_text(course_curriculum_title)
+                 course_text = {
+                     "text": title,
+                     "type": "title",
+                     "link": course_url,
+                     "course_name": course_name
+                 }
+                 course_texts.append(course_text)
+
+             for course_curriculum_lesson in course_curriculum_lessons:
+                 lesson = clean_text(course_curriculum_lesson)
+                 course_text = {
+                     "text": lesson,
+                     "type": "lesson",
+                     "link": course_url,
+                     "course_name": course_name
+                 }
+                 course_texts.append(course_text)
+
+     # Persist the scraped records so they can be reused without re-scraping
+     with open('content.json', 'w') as f:
+         json.dump(course_texts, f, indent=4)
+     return course_texts
+
+
+ def get_documents(course_texts):
+     texts = []
+     metadatas = []
+     for course_text in course_texts:
+         texts.append(course_text["text"])
+         metadatas.append({
+             "type": course_text["type"],
+             "link": course_text["link"],
+             "course_name": course_text["course_name"]
+         })
+     text_splitter = CharacterTextSplitter(chunk_size=1000)
+     documents = text_splitter.create_documents(texts=texts, metadatas=metadatas)
+     return documents
+
+
+ def main():
+     st.title("Analytics Vidhya Course Scraper")
+
+     # The query is matched against the scraped course texts via similarity search
+     query = st.text_input("Enter a search query")
+
+     if st.button("Fetch Courses"):
+         url = get_domain_link() + "/collections/courses"
+         courses_texts = get_course_details(url)
+
+         documents = get_documents(courses_texts)
+         embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+         db = Chroma.from_documents(documents, embeddings)
+         docs = db.similarity_search(query)
+
+         if docs:
+             st.success(f"Found {len(docs)} matching results!")
+             st.write("Course Links:")
+             for course in docs:
+                 st.write(f"- {course.metadata['course_name']}")
+                 st.write(f"- {course.metadata['link']}")
+         else:
+             st.warning("No courses found.")
+
+
+ if __name__ == "__main__":
+     main()
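Assuming the standard Streamlit workflow, the app should be runnable locally with pip install -r requirements.txt followed by streamlit run app.py; on the Space itself, app.py is the entry point.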
content.json ADDED
The diff for this file is too large to render. See raw diff
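The records in content.json are the ones assembled by get_course_details() in app.py above; a single entry (values shown here are purely illustrative) looks roughly like:
{"text": "Introduction to the course", "type": "lesson", "link": "https://courses.analyticsvidhya.com/courses/...", "course_name": "..."}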
 
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ requests==2.31.0
+ beautifulsoup4==4.12.2
+ langchain-core==0.1.12
+ langchain-community==0.0.19
+ langchain-text-splitters==0.0.1
+ langchain-huggingface==0.0.9
+ langchain-chroma
+ sentence-transformers
+ python-dotenv==1.0.0
+ streamlit