Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from langchain_chroma import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain_text_splitters import CharacterTextSplitter | |
| import json | |
| import streamlit as st | |
def get_domain_link():
    """Return the base URL of the Analytics Vidhya courses site."""
    domain = "https://courses.analyticsvidhya.com"
    return domain
def clean_text(text):
    """Drop newline and tab characters, then trim surrounding whitespace.

    Single C-level pass via str.translate instead of chained replace calls;
    the result is identical: "\n" and "\t" are removed (not replaced by a
    space), then the ends are stripped.
    """
    collapsed = text.translate(str.maketrans("", "", "\n\t"))
    return collapsed.strip()
# NOTE(review): the original comment said "Cache for 1 hour" but no
# @st.cache_data(ttl=3600) decorator was ever applied — consider adding one
# so repeated Streamlit reruns don't re-scrape the whole catalogue.
def get_course_details(url, max_pages=9):
    """Scrape course names, descriptions, and curricula from the listing at *url*.

    Walks paginated listing pages 1..max_pages, follows each course link,
    and collects one record per text fragment. Every record is a dict with
    keys "text", "type" (course_name / course_description / title / lesson),
    "link", and "course_name". The full list is also written to content.json.

    Args:
        url: listing-page URL (paginated via the ?page= query parameter).
        max_pages: number of listing pages to walk (default 9, matching the
            original hard-coded range(1, 10)).

    Returns:
        list[dict]: the collected course text records.

    Raises:
        requests.HTTPError: if a listing page returns an error status.
    """
    course_texts = []
    progress_bar = st.progress(0)
    for page_no in range(1, max_pages + 1):
        print("page :", page_no)
        response = requests.get(url, params={'page': page_no})
        response.raise_for_status()  # fail loudly instead of parsing an error page
        soup = BeautifulSoup(response.content, "html.parser")
        products_list = soup.find_all('a', class_='course-card__public')
        course_links = [course_link.get("href") for course_link in products_list]
        for course_link in course_links:
            course_url = get_domain_link() + course_link
            course_response = requests.get(course_url)
            course_soup = BeautifulSoup(course_response.content, "html.parser")

            # Guard against layout drift: the original called .get_text() on
            # find() results directly and crashed (AttributeError) when a
            # selector matched nothing.
            name_tag = course_soup.find('h1', class_='section__heading')
            if name_tag is None:
                continue  # not a course page we understand; skip it
            course_name = name_tag.get_text()

            desc_tag = course_soup.find('div', class_='fr-view')
            course_description = desc_tag.get_text() if desc_tag is not None else ""

            title_tags = course_soup.find_all('h5', class_='course-curriculum__chapter-title')
            course_curriculum_titles = [tag.get_text() for tag in title_tags]
            lesson_tags = course_soup.find_all('span', class_='course-curriculum__chapter-lesson')
            course_curriculum_lessons = [tag.get_text() for tag in lesson_tags]

            course_texts.append({
                "text": course_name,
                "type": "course_name",
                "link": course_url,
                "course_name": course_name
            })
            course_texts.append({
                "text": course_description,
                "type": "course_description",
                "link": course_url,
                "course_name": course_name
            })
            for course_curriculum_title in course_curriculum_titles:
                course_texts.append({
                    "text": clean_text(course_curriculum_title),
                    "type": "title",
                    "link": course_url,
                    "course_name": course_name
                })
            for course_curriculum_lesson in course_curriculum_lessons:
                course_texts.append({
                    "text": clean_text(course_curriculum_lesson),
                    "type": "lesson",
                    "link": course_url,
                    "course_name": course_name
                })
        # Fix: the progress bar was created but never advanced in the original.
        progress_bar.progress(page_no / max_pages)

    with open('content.json', 'w', encoding='utf-8') as f:
        json.dump(course_texts, f, indent=4)
    return course_texts
def get_documents(course_texts: list):
    """Convert scraped course records into LangChain Documents.

    Args:
        course_texts: list of dicts with "text", "type", "link", and
            "course_name" keys (as produced by get_course_details), or
            None / empty when content.json was missing or invalid.

    Returns:
        list: Document chunks (chunk_size=1000) with per-record metadata;
        an empty list for empty or None input.
    """
    # Fix: read_json_data() returns None on a missing/corrupt file and the
    # original crashed with a TypeError when iterating it. It also dumped
    # the entire dataset to stdout via a leftover debug print — removed.
    if not course_texts:
        return []
    texts = [record["text"] for record in course_texts]
    metadatas = [
        {
            "type": record["type"],
            "link": record["link"],
            "course_name": record["course_name"],
        }
        for record in course_texts
    ]
    text_splitter = CharacterTextSplitter(chunk_size=1000)
    return text_splitter.create_documents(texts=texts, metadatas=metadatas)
def read_json_data(file_path):
    """Load and parse a JSON file.

    Prints a diagnostic and returns None when the file does not exist or
    does not contain valid JSON; otherwise returns the parsed object.
    """
    try:
        with open(file_path, 'r') as fh:
            return json.load(fh)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
    return None
def main():
    """Streamlit entry point: load scraped course data, embed it, and run
    a similarity search against the user's query.

    Reads pre-scraped records from content.json (the live scrape via
    get_course_details(url) is disabled; re-enable it to refresh the file),
    builds an in-memory Chroma index over MiniLM embeddings, and lists the
    matching course names and links.
    """
    st.title("Analytics Vidhya Course Scraper")
    url = get_domain_link() + "/collections/courses"
    # courses_texts = get_course_details(url)  # uncomment to re-scrape content.json
    query = st.text_input("What do you want to learn today", value="Large language models")
    if st.button("Fetch Courses"):
        st.info("Fetching courses please wait...")
        courses_texts = read_json_data("content.json")
        # Fix: read_json_data returns None when content.json is missing or
        # corrupt; the original passed None straight into get_documents and
        # crashed. Surface the problem to the user instead.
        if courses_texts is None:
            st.error("content.json is missing or invalid — run the scraper first.")
            return
        documents = get_documents(courses_texts)
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        db = Chroma.from_documents(documents, embeddings)
        docs = db.similarity_search(query)
        if docs:
            st.success(f"Found {len(docs)} courses!")
            st.write("Course Names and Links:")
            for i, course in enumerate(docs):
                st.write(f"{i+1}. {course.metadata['course_name']}")
                st.write(f" -{course.metadata['link']}")
        else:
            st.warning("No courses found.")
# Run the app when executed directly (e.g. `streamlit run <this file>`);
# importing the module elsewhere does not trigger the UI.
if __name__ == "__main__":
    main()