from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import time
import os
from dotenv import load_dotenv
from extract_all_courses import scrape_all_courses


def creating_pinecone_index(embedding, index_name=None):
    """Create a Pinecone vector store backed by the given embedding model.

    Args:
        embedding: The embedding model or function used to generate
            vector embeddings.
        index_name (str, optional): Name of the Pinecone index to connect
            to. When omitted, PineconeVectorStore falls back to its own
            environment-based defaults (the original behavior).

    Returns:
        PineconeVectorStore: A vector-store handle for the index.
    """
    if index_name is None:
        return PineconeVectorStore(embedding=embedding)
    # Bug fix: the index name was previously defined but never passed here,
    # leaving the store unbound to the intended index.
    return PineconeVectorStore(embedding=embedding, index_name=index_name)


def convert_into_documents(all_course_data):
    """Convert a list of course data dictionaries into LangChain Documents.

    Each course dictionary may contain:
        - 'title' (str): The title of the course.
        - 'url' (str): The course web URL (also stored as metadata).
        - 'description' (str): A brief description of the course.
        - 'curriculum' (list of str): Curriculum items/topics covered.

    Args:
        all_course_data (list of dict): One dictionary per course.

    Returns:
        list of Document: One Document per course; `page_content` combines
        title, URL, description, and curriculum; `metadata` holds the URL
        so search results can link back to the course page.
    """
    documents = []
    for course_data in all_course_data:
        # Extracting each course detail with default empty values if keys are missing
        title = course_data.get("title", "")
        url = course_data.get("url", "")
        description = course_data.get("description", "")
        curriculum = " \n ".join(course_data.get("curriculum", []))

        # Combining the course details into a single text for embedding
        full_text = f"Title: {title} ; Web url: {url} Description: {description} ; Curriculum: {curriculum}"

        # Creation of a Document object with the combined text and metadata
        course_document = Document(
            page_content=full_text,
            metadata={"url": url}
        )
        documents.append(course_document)
    return documents


def uploading_document_to_pinecone(all_courses):
    """Replace the Pinecone index contents with the given course data.

    NOTE(review): this function relies on the module-level globals
    `pinecone_index`, `embedding`, and `index_name` that are bound in the
    __main__ block below — it is only callable after that setup has run.

    Args:
        all_courses (list of dict): A list where each element is a
            dictionary containing course information.
    """
    # Convert course data into Document objects
    final_course_data = convert_into_documents(all_courses)

    print("Deleting file")
    try:
        # Clear all existing vectors so the upload is a full refresh
        pinecone_index.delete(delete_all=True)
        time.sleep(5)  # give Pinecone a moment to apply the deletion
    except Exception as e:
        # delete_all raises when the namespace has no vectors; log why and continue
        print(f"Namespace is already empty: {e}")

    print("Uploading File to Pinecone")
    # from_documents is a classmethod — call it on the class, not the instance
    PineconeVectorStore.from_documents(final_course_data, embedding, index_name=index_name)
    print("Document Uploaded to Pinecone")


if __name__ == "__main__":
    # Pipeline: load config -> build embedding model -> connect to Pinecone
    # -> scrape all courses -> upload them to the index.

    # Load all environment variables (API keys etc.) from the .env file
    load_dotenv()

    # Initializing the embedding model for creating document vectors
    embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

    # The Pinecone index name for storing document embeddings
    index_name = "analytics-vidya-free-courses"

    # Creation of the Pinecone index using the embedding model and index name
    pinecone_index = creating_pinecone_index(embedding, index_name=index_name)

    # Defining the base URL for scraping the courses' list
    base_url = "https://courses.analyticsvidhya.com/collections/courses"

    # Scraping all courses available on the base URL
    all_courses = scrape_all_courses(base_url)

    # Uploading the scraped courses' data to the Pinecone index
    uploading_document_to_pinecone(all_courses)