Spaces:

aadil732
/

Analytics-Vidya-Free-Courses

Runtime error

App Files Files Community

Analytics-Vidya-Free-Courses / src /uploading_to_database.py

aadil732

First Commit

86ec184 about 1 year ago

raw

history blame contribute delete

4.4 kB

	from langchain_core.documents import Document
	from langchain_pinecone import PineconeVectorStore
	from langchain_google_genai import GoogleGenerativeAIEmbeddings

	import time
	import os
	from dotenv import load_dotenv

	from extract_all_courses import scrape_all_courses

	def creating_pinecone_index(embedding):
	    """
	    Build the Pinecone vector store used by the rest of this script.

	    Only the embedding model is handed to ``PineconeVectorStore``; no index
	    name or API key is passed from here, so the store presumably resolves
	    them from its own configuration (e.g. environment variables) -- confirm
	    against the installed ``langchain_pinecone`` version.

	    Args:
	        embedding (object): The embedding model used to generate vector embeddings.

	    Returns:
	        PineconeVectorStore: The vector store constructed with ``embedding``.
	    """
	    return PineconeVectorStore(embedding=embedding)

	def convert_into_documents(all_course_data):
	    """
	    Convert a list of course dictionaries into Langchain Document objects.

	    Each course dictionary may contain the following keys:
	        - 'title' (str): The title of the course.
	        - 'url' (str): The web address of the course page.
	        - 'description' (str): A brief description of the course.
	        - 'curriculum' (list of str): Curriculum items/topics covered in the course.

	    A key that is missing and a key that is present but set to ``None`` are
	    treated the same way: as an empty value. A curriculum supplied as a single
	    string is treated as a one-item list, and non-string curriculum items are
	    converted with ``str()``.

	    Args:
	        all_course_data (list of dict | None): One dictionary per course.
	            ``None`` or an empty list yields an empty result.

	    Returns:
	        list of Document: One Document per course. ``page_content`` holds the
	        combined course text and ``metadata`` holds ``{"url": <course url>}``.
	    """
	    documents = []

	    # ``or []`` makes a scraper result of None behave like "no courses found".
	    for course_data in all_course_data or []:

	        # ``.get(key) or ""`` (rather than ``.get(key, "")``) also covers keys
	        # that exist but hold None; otherwise the literal text "None" would be
	        # embedded and a null url would be stored in the metadata.
	        title = course_data.get("title") or ""
	        url = course_data.get("url") or ""
	        description = course_data.get("description") or ""

	        curriculum_items = course_data.get("curriculum") or []
	        if isinstance(curriculum_items, str):
	            # Joining a bare string would interleave the separator between
	            # its individual characters, so wrap it as a single item.
	            curriculum_items = [curriculum_items]
	        curriculum = " \n ".join(str(item) for item in curriculum_items)

	        # Combine the course details into the single text that gets embedded.
	        # The layout is kept exactly as before so stored content stays comparable.
	        full_text = f"Title: {title} ; Web url: {url} Description: {description} ; Curriculum: {curriculum}"

	        # The url is duplicated into the metadata so it travels with the vector.
	        documents.append(
	            Document(
	                page_content=full_text,
	                metadata={"url": url},
	            )
	        )

	    return documents


	def uploading_document_to_pinecone(all_courses):
	    """
	    Replace the contents of the Pinecone index with the given courses.

	    The courses are converted into Document objects, the vectors currently in
	    the index are deleted, and the new documents are embedded and uploaded.
	    If there is nothing to upload, the index is left untouched so that an
	    empty scrape result cannot wipe previously stored data.

	    Note:
	        This function reads the module-level names ``pinecone_index``,
	        ``embedding`` and ``index_name``. In this file they are only assigned
	        inside the ``if __name__ == "__main__":`` block, so they must exist
	        before this function is called.

	    Args:
	        all_courses (list of dict): A list where each element is a dictionary
	            containing course information.
	    """

	    # Convert course data into Document objects
	    final_course_data = convert_into_documents(all_courses)

	    # Never clear the index when there is no replacement data to put in it.
	    if not final_course_data:
	        print("No course documents to upload; leaving the Pinecone index untouched")
	        return

	    print("Deleting file")
	    try:
	        # Deleting all existing data from the Pinecone index
	        pinecone_index.delete(delete_all=True)
	    except Exception as e:
	        # Best effort: a failed delete must not abort the upload, but the real
	        # cause is reported instead of always claiming the index was empty.
	        print(f"Could not clear the Pinecone index (it may already be empty): {e}")
	    else:
	        # Pause after a successful delete before uploading -- presumably to
	        # let the deletion settle on the Pinecone side; confirm if 5s is needed.
	        time.sleep(5)

	    print("Uploading File to Pinecone")

	    # Embed the documents and upload them to the index named ``index_name``
	    pinecone_index.from_documents(final_course_data, embedding, index_name=index_name)
	    print("Document Uploaded to Pinecone")


	if __name__ == "__main__":
	    # Script entry point: scrape the courses, embed them and upload them to
	    # a Pinecone index.
	    #
	    # Steps:
	    #   1. Load environment variables from a `.env` file.
	    #   2. Define the Pinecone index name and initialise Google's embedding model.
	    #   3. Create the Pinecone vector store from the embedding model.
	    #   4. Scrape all courses from the base URL.
	    #   5. Upload the scraped course data to the Pinecone index.
	    #
	    # NOTE: `index_name`, `embedding` and `pinecone_index` have to stay
	    # module-level names, because uploading_document_to_pinecone reads them
	    # as globals rather than receiving them as arguments.

	    # Step 1: make the variables from the .env file available to the process
	    load_dotenv()

	    # Step 2: index name for the stored embeddings, and the embedding model
	    index_name = "analytics-vidya-free-courses"
	    embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

	    # Step 3: vector store built around the embedding model
	    pinecone_index = creating_pinecone_index(embedding)

	    # Step 4: scrape every course listed under the base URL
	    base_url = "https://courses.analyticsvidhya.com/collections/courses"
	    all_courses = scrape_all_courses(base_url)

	    # Step 5: push the scraped courses into the Pinecone index
	    uploading_document_to_pinecone(all_courses)