from langchain_core.documents import Document
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import time
import os
from dotenv import load_dotenv
from extract_all_courses import scrape_all_courses


def creating_pinecone_index(embedding, index_name=None):
    """Create a Pinecone vector store backed by the given embedding model.

    Args:
        embedding: The embedding model or function used to generate
            vector embeddings.
        index_name (str, optional): Name of the Pinecone index to connect
            to. When omitted, PineconeVectorStore falls back to its own
            environment-based defaults (the original behavior).

    Returns:
        PineconeVectorStore: A vector-store handle for the index.
    """
    if index_name is None:
        return PineconeVectorStore(embedding=embedding)
    # Bug fix: the index name was previously defined but never passed here,
    # leaving the store unbound to the intended index.
    return PineconeVectorStore(embedding=embedding, index_name=index_name)


def convert_into_documents(all_course_data):
    """Convert a list of course data dictionaries into LangChain Documents.

    Each course dictionary may contain:
        - 'title' (str): The title of the course.
        - 'url' (str): The course web URL (also stored as metadata).
        - 'description' (str): A brief description of the course.
        - 'curriculum' (list of str): Curriculum items/topics covered.

    Args:
        all_course_data (list of dict): One dictionary per course.

    Returns:
        list of Document: One Document per course; `page_content` combines
        title, URL, description, and curriculum; `metadata` holds the URL
        so search results can link back to the course page.
    """
    documents = []
    for course_data in all_course_data:
        # Extracting each course detail with default empty values if keys are missing
        title = course_data.get("title", "")
        url = course_data.get("url", "")
        description = course_data.get("description", "")
        curriculum = " \n ".join(course_data.get("curriculum", []))

        # Combining the course details into a single text for embedding
        full_text = f"Title: {title} ; Web url: {url} Description: {description} ; Curriculum: {curriculum}"

        # Creation of a Document object with the combined text and metadata
        course_document = Document(
            page_content=full_text,
            metadata={"url": url}
        )
        documents.append(course_document)
    return documents


def uploading_document_to_pinecone(all_courses):
    """Replace the Pinecone index contents with the given course data.

    NOTE(review): this function relies on the module-level globals
    `pinecone_index`, `embedding`, and `index_name` that are bound in the
    __main__ block below — it is only callable after that setup has run.

    Args:
        all_courses (list of dict): A list where each element is a
            dictionary containing course information.
    """
    # Convert course data into Document objects
    final_course_data = convert_into_documents(all_courses)

    print("Deleting file")
    try:
        # Clear all existing vectors so the upload is a full refresh
        pinecone_index.delete(delete_all=True)
        time.sleep(5)  # give Pinecone a moment to apply the deletion
    except Exception as e:
        # delete_all raises when the namespace has no vectors; log why and continue
        print(f"Namespace is already empty: {e}")

    print("Uploading File to Pinecone")
    # from_documents is a classmethod — call it on the class, not the instance
    PineconeVectorStore.from_documents(final_course_data, embedding, index_name=index_name)
    print("Document Uploaded to Pinecone")


if __name__ == "__main__":
    # Pipeline: load config -> build embedding model -> connect to Pinecone
    # -> scrape all courses -> upload them to the index.

    # Load all environment variables (API keys etc.) from the .env file
    load_dotenv()

    # Initializing the embedding model for creating document vectors
    embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

    # The Pinecone index name for storing document embeddings
    index_name = "analytics-vidya-free-courses"

    # Creation of the Pinecone index using the embedding model and index name
    pinecone_index = creating_pinecone_index(embedding, index_name=index_name)

    # Defining the base URL for scraping the courses' list
    base_url = "https://courses.analyticsvidhya.com/collections/courses"

    # Scraping all courses available on the base URL
    all_courses = scrape_all_courses(base_url)

    # Uploading the scraped courses' data to the Pinecone index
    uploading_document_to_pinecone(all_courses)