In [None]:
% pip install numpy==1.26.4 \
openai==1.44.1 \
qdrant-client==1.11.2 \
langchain==0.3.0 \
langchain-text-splitters==0.3.0 \
langchain-community==0.3.0 \
langchain_experimental \
langchain_qdrant \
langchain_openai \
pypdf==4.3.1 \
PyMuPDF==1.24.10 \
pymupdf4llm \
sentence_transformers \
langchain_huggingface 

In [6]:
BOR_FILE_PATH = "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf"
NIST_FILE_PATH = "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"
SMALL_DOC = "https://arxiv.org/pdf/1908.10084"  
documents_to_preload = [
    BOR_FILE_PATH,
    NIST_FILE_PATH
    # SMALL_DOC
]


In [1]:
# Embedding model - snowflake-arctic-embed-l
from langchain_huggingface import HuggingFaceEmbeddings

model_name = "jeevanions/finetuned_arctic-embedd-l"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

  from tqdm.autonotebook import tqdm, trange


In [4]:
from enum import Enum
from typing import List
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document
import asyncio

class PDFLoaderWrapper():
    class LoaderType(str, Enum):
        PYMUPDF = "pymupdf"

    def __init__(self, file_path: str | List[str] , loader_type: LoaderType = LoaderType.PYMUPDF):
        self.file_path = file_path if isinstance(file_path, list) else [file_path]
        self.loader_type = loader_type

    async def aload(self) -> List[Document]:
        all_docs = []
        for file_path in self.file_path:
            if self.loader_type == self.LoaderType.PYMUPDF:
                try:
                    loader = PyMuPDFLoader(file_path)
                    docs = await loader.aload()
                    all_docs.extend(docs)
                except Exception as e:
                    print(f"Error loading file {file_path}: {e}")
                    continue
        return all_docs



In [7]:
pdf_loader = PDFLoaderWrapper(
    documents_to_preload, PDFLoaderWrapper.LoaderType.PYMUPDF
)
documents = await pdf_loader.aload()

In [None]:
from langchain_experimental.text_splitter import SemanticChunker



text_splitter = SemanticChunker(embedding_model, buffer_size=5, breakpoint_threshold_type="percentile",breakpoint_threshold_amount=90)

chunked_docs = text_splitter.split_documents(documents)


In [8]:
# Recursive splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap = 100,
    length_function = len,
)
recursive_chunked_docs = recursive_text_splitter.split_documents(documents)

In [9]:
import os
import getpass

os.environ["QDRANT_API_URL"] = getpass.getpass("Enter Your Qdrant API URL: ")
os.environ["QDRANT_API_KEY"] = getpass.getpass("Enter Your Qdrant API Key: ")
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")


In [11]:
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

dimension = 1024
collection_name = "ai-safety-sf-arctic-embed-l-semantic"
qdrant_server = os.environ["QDRANT_API_URL"]
qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ["QDRANT_API_KEY"])
# qdrant_client.create_collection(
#     collection_name=collection_name,
#     vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
# )

vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    embedding=embedding_model,
)

vector_store.add_documents(recursive_chunked_docs)



['7e6a73422dd04376b5212e1c71275f5c',
 '9fc355b29e534d4d9ea5d87d8c9bf77c',
 '5e6a933b1bae4db7922b50a8a6bab44d',
 '83ddb01ec3954f1dbdda744124a8c76d',
 'd6e795fad13242498f11b8bfd8216f7d',
 'ba26430bfb714249ac4ce151f6819ac2',
 '06818b77aa704eeba5c7499d653c0433',
 'bfff4b85b5b440869b1fadcaf0ba0489',
 '7810842f60234aa0bee8c81a6577fc73',
 '85fbfbfdd52349dca312edcbd70af79e',
 'abddd6d787a9496c9c3f93d7386cd1c4',
 'da0779d479454c1ba82407a68d536997',
 '39b0ea70a678462fbf305d1096338cc0',
 'aff239aa6f5c447d885b6f7b0346ed1f',
 'e486a65b3a414225aaee7cb21e44cd4a',
 '16014bcfe609485d919b310f040ff735',
 '163bee1b3971476d97bf3c92c2aafd32',
 '10806d74287549288bc664e03e1528c8',
 '1c8cb58feffc4bd38061f6bb47fc9974',
 'e48046904f4f44d8a5042385d618b84b',
 'adf374bab6264d75b4598e9e6b0c31f1',
 'dc30905487ec493398c89c5db30c8e41',
 'e571028cd6804e259d217f3936ab1ac5',
 '5708400cf26f4f5ab83c151cab3f29b5',
 '3d66be0e03ed4c6d8d6d4bd89c3885bc',
 '6c1368d79ada4a87b0bcf0d5ecbb1d05',
 '3364bb5cb27f4df9a84e671fc47f3299',
 

In [12]:
retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
                search_kwargs={'k':10,'score_threshold': 0.8})

In [13]:
retriever.invoke("What steps can organizations take to minimize bias in AI models?")

[Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 11, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': '', '_id': 'ed405fa7-b4f6-4e8d-b294-4f5220a5c8fe', '_collection_name': 'ai-safety-sf-arctic-embed-l-semantic'}, page_content='FROM \nPRINCIPLES \nTO PRACTICE \nA TECHINCAL COMPANION TO\nTHE Blueprint for an \nAI BILL OF RIGHTS\n12'),
 Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 50, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'A

In [14]:
# Vector Store with recursive chunked documents
from langchain_qdrant import QdrantVectorStore
from langchain_core.documents import Document
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

dimension = 1024
qdrant_server = os.environ["QDRANT_API_URL"]
recursive_collection_name = "ai-safety-ft-arctic-embed-l-recursive"

recursive_qdrant_client = QdrantClient(url=qdrant_server,api_key=os.environ["QDRANT_API_KEY"])
recursive_qdrant_client.create_collection(
    collection_name=recursive_collection_name,
    vectors_config=VectorParams(size=dimension, distance=Distance.COSINE),
)

recursive_vector_store = QdrantVectorStore(
    client=recursive_qdrant_client,
    collection_name=recursive_collection_name,
    embedding=embedding_model,
)

recursive_vector_store.add_documents(recursive_chunked_docs)

['121c5a692bac4c508d5f3311982e25ec',
 '002f32185b9047e89a538ae775a06e71',
 'a75c5035aab6408eb5345e460c469a10',
 'ca645e946ec542388be2bad2ded9a670',
 '6c7ea28d623b4922817cd1c0b6d588be',
 '7a62cd3d9905402bac4abb6821656d51',
 '2b634384780b4b5eb37fe6e37abdff36',
 'bf0688dd9882476e81f754c1e24b0ca2',
 '3cd98731f3d94931b3be136846f4c403',
 'b90b3ac01e304f2dadc116b2a0e424d9',
 'd689b544a6e94580b0c44168c0daa6e3',
 '5acfb9a2c5004eb69af7e40d00894163',
 '00266c03dc974d4bbe53bb0a70c8ffdb',
 '9b412771249f493f9b6984df80c8b9d7',
 '34ad696c6ad348ba9d3e16b99674afd8',
 '5c369a7d85864bc2bcb8b01ce92ef144',
 'b09a0da6c4804f8f92566d4f634c0c09',
 '54f8ae4b1813431a8a5d42bc85ef40a4',
 'a5bb3d08f7b04da0aa0fd89b51284b75',
 '58e0fc7a8d6a46cebd9a5c390cb7feb7',
 'dae3c740827a4617b3807d439e9b4450',
 '5fb9e3ae5bc349f3a36ab016df65cc5d',
 '0d556bc654ad46beb3268c43c6bc4c70',
 'b76072d3981c49f6b99b2b5870ace24b',
 '3ef94b3ff0054f8980194cb6cf5b5246',
 '5c79ea68ce604c2c9ea088eaedc8c1b8',
 '9a4c71068d984a3d9ff4eb5cd7b9bb69',
 

In [15]:
recursive_retriever = recursive_vector_store.as_retriever(search_type="similarity_score_threshold",
                search_kwargs={'k':10,'score_threshold': 0.8})

In [16]:
recursive_retriever.invoke("What steps can organizations take to minimize bias in AI models?")

[Document(metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 11, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': '', '_id': '9d1df59c-40d5-4e12-9e59-2cf3f7ba07cc', '_collection_name': 'ai-safety-ft-arctic-embed-l-recursive'}, page_content='FROM \nPRINCIPLES \nTO PRACTICE \nA TECHINCAL COMPANION TO\nTHE Blueprint for an \nAI BILL OF RIGHTS\n12'),
 Document(metadata={'source': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'file_path': 'https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf', 'page': 50, 'total_pages': 64, 'format': 'PDF 1.6', 'title': '

In [37]:
# Trying Compression retriver
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import ChatOpenAI

base_retriever = recursive_retriever

#Create a contextual compressor
compressor_llm = ChatOpenAI(temperature=0, model_name="gpt-4o", max_tokens=4000)
compressor = LLMChainExtractor.from_llm(compressor_llm)

#Combine the retriever with the compressor
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=base_retriever
)



In [18]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

In [16]:
from langchain.prompts import ChatPromptTemplate

RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

If you cannot answer the question based on the context - you must say "I don't know".

Context: {context}
Question: {question}
"""

rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [19]:
from langchain_openai import ChatOpenAI

# Using the same model used in the app.
chat_model_name = "gpt-4o"
llm = ChatOpenAI(model=chat_model_name)

In [38]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain.schema import StrOutputParser

ai_safety_rag_chain = (
    {"context": itemgetter("question") | compression_retriever, "question": itemgetter("question")}
    | rag_prompt | llm | StrOutputParser()
)

In [39]:
ai_safety_rag_chain.invoke({"question" : "How can companies ensure AI does not violate data privacy laws?"})

'Companies can ensure AI does not violate data privacy laws by incorporating built-in protections and ensuring that data collection conforms to reasonable expectations. They should collect only the data strictly necessary for the specific context and seek user permission, respecting their decisions regarding the collection, use, access, transfer, and deletion of data. If obtaining user permission is not possible, alternative privacy by design safeguards should be used. Additionally, systems should avoid user experience and design decisions that obfuscate user choice or burden users with privacy-invasive defaults.'

In [40]:
ai_safety_rag_chain.invoke({"question" : "What are the implications of using GAI systems for organizations in terms of risk management and compliance?"})

"I don't know."