import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

# Step 1: Load the PDF document
loader = PyPDFLoader("/content/Data_Cleaning_and_Preprocessing_for_Data_Science_Beginners_Data_Science_Horizons_2023_Data_Science_Hor.pdf")
docs = loader.load()

# Step 2: Split the document into overlapping chunks so each one fits the
# embedding model comfortably and neighboring chunks share some context
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_documents(docs)

# Step 3: Define a custom embedding wrapper so SentenceTransformer exposes
# the embed_documents/embed_query interface LangChain's vector stores expect
class SentenceTransformerEmbedding:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        """Embed multiple documents. Chroma expects plain Python lists of
        floats, so convert the numpy arrays returned by encode()."""
        return self.model.encode(texts, show_progress_bar=True).tolist()

    def embed_query(self, query):
        """Embed a single query."""
        return self.model.encode([query], show_progress_bar=True)[0].tolist()

# Step 4: Create the SentenceTransformer model and wrap it
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embedding_function = SentenceTransformerEmbedding(embedding_model)

# Step 5: Store the chunk embeddings in a Chroma vector store
db = Chroma.from_texts(
    texts=[chunk.page_content for chunk in chunks],
    embedding=embedding_function,
)

# Step 6: Load a generative pipeline from Hugging Face.
# Note: LangChain's HuggingFacePipeline only supports generative tasks
# ("text-generation", "text2text-generation", "summarization"), so an
# extractive "question-answering" pipeline (e.g. BERT fine-tuned on SQuAD)
# cannot be wrapped directly. A seq2seq model (flan-t5-base here) is
# substituted instead.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_length=256)
qa_model = HuggingFacePipeline(pipeline=qa_pipeline)

# Step 7: Set up the retriever and the retrieval-based QA chain.
# The memory records the conversation, although note that RetrievalQA's
# default prompt does not inject chat history into subsequent questions.
retriever = db.as_retriever()
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=qa_model,  # a valid LangChain LLM wrapping the HF pipeline
    retriever=retriever,
    memory=memory,
)

# Step 8: Define the function backing the Gradio interface.
# Pass only the question: the chain retrieves the relevant chunks itself,
# so stuffing the whole document into the input would defeat the retriever.
def chatbot_response(user_input):
    try:
        # run() returns the answer as a single string
        return retrieval_qa_chain.run(user_input)
    except Exception as e:
        return f"Error: {e}"

# Step 9: Create the Gradio interface
iface = gr.Interface(
    fn=chatbot_response,
    inputs="text",
    outputs="text",
    title="RAG Chatbot",
    description="Ask questions about data science based on the provided document.",
    theme="compact",
)

# Step 10: Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
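# --- Optional sanity check (a minimal sketch; the example questions are
# illustrations, not taken from the source document) ---
# Running these lines before iface.launch() confirms that retrieval and
# generation work end to end, independent of the UI:
#
#     hits = retriever.get_relevant_documents("What is data cleaning?")
#     print(f"Retrieved {len(hits)} chunks; top hit preview:")
#     print(hits[0].page_content[:200])
#     print(chatbot_response("Why is data cleaning important?"))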