import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
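# Note: these import paths match the pre-0.1 "langchain" package; newer
# releases move most of them to langchain_community (e.g.
# langchain_community.document_loaders.PyPDFLoader). Adjust if upgrading.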
# Step 1: Load the PDF document
loader = PyPDFLoader("/content/Data_Cleaning_and_Preprocessing_for_Data_Science_Beginners_Data_Science_Horizons_2023_Data_Science_Hor.pdf")
docs = loader.load()
# Step 2: Split the document into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])
chunks = splitter.split_documents(docs)
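# Quick sanity check: confirm the loader and splitter produced output
# before spending time on embeddings.
print(f"Loaded {len(docs)} pages, split into {len(chunks)} chunks")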
# Step 3: Define a custom embedding function wrapper for SentenceTransformer
class SentenceTransformerEmbedding:
    """Adapter exposing the embed_documents/embed_query interface
    that LangChain's Chroma wrapper expects."""

    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        """Embed multiple documents; return plain lists rather than
        numpy arrays, which some Chroma versions reject."""
        return self.model.encode(texts, show_progress_bar=True).tolist()

    def embed_query(self, query):
        """Embed a single query string."""
        return self.model.encode([query], show_progress_bar=True)[0].tolist()
# Step 4: Create the SentenceTransformer model and wrap it
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_function = SentenceTransformerEmbedding(embedding_model)
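# Smoke test for the wrapper: all-MiniLM-L6-v2 produces 384-dimensional
# embeddings, so a query embedding should come back as a 384-element list.
assert len(embedding_function.embed_query("test query")) == 384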
# Step 5: Store the embeddings in a Chroma vector store
db = Chroma.from_texts(
    texts=[chunk.page_content for chunk in chunks],
    embedding=embedding_function,
)
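# Retrieval smoke test (the query text is only illustrative): fetch the
# closest chunk straight from the vector store before wiring up the chain.
hits = db.similarity_search("What is data cleaning?", k=1)
print("Top match:", hits[0].page_content[:200])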
# Step 6: Load a generative pipeline from Hugging Face.
# LangChain's HuggingFacePipeline only supports generative tasks
# ("text-generation", "text2text-generation", "summarization"), so an
# extractive "question-answering" pipeline cannot be wrapped directly;
# google/flan-t5-base is substituted here as a small seq2seq model that fits.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_length=256)
qa_model = HuggingFacePipeline(pipeline=qa_pipeline)
# Step 7: Set up the retriever and the retrieval-based QA chain.
# Note: the default RetrievalQA prompt never references chat_history, so
# this memory records the conversation but does not feed it back into
# the prompt; ConversationalRetrievalChain would be needed for that.
retriever = db.as_retriever()
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=qa_model,
    retriever=retriever,
    memory=memory,
)
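# The chain can also be called outside Gradio; RetrievalQA reads the
# "query" key and returns the answer under "result" (the question text
# below is just an example):
# answer = retrieval_qa_chain({"query": "Why is data cleaning important?"})["result"]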
# Step 8: Define the function for the Gradio interface
def chatbot_response(user_input):
    try:
        # RetrievalQA takes a single "query" input and fetches relevant
        # chunks via the retriever itself; passing the whole document as
        # extra context would break the chain's input handling, and run()
        # already returns the answer string.
        response = retrieval_qa_chain.run(user_input)
        return response
    except Exception as e:
        return f"Error: {e}"
# Step 9: Create the Gradio interface
iface = gr.Interface(
    fn=chatbot_response,
    inputs="text",
    outputs="text",
    title="RAG Chatbot",
    description="Ask questions related to Data Science from the provided document.",
    theme="compact",  # deprecated in newer Gradio releases; falls back to the default theme
)
# Step 10: Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
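    # In Colab or other hosted notebooks, a public link is needed to
    # reach the UI: iface.launch(share=True)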