Spaces:
Build error
import gradio as gr
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Chroma
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
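# Note (assumption): the imports above use the pre-0.1 "langchain" package
# layout. Newer releases moved these classes to langchain_community, so
# pinning langchain<0.1.0 in requirements.txt is one way to avoid
# import-time failures when the Space builds.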
# Step 1: Load the PDF document
# Note: /content/... is a Google Colab path. On a Hugging Face Space the PDF
# must be uploaded to the repo and referenced by a path that exists there.
loader = PyPDFLoader("/content/Data_Cleaning_and_Preprocessing_for_Data_Science_Beginners_Data_Science_Horizons_2023_Data_Science_Hor.pdf")
docs = loader.load()
# Step 2: Split the document into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separators=["\n\n", "\n", " ", ""])
chunks = splitter.split_documents(docs)
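# Optional sanity check: confirm the PDF loaded and split as expected
# (the exact chunk count depends on the document's length).
print(f"Loaded {len(docs)} pages; split into {len(chunks)} chunks")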
# Step 3: Define a custom embedding function wrapper for SentenceTransformer
class SentenceTransformerEmbedding:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        """Embed multiple documents"""
        # .tolist() converts the numpy array into plain Python lists,
        # which is what Chroma expects from an embedding function.
        return self.model.encode(texts, show_progress_bar=True).tolist()

    def embed_query(self, query):
        """Embed a single query"""
        return self.model.encode([query], show_progress_bar=True)[0].tolist()
# Step 4: Create the SentenceTransformer model and wrap it
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
embedding_function = SentenceTransformerEmbedding(embedding_model)
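# Optional check: all-MiniLM-L6-v2 produces 384-dimensional vectors, so a
# single query embedding should have length 384.
assert len(embedding_function.embed_query("test query")) == 384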
# Step 5: Store the embeddings in a Chroma vector store
db = Chroma.from_texts(
    texts=[chunk.page_content for chunk in chunks],
    embedding=embedding_function,
)
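# Optional check: query the vector store directly before wiring up the chain
# ("What is data cleaning?" is just an illustrative test question).
# print(db.similarity_search("What is data cleaning?", k=1)[0].page_content[:200])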
# Step 6: Load a text-generation pipeline from HuggingFace.
# Note: LangChain's HuggingFacePipeline only supports text-generation,
# text2text-generation, and summarization tasks. A "question-answering"
# pipeline expects separate question/context arguments rather than one prompt
# string, so wrapping it fails at query time; a seq2seq model such as
# google/flan-t5-base works with the prompt-based RetrievalQA chain.
qa_pipeline = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=256)
qa_model = HuggingFacePipeline(pipeline=qa_pipeline)
# Step 7: Set up retriever and the retrieval-based QA chain
retriever = db.as_retriever()
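# By default the retriever returns the top 4 chunks per query; this can be
# tuned with e.g. db.as_retriever(search_kwargs={"k": 3}).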
# Note: RetrievalQA's default prompt does not use chat_history, so the memory
# records the conversation but does not feed it back into the prompt.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=qa_model,  # the wrapped pipeline is a valid LangChain LLM
    retriever=retriever,
    memory=memory,
)
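# Optional smoke test: run the chain once outside Gradio to confirm it answers
# (the question below is just an illustrative example).
# print(retrieval_qa_chain.run("Why is data cleaning important?"))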
# Step 8: Define the function for the Gradio interface
def chatbot_response(user_input):
    try:
        # RetrievalQA expects only the query; the retriever supplies the
        # relevant chunks as context. Passing every chunk in a "context" key
        # breaks the chain's input handling, and indexing the returned string
        # with [0] would give back only its first character.
        response = retrieval_qa_chain.run(user_input)
        return response
    except Exception as e:
        return f"Error: {e}"
# Step 9: Create the Gradio interface
# (the old string theme "compact" was removed in Gradio 3+, so it is omitted
# here and the default theme is used instead)
iface = gr.Interface(
    fn=chatbot_response,
    inputs="text",
    outputs="text",
    title="RAG Chatbot",
    description="Ask questions related to Data Science from the provided document.",
)
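# Note: on Hugging Face Spaces the app is served automatically on launch();
# share=True is only needed to expose a temporary public link when running
# locally.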
# Step 10: Launch the Gradio app
if __name__ == "__main__":
    iface.launch()