# Spaces: Sleeping
import hashlib
import os

from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyMuPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Constants
# Directory passed to Chroma as its persist_directory.
CHROMA_DB_PATH: str = "chroma_db"
# Model name passed to HuggingFaceEmbeddings when the vector store is built.
SENTENCE_TRANSFORMER_MODEL: str = "sentence-transformers/all-MiniLM-L6-v2"
# Repo id passed to HuggingFaceHub for the answering LLM.
LLM_MODEL: str = "HuggingFaceH4/zephyr-7b-beta"
# Initialize vector store
def initialize_vector_store() -> "Chroma":
    """Create the Chroma vector store used by the rest of this module.

    Returns:
        A ``Chroma`` instance whose persist directory is ``CHROMA_DB_PATH``
        and whose embedding function is a ``HuggingFaceEmbeddings`` built
        from ``SENTENCE_TRANSFORMER_MODEL``.
    """
    embedding_model = HuggingFaceEmbeddings(model_name=SENTENCE_TRANSFORMER_MODEL)
    store = Chroma(
        embedding_function=embedding_model,
        persist_directory=CHROMA_DB_PATH,
    )
    return store


# Module-level store shared by ingest_pdf and process_query_with_memory.
# NOTE: it is built as a side effect of importing this module.
vector_store = initialize_vector_store()
def ingest_pdf(pdf_path):
    """Load a PDF, split it into chunks, and store them in the vector store.

    Ingestion is idempotent for a given file: every chunk gets an id derived
    from the file's absolute path, the chunk's position and its text, so
    running this again for an unchanged PDF reuses the same ids instead of
    adding a second copy of every chunk.

    Args:
        pdf_path: Path of the PDF file to ingest.

    Returns:
        None. The chunks are written to the module-level ``vector_store``.
    """
    pages = PyMuPDFLoader(pdf_path).load()

    # 800-character chunks with no overlap, so neighbouring chunks never
    # share text.
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=0)
    chunks = splitter.split_documents(pages)

    # A PDF without extractable text yields no chunks; skip the write rather
    # than handing the store an empty batch.
    if not chunks:
        return

    # Deterministic ids: the position keeps ids unique within one batch even
    # when two chunks hold identical text.
    # NOTE(review): relies on the store overwriting an entry whose id already
    # exists (Chroma upserts by id) -- confirm for the installed version.
    source = os.path.abspath(pdf_path)
    chunk_ids = [
        hashlib.sha256(
            f"{source}:{position}:{chunk.page_content}".encode("utf-8")
        ).hexdigest()
        for position, chunk in enumerate(chunks)
    ]

    # Add documents to vector store and persist
    vector_store.add_documents(chunks, ids=chunk_ids)
    vector_store.persist()
def process_query_with_memory(query: str, chat_memory):
    """Answer a query from the vector store while keeping conversation memory.

    Args:
        query: The user's question.
        chat_memory: Conversation memory object. It is attached to the chain
            and its history is read under the ``"chat_history"`` key.

    Returns:
        The result of running the conversational retrieval chain on the
        question and the current chat history.
    """
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})

    # Debug: print what the retriever returns for this query. This is an
    # extra lookup purely for inspection; the chain retrieves on its own.
    retrieved_docs = retriever.get_relevant_documents(query)
    print("Retrieved Docs:\n", [doc.page_content for doc in retrieved_docs])

    # Initialize LLM
    llm = HuggingFaceHub(repo_id=LLM_MODEL, model_kwargs={"max_new_tokens": 500})

    # Create conversational retrieval chain
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=chat_memory,
    )

    # Debug: print chat history to detect repetition
    chat_history = chat_memory.load_memory_variables({}).get("chat_history", [])
    print("Chat History:\n", chat_history)

    # The history is forwarded exactly as loaded. It must not be passed
    # through set(): that discards chronological order, drops turns that are
    # legitimately repeated, and raises TypeError when the message objects
    # are unhashable.
    return conversation_chain.run({"question": query, "chat_history": chat_history})
# Initialize chat memory
# Shared conversation memory; its history is stored under "chat_history".
chat_memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

# Example Usage
if __name__ == "__main__":
    # Index the sample CV, then ask one question about it.
    ingest_pdf("CV_Data_Science.pdf")
    answer = process_query_with_memory("What are my skills in CV?", chat_memory)
    print("\nChatbot Response:", answer)