import chromadb
import os
import time
import transformers

from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT
from langchain_chroma import Chroma
from langchain_community.llms import CTransformers
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_ollama import ChatOllama
from transformers import pipeline

client = chromadb.HttpClient(host="localhost", port=8000)


def using_ollama_model(retriever, query, results, conversation_history):
    # Flatten the prior turns into a plain-text transcript for the prompt
    history_text = ""
    for item in conversation_history:
        if "question" in item and item["question"]:
            history_text += f"User: {item['question']}\n"
        if "answer" in item and item["answer"]:
            history_text += f"Assistant: {item['answer']}\n"

    print("<<<<<< LLM MODEL STARTED >>>>>>")
    print(" ========>", history_text)

    # Ensure the prompt template is well-structured
    prompt_template = """
You are a helpful assistant. Answer the following question using the provided context and previous conversation history.
If the context does not contain the answer, only then reply with: "Sorry, I don't have enough information."

Conversation History: {history}

Context: {results}

Question: {query}
"""

    # Initialize the PromptTemplate
    template = PromptTemplate(
        input_variables=["history", "results", "query"],
        template=prompt_template,
    )

    doc_texts = "\n".join([doc.page_content for doc in results])
    formatted_output = template.format(history=history_text, results=doc_texts, query=query)
    print("<<<<<<<<<<< Formatted Output >>>>>>>>>>>")
    print(formatted_output)
    print("type of formatted output is", type(formatted_output))

    llm = ChatOllama(model="llama3.2", temperature=0.4, num_predict=512)

    rag_chain = template | llm | StrOutputParser()

    answer = rag_chain.invoke({"history": history_text, "results": doc_texts, "query": query})
    return answer


# # Set up the RAG pipeline
# rag_pipeline = RetrievalQAWithSourcesChain.from_chain_type(
#     llm=llm, chain_type="stuff", retriever=retriever
# )
#
# try:
#     # answer = rag_pipeline.run(formatted_output)
#     answer = rag_pipeline.invoke(formatted_output)
#     return answer
# except Exception as e:
#     print(f"Error occurred during invocation: {e}")
#     return None


def retrievingReponse(docId, query, conversation_history):
    model_kwargs = {"device": "mps"}
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-distilroberta-base-v1",
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

    vectorDB = Chroma(
        collection_name="embeddings",
        embedding_function=embeddings,  # Used to embed the query at retrieval time
        persist_directory="MM_CHROMA_DB",
    )

    # retriever = vectorDB.as_retriever(
    #     search_type="mmr",
    #     search_kwargs={
    #         "k": 6,  # was 5 originally
    #         "lambda_mult": 1,  # was 0.30 originally
    #         "filter": {"docId": docId},
    #     },
    # )
    retriever = vectorDB.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 4,  # was 5 originally
            # "lambda_mult": 1,  # was 0.30 originally
            "filter": {"docId": docId},
        },
    )
    # retriever = vectorDB.as_retriever()

    print("<<<<<<<<<<<<<<<< Retriever >>>>>>>>>>>>>>>>")
    print("\n")

    results = retriever.invoke(query)

    # Deduplicate the retrieved chunks by their raw text
    unique_results = []
    seen_texts = set()
    for result in results:
        print(result)
        # If the result's content has not been seen before, process it
        if result.page_content not in seen_texts:
            ans = result.page_content
            ans = ans.replace("\n", "")  # Clean the content by removing newlines
            unique_results.append(ans)  # Add the cleaned answer to the results list
            seen_texts.add(result.page_content)  # Mark this text as seen

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    start = time.time()
    # llm_result = using_llm_model(retriever, query, results)
    llm_result = using_ollama_model(retriever, query, results, conversation_history)
    end = time.time()
    print("Inference Time:>>>>>>>", end - start)

    return llm_result
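

# Minimal usage sketch. The docId, query, and conversation history below are
# hypothetical placeholders: they assume a Chroma server is reachable on
# localhost:8000 and that a document was already embedded into the
# "embeddings" collection under docId "doc-123". Swap in real values before running.
if __name__ == "__main__":
    sample_history = [
        {"question": "What is this document about?", "answer": "It summarizes the Q3 sales report."},
    ]
    final_answer = retrievingReponse(
        docId="doc-123",
        query="Which region had the highest revenue?",
        conversation_history=sample_history,
    )
    print("Final answer:", final_answer)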