Spaces:

BitBasher
/

EduConnect

Build error

App Files Files Community

dtyago committed on Feb 16, 2024

Commit

d131d3b

1 Parent(s): faf4679

init chat_rag

Browse files

Files changed (3) hide show

app/utils/chat_rag.py +154 -2
requirements.txt +5 -1
static/.DS_Store +0 -0

app/utils/chat_rag.py CHANGED Viewed

@@ -1,2 +1,154 @@
-# Model import
-# Implement RAG using wvvocer

+# List of libraries for requirements.txt
+from langchain.document_loaders import PyPDFLoader
+# Import embeddings module from langchain for vector representations of text
+from langchain.embeddings import HuggingFaceEmbeddings
+# Import text splitter for handling large texts
+from langchain.text_splitter import CharacterTextSplitter
+# Import vector store for database operations
+from langchain.vectorstores import Chroma
+# for loading of llama gguf model
+from langchain.llms import LlamaCpp
+from langchain.chains.router.llm_router import LLMRouterChain, RouterOutputParser
+from langchain.chains.router.multi_prompt_prompt import MULTI_PROMPT_ROUTER_TEMPLATE
+from langchain.chains.router import MultiPromptChain
+from langchain.chains import ConversationChain
+from langchain.chains.llm import LLMChain
+from langchain.prompts import PromptTemplate
+from langchain.memory import ConversationBufferMemory
+from langchain.chains import ConversationalRetrievalChain
+from langchain.callbacks.manager import CallbackManager
+from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+def pdf_to_vec(filename, persist_directory='./data'):
+    """Load a PDF, split it into chunks, embed them and store them in Chroma.
+
+    Args:
+        filename: Path of the PDF file to index.
+        persist_directory: Directory in which Chroma persists the collection.
+            Defaults to './data', the location that used to be hard-coded here.
+
+    Returns:
+        The Chroma vector store built from the PDF's chunks.
+    """
+    # PyPDFLoader comes from langchain.document_loaders; load() returns a list of documents
+    documents = PyPDFLoader(filename).load()
+    # CharacterTextSplitter comes from langchain.text_splitter; split on newlines with chunk_size=500 and chunk_overlap=100
+    document_splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
+    document_chunks = document_splitter.split_documents(documents)
+    # Embed every chunk with the 'sentence-transformers/all-MiniLM-L6-v2' model
+    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
+    # Build the Chroma vector database from the chunks and point it at the persistence directory
+    vectordb = Chroma.from_documents(document_chunks, embedding=embeddings, persist_directory=persist_directory)
+    # Explicitly write the collection to disk.
+    # NOTE(review): newer Chroma releases appear to persist automatically and deprecate this call - confirm against the installed version
+    vectordb.persist()
+    return vectordb
+def load_llm(model_path="/content/llama-2-7b-mini-ibased.Q5_K_M.gguf"):
+    """Create the LlamaCpp language model from a local GGUF file.
+
+    Args:
+        model_path: Path of the GGUF model to load. Defaults to the path that
+            used to be hard-coded here; the original notes also mention
+            llama-2-7b-mcq_2.Q5_K_M.gguf as an alternative model file.
+
+    Returns:
+        The configured LlamaCpp instance.
+    """
+    # Streaming, GPU offload (n_gpu_layers), verbose and callback_manager were left disabled by the original author
+    llm = LlamaCpp(
+        model_path=model_path,
+        n_batch=512,
+        temperature=0.1,
+        top_p=1,
+        max_tokens=2000,
+    )
+    return llm
+# Step 5: call once to build default_chain, router_chain and destination_chains (and bind the vector DB), so they are not re-created for every prompt
+def default_chain(llm=None, vectordb=None):
+    """Build the retrieval default chain, the router chain and the destination chains.
+
+    Call this once and reuse the returned objects for every prompt.
+
+    Args:
+        llm: Language model shared by all chains. When omitted, a module-level
+            ``llm`` global is used if one was set, otherwise ``load_llm()`` is called.
+        vectordb: Vector store whose retriever backs the default chain (for
+            example the value returned by ``pdf_to_vec``). When omitted, a
+            module-level ``vectordb`` global is used if one was set.
+
+    Returns:
+        Tuple ``(default_chain, router_chain, destination_chains)``, in the
+        argument order of ``llm_infer``.
+
+    Raises:
+        ValueError: If no vector store is passed in or set as a module global.
+    """
+    # The previous body read ``llm`` and ``vectordb`` as free names that this module never defines,
+    # so a call raised NameError. Accept them as arguments, keeping module globals as a fallback.
+    module_globals = globals()
+    if llm is None:
+        llm = module_globals.get("llm")
+    if llm is None:
+        llm = load_llm()
+    if vectordb is None:
+        vectordb = module_globals.get("vectordb")
+    if vectordb is None:
+        raise ValueError("default_chain() needs a vector store: pass vectordb=pdf_to_vec(<pdf path>)")
+    sum_template = """
+    As a machine learning education specialist, your expertise is pivotal in deepening the comprehension of complex machine learning concepts for both educators and students.
+    Your role entails:
+    Providing Detailed Explanations: Deliver comprehensive answers to these questions, elucidating the underlying technical principles.
+    Assisting in Exam Preparation: Support educators in formulating sophisticated exam and quiz questions, including MCQs, accompanied by thorough explanations.
+    Summarizing Course Material: Distill key information from course materials, articulating complex ideas within the context of advanced machine learning practices.
+    Objective: to summarize and explain the key points.
+    Here the question:
+    {input}"""
+    mcq_template = """
+    As a machine learning education specialist, your expertise is pivotal in deepening the comprehension of complex machine learning concepts for both educators and students.
+    Your role entails:
+    Crafting Insightful Questions: Develop thought-provoking questions that explore the intricacies of machine learning topics.
+    Generating MCQs: Create MCQs for each machine learning topic, comprising a question, four choices (A-D), and the correct answer, along with a rationale explaining the answer.
+    Objective: to create multiple choice question in this format
+    [question:
+    options A:
+    options B:
+    options C:
+    options D:
+    correct_answer:
+    explanation:]
+    Here the question:
+    {input}"""
+    # One entry per routing destination; "description" is what the router prompt shows the LLM
+    prompt_infos = [
+        {
+            "name": "SUMMARIZE",
+            "description": "Good for summarizing and explaination ",
+            "prompt_template": sum_template,
+        },
+        {
+            "name": "MCQ",
+            "description": "Good for creating multiple choices questions",
+            "prompt_template": mcq_template,
+        },
+    ]
+    # Map each destination name to an LLMChain wrapping its prompt template
+    destination_chains = {
+        p_info["name"]: LLMChain(
+            llm=llm,
+            prompt=PromptTemplate(template=p_info["prompt_template"], input_variables=["input"]),
+        )
+        for p_info in prompt_infos
+    }
+    # Default route: answer from the top-3 retrieved chunks of the vector store.
+    # NOTE(review): MultiPromptChain appears to hand the default chain the text under the key "input",
+    # while ConversationalRetrievalChain normally expects "question" and "chat_history" - verify this route end to end
+    retrieval_chain = ConversationalRetrievalChain.from_llm(llm=llm,
+                                                retriever=vectordb.as_retriever(search_kwargs={'k': 3}),
+                                                verbose=True, output_key="text" )
+    # Fill the stock router template with one "NAME: description" line per destination
+    destinations = [f"{p['name']}: {p['description']}" for p in prompt_infos]
+    destinations_str = "\n".join(destinations)
+    router_template = MULTI_PROMPT_ROUTER_TEMPLATE.format(destinations=destinations_str)
+    router_prompt = PromptTemplate(
+        template=router_template,
+        input_variables=["input"],
+        output_parser=RouterOutputParser(),
+    )
+    router_chain = LLMRouterChain.from_llm(llm, router_prompt)
+    return retrieval_chain,router_chain,destination_chains
+def llm_infer(default_chain,router_chain,destination_chains,prompt):
+    """Run ``prompt`` through a MultiPromptChain assembled from the given chains.
+
+    Args:
+        default_chain: Chain passed to MultiPromptChain as ``default_chain``.
+        router_chain: Chain passed to MultiPromptChain as ``router_chain``.
+        destination_chains: Mapping passed to MultiPromptChain as ``destination_chains``.
+        prompt: The user's text, handed to ``run``.
+
+    Returns:
+        Whatever ``MultiPromptChain.run`` returns for ``prompt``.
+    """
+    # Conversation memory is intentionally not attached; the chain is built with verbose output enabled
+    routing_options = dict(
+        router_chain=router_chain,
+        destination_chains=destination_chains,
+        default_chain=default_chain,
+        verbose=True,
+    )
+    return MultiPromptChain(**routing_options).run(prompt)

requirements.txt CHANGED Viewed

@@ -14,4 +14,8 @@ bcrypt==4.1.*                  # For hashing secrets
 opencv-python-headless==4.5.5.* # For image handling images
 tensorflow                     # Tensorflow is needed by MTCNN for facial recognition
 scipy                          # The scipy is required for keras-facenet
-tinydb                         # The in memory database for storing JWT tokens

 opencv-python-headless==4.5.5.* # For image handling images
 tensorflow                     # Tensorflow is needed by MTCNN for facial recognition
 scipy                          # The scipy is required for keras-facenet
+tinydb                         # The in memory database for storing JWT tokens
+langchain                      # LangChain for RAG
+llama-cpp-python               # To load the model
+sentence-transformers          # For text embeddings
+pypdf                          # Handling PDF files

static/.DS_Store CHANGED Viewed

Binary files a/static/.DS_Store and b/static/.DS_Store differ