sakuexe committed · Commit 0b367ea · 1 parent: 6427fd5

tweaked the code a bit to make answering faster

Files changed:
- app.py (+7 -7)
- vector_store.py (+11 -11)
app.py CHANGED
@@ -2,13 +2,12 @@
 # https://huggingface.co/learn/cookbook/rag_zephyr_langchain
 # langchain
 from typing import TypedDict
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
+from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.runnables import RunnablePassthrough
 from langchain_huggingface import HuggingFacePipeline
 # huggingface
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from transformers import pipeline
 # pytorch
 import torch
@@ -59,6 +58,11 @@ text_generation_pipeline = pipeline(
 
 llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
 
+# generate a vector store
+print("creating the document database")
+db = get_document_database("learning_material/*/*/*")
+print("Document database is ready")
+
 
 def generate_prompt(message_history: list[ChatMessage], max_history=5):
     # creating the prompt template in the shape of a chat prompt
@@ -99,10 +103,6 @@ def generate_prompt(message_history: list[ChatMessage], max_history=5):
 
 
 async def generate_answer(message_history: list[ChatMessage]):
-    # generate a vector store
-    print("creating the document database")
-    db = await get_document_database("learning_material/*/*/*")
-    print("Document database is ready")
 
     # initialize the similarity search
     n_of_best_results = 4
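The app.py change is where the speed-up comes from: generate_answer used to await get_document_database on every call, re-reading and re-embedding the whole learning_material corpus for each question. The commit hoists that work to module scope, so the FAISS index is built once when app.py is imported and every later answer reuses the same db. A minimal alternative sketch that defers the one-time build to the first request instead of slowing down startup (hypothetical helper, not part of this commit; assumes app.py imports get_document_database from vector_store.py):

from functools import lru_cache

from vector_store import get_document_database  # assumed import path


@lru_cache(maxsize=1)
def get_db():
    # the expensive build runs once, on the first answer;
    # every later call returns the cached FAISS store
    return get_document_database("learning_material/*/*/*")

Either way, the per-request cost drops to a similarity search; import-time construction simply moves the wait to startup.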
vector_store.py CHANGED
@@ -10,21 +10,21 @@ from glob import glob
 import pathlib
 
 
-async def load_text(file_path: str) -> list[Document] | None:
+def load_text(file_path: str) -> list[Document] | None:
     """Loads text documents (.txt) asynchronously from a passed file_path."""
     assert file_path != ""
     assert pathlib.Path(file_path).suffix == ".txt"
 
     try:
         loader = TextLoader(file_path)
-        return await loader.aload()
+        return loader.load()
     except UnicodeError or RuntimeError as err:
         print(f"could not load file: {file_path}")
         print(f"error: {err}")
 
 
 # https://python.langchain.com/docs/how_to/document_loader_markdown/
-async def load_markdown(file_path: str) -> list[Document] | None:
+def load_markdown(file_path: str) -> list[Document] | None:
     """Loads markdown files asynchronously from a passed file_path."""
     assert file_path != ""
     assert pathlib.Path(file_path).suffix == ".md"
@@ -33,33 +33,33 @@ async def load_markdown(file_path: str) -> list[Document] | None:
         # use the mode elements to keep metadata about if the information is
         # a paragraph, link or a heading for example
         loader = UnstructuredMarkdownLoader(file_path, mode="elements")
-        return await loader.aload()
+        return loader.load()
     except UnicodeError or RuntimeError as err:
         print(f"could not load file: {file_path}")
         print(f"error: {err}")
 
 
 # https://python.langchain.com/docs/how_to/document_loader_pdf/
-async def load_pdf(file_path: str) -> list[Document] | None:
+def load_pdf(file_path: str) -> list[Document] | None:
     """Loads pdf documents (.pdf) asynchronously from a passed file_path."""
     assert file_path != ""
     assert pathlib.Path(file_path).suffix == ".pdf"
 
     loader = PyPDFLoader(file_path)
     try:
-        return await loader.aload()
+        return loader.load()
     except PyPdfError as err:
         print(f"could not read file: {file_path}")
         print(f"error: {err}")
 
 
-async def load_html(file_path: str) -> list[Document]:
+def load_html(file_path: str) -> list[Document]:
     """Loads html documents (.html) asynchronously from a passed file_path."""
     assert file_path != ""
     assert pathlib.Path(file_path).suffix == ".html" or ".htm"
 
     loader = BSHTMLLoader(file_path)
-    return await loader.aload()
+    return loader.load()
 
 
 # hold all of the loader functions for easy O(1) fetching
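Two pre-existing gotchas ride through this diff unchanged: except UnicodeError or RuntimeError evaluates the or expression first, so only UnicodeError is ever caught, and assert pathlib.Path(file_path).suffix == ".html" or ".htm" is always true because the non-empty string ".htm" is truthy. (The docstrings also still say "asynchronously" after the sync conversion.) A corrected load_text as a hedged sketch, assuming the usual langchain_community import path:

import pathlib

from langchain_community.document_loaders import TextLoader  # assumed import path
from langchain_core.documents import Document


def load_text(file_path: str) -> list[Document] | None:
    """Loads text documents (.txt) from a passed file_path."""
    assert pathlib.Path(file_path).suffix == ".txt"
    try:
        # a tuple catches both exception types; `A or B` only catches A
        return TextLoader(file_path).load()
    except (UnicodeError, RuntimeError) as err:
        print(f"could not load file: {file_path}")
        print(f"error: {err}")

The html suffix check should likewise test membership: assert pathlib.Path(file_path).suffix in (".html", ".htm").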
@@ -73,7 +73,7 @@ LOADER_MAP = {
 
 
 # https://python.langchain.com/v0.1/docs/modules/data_connection/retrievers/vectorstore/
-async def get_document_database(
+def get_document_database(
     data_folder="learning_material/*/*/*",
     embedding_model="BAAI/bge-base-en-v1.5",
     chunk_size=1028, chunk_overlap=0,
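LOADER_MAP itself is outside this diff, but the hunk header shows the loaders are registered in a dict, and after this commit its values must be the plain sync functions. A sketch of what the suffix-to-loader dispatch at line 99 presumably looks like (the keys are an assumption based on the four loaders above; glob comes from the file's own from glob import glob):

import pathlib
from glob import glob

# map a file suffix straight to its loader for O(1) lookup
LOADER_MAP = {
    ".txt": load_text,
    ".md": load_markdown,
    ".pdf": load_pdf,
    ".html": load_html,
}

for file_path in glob(data_folder):
    load_fn = LOADER_MAP.get(pathlib.Path(file_path).suffix)
    if load_fn is None:
        # unsupported filetype, mirrors the `continue` at line 96
        continue
    result_documents = load_fn(file_path)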
@@ -96,7 +96,7 @@ async def get_document_database(
             continue
 
         # load the document with a filetype specific loader
-        result_documents = await load_fn(file_path)
+        result_documents = load_fn(file_path)
 
         if not result_documents:
             print(f"file {file_path} does not include any content, skipping")
@@ -111,7 +111,7 @@ async def get_document_database(
 
     chunked_docs = splitter.split_documents(all_docs)
 
-    return await FAISS.afrom_documents(
+    return FAISS.from_documents(
         chunked_docs,
         HuggingFaceEmbeddings(model_name=embedding_model)
     )
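With the coroutine wrapper gone, get_document_database returns the FAISS store directly, which is what lets app.py call it at import time without an event loop. A hedged usage sketch of querying the returned store (k=4 mirrors app.py's n_of_best_results; the actual retrieval code is outside this diff and the question string is only a placeholder):

db = get_document_database("learning_material/*/*/*")

# similarity_search is the standard langchain FAISS query
docs = db.similarity_search("example question", k=4)
for doc in docs:
    print(doc.metadata.get("source"), doc.page_content[:80])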