Spaces:
Build error
Build error
Farid Karimli
committed on
Commit
·
c26167a
1
Parent(s):
49a1201
Chunking method selector and patches
Browse files
code/main.py
CHANGED
|
@@ -66,16 +66,19 @@ class Chatbot:
|
|
| 66 |
async def setup_llm(self):
|
| 67 |
"""
|
| 68 |
Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
|
|
|
|
|
|
|
| 69 |
"""
|
| 70 |
start_time = time.time()
|
| 71 |
|
| 72 |
llm_settings = cl.user_session.get("llm_settings", {})
|
| 73 |
-
chat_profile, retriever_method, memory_window, llm_style, generate_follow_up = (
|
| 74 |
llm_settings.get("chat_model"),
|
| 75 |
llm_settings.get("retriever_method"),
|
| 76 |
llm_settings.get("memory_window"),
|
| 77 |
llm_settings.get("llm_style"),
|
| 78 |
llm_settings.get("follow_up_questions"),
|
|
|
|
| 79 |
)
|
| 80 |
|
| 81 |
chain = cl.user_session.get("chain")
|
|
@@ -95,6 +98,7 @@ class Chatbot:
|
|
| 95 |
self.config["llm_params"]["llm_style"] = llm_style
|
| 96 |
self.config["llm_params"]["llm_loader"] = chat_profile
|
| 97 |
self.config["llm_params"]["generate_follow_up"] = generate_follow_up
|
|
|
|
| 98 |
|
| 99 |
self.llm_tutor.update_llm(
|
| 100 |
old_config, self.config
|
|
@@ -172,6 +176,12 @@ class Chatbot:
|
|
| 172 |
label="Stream response",
|
| 173 |
initial=config["llm_params"]["stream"],
|
| 174 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
cl.input_widget.Switch(
|
| 176 |
id="follow_up_questions",
|
| 177 |
label="Generate follow up questions",
|
|
|
|
| 66 |
async def setup_llm(self):
|
| 67 |
"""
|
| 68 |
Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
|
| 69 |
+
|
| 70 |
+
#TODO: Clean this up.
|
| 71 |
"""
|
| 72 |
start_time = time.time()
|
| 73 |
|
| 74 |
llm_settings = cl.user_session.get("llm_settings", {})
|
| 75 |
+
chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
|
| 76 |
llm_settings.get("chat_model"),
|
| 77 |
llm_settings.get("retriever_method"),
|
| 78 |
llm_settings.get("memory_window"),
|
| 79 |
llm_settings.get("llm_style"),
|
| 80 |
llm_settings.get("follow_up_questions"),
|
| 81 |
+
llm_settings.get("chunking_mode"),
|
| 82 |
)
|
| 83 |
|
| 84 |
chain = cl.user_session.get("chain")
|
|
|
|
| 98 |
self.config["llm_params"]["llm_style"] = llm_style
|
| 99 |
self.config["llm_params"]["llm_loader"] = chat_profile
|
| 100 |
self.config["llm_params"]["generate_follow_up"] = generate_follow_up
|
| 101 |
+
self.config["splitter_options"]["chunking_mode"] = chunking_mode
|
| 102 |
|
| 103 |
self.llm_tutor.update_llm(
|
| 104 |
old_config, self.config
|
|
|
|
| 176 |
label="Stream response",
|
| 177 |
initial=config["llm_params"]["stream"],
|
| 178 |
),
|
| 179 |
+
cl.input_widget.Select(
|
| 180 |
+
id="chunking_mode",
|
| 181 |
+
label="Chunking mode",
|
| 182 |
+
values=['fixed', 'semantic'],
|
| 183 |
+
initial_index=1,
|
| 184 |
+
),
|
| 185 |
cl.input_widget.Switch(
|
| 186 |
id="follow_up_questions",
|
| 187 |
label="Generate follow up questions",
|
code/modules/dataloader/data_loader.py
CHANGED
|
@@ -202,10 +202,11 @@ class ChunkProcessor:
|
|
| 202 |
def process_chunks(
|
| 203 |
self, documents, file_type="txt", source="", page=0, metadata={}
|
| 204 |
):
|
| 205 |
-
|
|
|
|
|
|
|
| 206 |
document_chunks = documents
|
| 207 |
else:
|
| 208 |
-
documents = [Document(page_content=documents, source=source, page=page)]
|
| 209 |
document_chunks = self.splitter.split_documents(documents)
|
| 210 |
|
| 211 |
# add the source and page number back to the metadata
|
|
|
|
| 202 |
def process_chunks(
|
| 203 |
self, documents, file_type="txt", source="", page=0, metadata={}
|
| 204 |
):
|
| 205 |
+
# TODO: Clear up this pipeline of re-adding metadata
|
| 206 |
+
documents = [Document(page_content=documents, source=source, page=page)]
|
| 207 |
+
if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
|
| 208 |
document_chunks = documents
|
| 209 |
else:
|
|
|
|
| 210 |
document_chunks = self.splitter.split_documents(documents)
|
| 211 |
|
| 212 |
# add the source and page number back to the metadata
|
code/modules/dataloader/pdf_readers/gpt.py
CHANGED
|
@@ -23,7 +23,7 @@ class GPTParser:
|
|
| 23 |
The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
|
| 24 |
The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
|
| 25 |
For images, give a description and if you can, a source. Separate each page with '---'.
|
| 26 |
-
Just respond with the markdown.
|
| 27 |
"""
|
| 28 |
|
| 29 |
def parse(self, pdf_path):
|
|
|
|
| 23 |
The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
|
| 24 |
The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
|
| 25 |
For images, give a description and if you can, a source. Separate each page with '---'.
|
| 26 |
+
Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
|
| 27 |
"""
|
| 28 |
|
| 29 |
def parse(self, pdf_path):
|