Spaces:
Build error
Build error
Farid Karimli
committed on
Commit
·
c26167a
1
Parent(s):
49a1201
Chunking method selector and patches
Browse files
code/main.py
CHANGED
|
@@ -66,16 +66,19 @@ class Chatbot:
|
|
| 66 |
async def setup_llm(self):
|
| 67 |
"""
|
| 68 |
Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
|
|
|
|
|
|
|
| 69 |
"""
|
| 70 |
start_time = time.time()
|
| 71 |
|
| 72 |
llm_settings = cl.user_session.get("llm_settings", {})
|
| 73 |
-
chat_profile, retriever_method, memory_window, llm_style, generate_follow_up = (
|
| 74 |
llm_settings.get("chat_model"),
|
| 75 |
llm_settings.get("retriever_method"),
|
| 76 |
llm_settings.get("memory_window"),
|
| 77 |
llm_settings.get("llm_style"),
|
| 78 |
llm_settings.get("follow_up_questions"),
|
|
|
|
| 79 |
)
|
| 80 |
|
| 81 |
chain = cl.user_session.get("chain")
|
|
@@ -95,6 +98,7 @@ class Chatbot:
|
|
| 95 |
self.config["llm_params"]["llm_style"] = llm_style
|
| 96 |
self.config["llm_params"]["llm_loader"] = chat_profile
|
| 97 |
self.config["llm_params"]["generate_follow_up"] = generate_follow_up
|
|
|
|
| 98 |
|
| 99 |
self.llm_tutor.update_llm(
|
| 100 |
old_config, self.config
|
|
@@ -172,6 +176,12 @@ class Chatbot:
|
|
| 172 |
label="Stream response",
|
| 173 |
initial=config["llm_params"]["stream"],
|
| 174 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
cl.input_widget.Switch(
|
| 176 |
id="follow_up_questions",
|
| 177 |
label="Generate follow up questions",
|
|
|
|
| 66 |
async def setup_llm(self):
|
| 67 |
"""
|
| 68 |
Set up the LLM with the provided settings. Update the configuration and initialize the LLM tutor.
|
| 69 |
+
|
| 70 |
+
#TODO: Clean this up.
|
| 71 |
"""
|
| 72 |
start_time = time.time()
|
| 73 |
|
| 74 |
llm_settings = cl.user_session.get("llm_settings", {})
|
| 75 |
+
chat_profile, retriever_method, memory_window, llm_style, generate_follow_up, chunking_mode = (
|
| 76 |
llm_settings.get("chat_model"),
|
| 77 |
llm_settings.get("retriever_method"),
|
| 78 |
llm_settings.get("memory_window"),
|
| 79 |
llm_settings.get("llm_style"),
|
| 80 |
llm_settings.get("follow_up_questions"),
|
| 81 |
+
llm_settings.get("chunking_mode"),
|
| 82 |
)
|
| 83 |
|
| 84 |
chain = cl.user_session.get("chain")
|
|
|
|
| 98 |
self.config["llm_params"]["llm_style"] = llm_style
|
| 99 |
self.config["llm_params"]["llm_loader"] = chat_profile
|
| 100 |
self.config["llm_params"]["generate_follow_up"] = generate_follow_up
|
| 101 |
+
self.config["splitter_options"]["chunking_mode"] = chunking_mode
|
| 102 |
|
| 103 |
self.llm_tutor.update_llm(
|
| 104 |
old_config, self.config
|
|
|
|
| 176 |
label="Stream response",
|
| 177 |
initial=config["llm_params"]["stream"],
|
| 178 |
),
|
| 179 |
+
cl.input_widget.Select(
|
| 180 |
+
id="chunking_mode",
|
| 181 |
+
label="Chunking mode",
|
| 182 |
+
values=['fixed', 'semantic'],
|
| 183 |
+
initial_index=1,
|
| 184 |
+
),
|
| 185 |
cl.input_widget.Switch(
|
| 186 |
id="follow_up_questions",
|
| 187 |
label="Generate follow up questions",
|
code/modules/dataloader/data_loader.py
CHANGED
|
@@ -202,10 +202,11 @@ class ChunkProcessor:
|
|
| 202 |
def process_chunks(
|
| 203 |
self, documents, file_type="txt", source="", page=0, metadata={}
|
| 204 |
):
|
| 205 |
-
|
|
|
|
|
|
|
| 206 |
document_chunks = documents
|
| 207 |
else:
|
| 208 |
-
documents = [Document(page_content=documents, source=source, page=page)]
|
| 209 |
document_chunks = self.splitter.split_documents(documents)
|
| 210 |
|
| 211 |
# add the source and page number back to the metadata
|
|
|
|
| 202 |
def process_chunks(
|
| 203 |
self, documents, file_type="txt", source="", page=0, metadata={}
|
| 204 |
):
|
| 205 |
+
# TODO: Clear up this pipeline of re-adding metadata
|
| 206 |
+
documents = [Document(page_content=documents, source=source, page=page)]
|
| 207 |
+
if file_type == "pdf" and self.config["splitter_options"]["chunking_mode"] == "fixed":
|
| 208 |
document_chunks = documents
|
| 209 |
else:
|
|
|
|
| 210 |
document_chunks = self.splitter.split_documents(documents)
|
| 211 |
|
| 212 |
# add the source and page number back to the metadata
|
code/modules/dataloader/pdf_readers/gpt.py
CHANGED
|
@@ -23,7 +23,7 @@ class GPTParser:
|
|
| 23 |
The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
|
| 24 |
The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
|
| 25 |
For images, give a description and if you can, a source. Separate each page with '---'.
|
| 26 |
-
Just respond with the markdown.
|
| 27 |
"""
|
| 28 |
|
| 29 |
def parse(self, pdf_path):
|
|
|
|
| 23 |
The goal is to extract the text, images and equations from the slides and convert everything to markdown format. Some of the equations may be complicated.
|
| 24 |
The markdown should be clean and easy to read, and any math equation should be converted to LaTeX, between $$.
|
| 25 |
For images, give a description and if you can, a source. Separate each page with '---'.
|
| 26 |
+
Just respond with the markdown. Do not include page numbers or any other metadata. Do not try to provide titles. Strictly the content.
|
| 27 |
"""
|
| 28 |
|
| 29 |
def parse(self, pdf_path):
|