Commit
·
0c818aa
1
Parent(s):
a73f005
Returned to bge-base embeddings for now. Improved UI a little
Browse files
- app.py +29 -29
- chatfuncs/ingest.py +27 -32
- chatfuncs/ingest_borough_plan.py +1 -1
app.py
CHANGED
|
@@ -45,7 +45,7 @@ import chatfuncs.ingest as ing
|
|
| 45 |
# Load preset embeddings, vectorstore, and model
|
| 46 |
###
|
| 47 |
|
| 48 |
-
embeddings_name =
|
| 49 |
|
| 50 |
def load_embeddings(embeddings_name = embeddings_name):
|
| 51 |
|
|
@@ -185,7 +185,7 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
|
|
| 185 |
# RUN UI
|
| 186 |
###
|
| 187 |
|
| 188 |
-
app = gr.Blocks(theme = gr.themes.Base())#css=".gradio-container {background-color: black}")
|
| 189 |
|
| 190 |
with app:
|
| 191 |
ingest_text = gr.State()
|
|
@@ -243,9 +243,9 @@ with app:
|
|
| 243 |
lines=1,
|
| 244 |
)
|
| 245 |
with gr.Row():
|
| 246 |
-
submit = gr.Button(value="Send message", variant="
|
| 247 |
-
clear = gr.Button(value="Clear chat", variant="secondary", scale=
|
| 248 |
-
stop = gr.Button(value="Stop generating", variant="secondary", scale=
|
| 249 |
|
| 250 |
examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
|
| 251 |
#value = "What were the five pillars of the previous borough plan?",
|
|
@@ -296,45 +296,45 @@ with app:
|
|
| 296 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
| 297 |
|
| 298 |
change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
|
| 304 |
# Load in a pdf
|
| 305 |
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
|
| 310 |
# Load in a webpage
|
| 311 |
load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
|
| 316 |
# Load in a csv/excel file
|
| 317 |
load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
|
| 322 |
# Load in a webpage
|
| 323 |
|
| 324 |
# Click/enter to send message action
|
| 325 |
response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
response_click.
|
| 329 |
-
|
| 330 |
-
|
| 331 |
|
| 332 |
response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
response_enter.
|
| 336 |
-
|
| 337 |
-
|
| 338 |
|
| 339 |
# Stop box
|
| 340 |
stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
|
|
@@ -356,7 +356,7 @@ with app:
|
|
| 356 |
access_callback.setup([session_hash_textbox], access_logs_data_folder)
|
| 357 |
|
| 358 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
| 359 |
-
|
| 360 |
|
| 361 |
# Launch the Gradio app
|
| 362 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
|
|
|
| 45 |
# Load preset embeddings, vectorstore, and model
|
| 46 |
###
|
| 47 |
|
| 48 |
+
embeddings_name = "BAAI/bge-base-en-v1.5" #"mixedbread-ai/mxbai-embed-xsmall-v1"
|
| 49 |
|
| 50 |
def load_embeddings(embeddings_name = embeddings_name):
|
| 51 |
|
|
|
|
| 185 |
# RUN UI
|
| 186 |
###
|
| 187 |
|
| 188 |
+
app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)#css=".gradio-container {background-color: black}")
|
| 189 |
|
| 190 |
with app:
|
| 191 |
ingest_text = gr.State()
|
|
|
|
| 243 |
lines=1,
|
| 244 |
)
|
| 245 |
with gr.Row():
|
| 246 |
+
submit = gr.Button(value="Send message", variant="primary", scale = 4)
|
| 247 |
+
clear = gr.Button(value="Clear chat", variant="secondary", scale=1)
|
| 248 |
+
stop = gr.Button(value="Stop generating", variant="secondary", scale=1)
|
| 249 |
|
| 250 |
examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
|
| 251 |
#value = "What were the five pillars of the previous borough plan?",
|
|
|
|
| 296 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
| 297 |
|
| 298 |
change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
| 299 |
+
success(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
|
| 300 |
+
success(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
|
| 301 |
+
success(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
|
| 302 |
+
success(lambda: None, None, chatbot, queue=False)
|
| 303 |
|
| 304 |
# Load in a pdf
|
| 305 |
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
|
| 306 |
+
success(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
|
| 307 |
+
success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
|
| 308 |
+
success(chatf.hide_block, outputs = [examples_set])
|
| 309 |
|
| 310 |
# Load in a webpage
|
| 311 |
load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
|
| 312 |
+
success(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
|
| 313 |
+
success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
|
| 314 |
+
success(chatf.hide_block, outputs = [examples_set])
|
| 315 |
|
| 316 |
# Load in a csv/excel file
|
| 317 |
load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
|
| 318 |
+
success(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
|
| 319 |
+
success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
|
| 320 |
+
success(chatf.hide_block, outputs = [examples_set])
|
| 321 |
|
| 322 |
# Load in a webpage
|
| 323 |
|
| 324 |
# Click/enter to send message action
|
| 325 |
response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
|
| 326 |
+
success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
| 327 |
+
success(chatf.produce_streaming_answer_chatbot, inputs=[chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], outputs=chatbot)
|
| 328 |
+
response_click.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
|
| 329 |
+
success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
|
| 330 |
+
success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
|
| 331 |
|
| 332 |
response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
|
| 333 |
+
success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
| 334 |
+
success(chatf.produce_streaming_answer_chatbot, [chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], chatbot)
|
| 335 |
+
response_enter.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
|
| 336 |
+
success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
|
| 337 |
+
success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
|
| 338 |
|
| 339 |
# Stop box
|
| 340 |
stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
|
|
|
|
| 356 |
access_callback.setup([session_hash_textbox], access_logs_data_folder)
|
| 357 |
|
| 358 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
| 359 |
+
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 360 |
|
| 361 |
# Launch the Gradio app
|
| 362 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
chatfuncs/ingest.py
CHANGED
|
@@ -7,6 +7,7 @@ import requests
|
|
| 7 |
import pandas as pd
|
| 8 |
import dateutil.parser
|
| 9 |
from typing import Type, List
|
|
|
|
| 10 |
|
| 11 |
from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
|
| 12 |
from langchain_community.vectorstores.faiss import FAISS
|
|
@@ -573,56 +574,50 @@ def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
|
|
| 573 |
|
| 574 |
return embeddings_func
|
| 575 |
|
| 576 |
-
def embed_faiss_save_to_zip(docs_out, save_to="output", model_name
|
| 577 |
-
|
| 578 |
load_embeddings(model_name=model_name)
|
| 579 |
|
| 580 |
-
#embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 581 |
-
|
| 582 |
print(f"> Total split documents: {len(docs_out)}")
|
| 583 |
|
| 584 |
vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
|
| 585 |
|
| 586 |
-
|
| 587 |
-
|
| 588 |
|
| 589 |
-
|
| 590 |
-
vectorstore.save_local(folder_path=save_to)
|
| 591 |
|
| 592 |
-
print(">
|
| 593 |
print(f"> Saved to: {save_to}")
|
| 594 |
|
| 595 |
-
|
|
|
|
|
|
|
| 596 |
|
| 597 |
-
|
|
|
|
| 598 |
|
| 599 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
|
| 601 |
-
|
| 602 |
-
|
| 603 |
|
| 604 |
-
|
|
|
|
|
|
|
| 605 |
|
| 606 |
-
|
|
|
|
| 607 |
|
| 608 |
-
|
|
|
|
|
|
|
|
|
|
| 609 |
|
| 610 |
-
return out_message, vectorstore, save_zip_out
|
| 611 |
|
| 612 |
-
def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
|
| 613 |
-
print(f"> Total split documents: {len(docs_out)}")
|
| 614 |
-
|
| 615 |
-
vectordb = Chroma.from_documents(documents=docs_out,
|
| 616 |
-
embedding=embeddings,
|
| 617 |
-
persist_directory=save_to)
|
| 618 |
-
|
| 619 |
-
# persiste the db to disk
|
| 620 |
-
vectordb.persist()
|
| 621 |
-
|
| 622 |
-
print("> DONE")
|
| 623 |
-
print(f"> Saved to: {save_to}")
|
| 624 |
-
|
| 625 |
-
return vectordb
|
| 626 |
|
| 627 |
def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
|
| 628 |
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
import dateutil.parser
|
| 9 |
from typing import Type, List
|
| 10 |
+
import shutil
|
| 11 |
|
| 12 |
from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
|
| 13 |
from langchain_community.vectorstores.faiss import FAISS
|
|
|
|
| 574 |
|
| 575 |
return embeddings_func
|
| 576 |
|
| 577 |
+
def embed_faiss_save_to_zip(docs_out, save_to: str = "output", model_name: str = "BAAI/bge-base-en-v1.5"):
    """Embed documents into a FAISS index, save it to disk and zip the result.

    The index is written to the ``save_to`` folder, the folder contents are
    archived to ``<save_to>.zip`` next to that folder, and the raw
    ``index.faiss`` / ``index.pkl`` files are then deleted so that only the
    archive remains on disk.

    Args:
        docs_out: The split documents to embed. Passed straight to
            ``FAISS.from_documents`` (presumably a list of LangChain
            ``Document`` objects - confirm against the ``*_to_docs`` helpers).
        save_to: Folder the FAISS index is written to; created if missing.
        model_name: Name of the embedding model handed to ``load_embeddings``.

    Returns:
        A 3-tuple ``(status_message, vectorstore, final_zip_path)`` where
        ``final_zip_path`` is the archive path returned by
        ``shutil.make_archive``.

    Raises:
        FileNotFoundError: If ``index.faiss`` or ``index.pkl`` is missing from
            ``save_to`` after the index has been saved.
    """
    # Use the embeddings object returned for *this* model_name. The previous
    # code discarded the return value and read a name `embeddings` that is not
    # bound in this function, so it depended on whatever a module-level global
    # happened to hold.
    embeddings = load_embeddings(model_name=model_name)

    print(f"> Total split documents: {len(docs_out)}")

    vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)

    save_to_path = Path(save_to)
    save_to_path.mkdir(parents=True, exist_ok=True)

    vectorstore.save_local(folder_path=str(save_to_path))

    print("> FAISS index saved")
    print(f"> Saved to: {save_to}")

    # Both files must exist before archiving, otherwise the zip would be
    # silently incomplete.
    index_faiss = save_to_path / "index.faiss"
    index_pkl = save_to_path / "index.pkl"

    if not index_faiss.exists() or not index_pkl.exists():
        raise FileNotFoundError("Expected FAISS index files not found before zipping.")

    # Flush file system writes by forcing a sync (works best on Unix)
    try:
        os.sync()
    except AttributeError:
        pass  # os.sync() not available on Windows

    # Archive the *contents* of the folder; the zip itself is created beside
    # the folder as "<save_to>.zip", not inside it.
    final_zip_path = shutil.make_archive(str(save_to_path), 'zip', root_dir=str(save_to_path))

    # Remove individual index files to avoid leaking large raw files.
    # The (now possibly empty) folder itself is left in place.
    index_faiss.unlink(missing_ok=True)
    index_pkl.unlink(missing_ok=True)

    print("> Archive complete")
    print(f"> Final ZIP path: {final_zip_path}")

    return "Document processing complete", vectorstore, final_zip_path
|
| 619 |
|
|
|
|
| 620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
|
| 622 |
def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
|
| 623 |
|
chatfuncs/ingest_borough_plan.py
CHANGED
|
@@ -8,7 +8,7 @@ print(borough_plan_text)
|
|
| 8 |
borough_plan_docs = ing.text_to_docs(borough_plan_text)
|
| 9 |
print("Borough plan docs created")
|
| 10 |
|
| 11 |
-
embedding_model =
|
| 12 |
|
| 13 |
embeddings = ing.load_embeddings(model_name = embedding_model)
|
| 14 |
ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
|
|
|
|
| 8 |
borough_plan_docs = ing.text_to_docs(borough_plan_text)
|
| 9 |
print("Borough plan docs created")
|
| 10 |
|
| 11 |
+
embedding_model = "BAAI/bge-base-en-v1.5" # "mixedbread-ai/mxbai-embed-xsmall-v1" #
|
| 12 |
|
| 13 |
embeddings = ing.load_embeddings(model_name = embedding_model)
|
| 14 |
ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
|