Spaces:
Build error
Build error
Commit ·
d3ff2e2
1
Parent(s): 3b77fe5
Allowed for custom output folder, returned Dockerfile to work under user account and port 7860
Browse files- Dockerfile +7 -13
- app.py +4 -2
- search_funcs/bm25_functions.py +8 -8
- search_funcs/helper_functions.py +23 -5
- search_funcs/semantic_functions.py +5 -5
- search_funcs/semantic_ingest_functions.py +4 -4
Dockerfile
CHANGED
|
@@ -26,21 +26,15 @@ RUN git lfs install
|
|
| 26 |
RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
|
| 27 |
RUN rm -rf /model/bge/.git
|
| 28 |
|
| 29 |
-
# Expose port 8080
|
| 30 |
-
EXPOSE 8080
|
| 31 |
-
|
| 32 |
# Set up a new user named "user" with user ID 1000
|
| 33 |
-
|
| 34 |
|
| 35 |
# Change ownership of /home/user directory
|
| 36 |
-
|
| 37 |
|
| 38 |
# Make output folder
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
RUN mkdir -p /home/user/app/output
|
| 43 |
-
RUN mkdir -p /home/user/.cache/huggingface/hub
|
| 44 |
|
| 45 |
# Switch to the "user" user
|
| 46 |
USER user
|
|
@@ -53,7 +47,7 @@ ENV HOME=/home/user \
|
|
| 53 |
GRADIO_ALLOW_FLAGGING=never \
|
| 54 |
GRADIO_NUM_PORTS=1 \
|
| 55 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 56 |
-
GRADIO_SERVER_PORT=
|
| 57 |
GRADIO_THEME=huggingface \
|
| 58 |
AWS_STS_REGIONAL_ENDPOINT=regional \
|
| 59 |
#GRADIO_ROOT_PATH=/data-text-search \
|
|
@@ -63,8 +57,8 @@ ENV HOME=/home/user \
|
|
| 63 |
WORKDIR $HOME/app
|
| 64 |
|
| 65 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 66 |
-
|
| 67 |
-
COPY . $HOME/app
|
| 68 |
|
| 69 |
|
| 70 |
CMD ["python", "app.py"]
|
|
|
|
| 26 |
RUN git clone https://huggingface.co/BAAI/bge-small-en-v1.5 /model/bge
|
| 27 |
RUN rm -rf /model/bge/.git
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
# Set up a new user named "user" with user ID 1000
|
| 30 |
+
RUN useradd -m -u 1000 user
|
| 31 |
|
| 32 |
# Change ownership of /home/user directory
|
| 33 |
+
RUN chown -R user:user /home/user
|
| 34 |
|
| 35 |
# Make output folder
|
| 36 |
+
RUN mkdir -p /home/user/app/output && chown -R user:user /home/user/app/output
|
| 37 |
+
RUN mkdir -p /home/user/.cache/huggingface/hub && chown -R user:user /home/user/.cache/huggingface/hub
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# Switch to the "user" user
|
| 40 |
USER user
|
|
|
|
| 47 |
GRADIO_ALLOW_FLAGGING=never \
|
| 48 |
GRADIO_NUM_PORTS=1 \
|
| 49 |
GRADIO_SERVER_NAME=0.0.0.0 \
|
| 50 |
+
GRADIO_SERVER_PORT=7860 \
|
| 51 |
GRADIO_THEME=huggingface \
|
| 52 |
AWS_STS_REGIONAL_ENDPOINT=regional \
|
| 53 |
#GRADIO_ROOT_PATH=/data-text-search \
|
|
|
|
| 57 |
WORKDIR $HOME/app
|
| 58 |
|
| 59 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 60 |
+
COPY --chown=user . $HOME/app
|
| 61 |
+
#COPY . $HOME/app
|
| 62 |
|
| 63 |
|
| 64 |
CMD ["python", "app.py"]
|
app.py
CHANGED
|
@@ -8,14 +8,16 @@ PandasDataFrame = Type[pd.DataFrame]
|
|
| 8 |
from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
|
| 9 |
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
|
| 10 |
from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
|
| 11 |
-
from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder
|
| 12 |
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
|
| 13 |
from search_funcs.aws_functions import load_data_from_aws
|
| 14 |
|
| 15 |
#from fastapi import FastAPI
|
| 16 |
#app = FastAPI()
|
| 17 |
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
temp_folder_path = get_temp_folder_path()
|
| 20 |
empty_folder(temp_folder_path)
|
| 21 |
|
|
|
|
| 8 |
from search_funcs.bm25_functions import prepare_bm25_input_data, prepare_bm25, bm25_search
|
| 9 |
from search_funcs.semantic_ingest_functions import csv_excel_text_to_docs
|
| 10 |
from search_funcs.semantic_functions import docs_to_bge_embed_np_array, bge_simple_retrieval
|
| 11 |
+
from search_funcs.helper_functions import display_info, initial_data_load, put_columns_in_join_df, get_temp_folder_path, empty_folder, output_folder
|
| 12 |
from search_funcs.spacy_search_funcs import spacy_fuzzy_search
|
| 13 |
from search_funcs.aws_functions import load_data_from_aws
|
| 14 |
|
| 15 |
#from fastapi import FastAPI
|
| 16 |
#app = FastAPI()
|
| 17 |
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Attempt to delete temporary files generated by previous runs of the app (these files can be very big!). Only set up to work for local runs on Windows
|
| 21 |
temp_folder_path = get_temp_folder_path()
|
| 22 |
empty_folder(temp_folder_path)
|
| 23 |
|
search_funcs/bm25_functions.py
CHANGED
|
@@ -14,7 +14,7 @@ from datetime import datetime
|
|
| 14 |
today_rev = datetime.now().strftime("%Y%m%d")
|
| 15 |
|
| 16 |
from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
|
| 17 |
-
from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists
|
| 18 |
|
| 19 |
# Load the SpaCy model
|
| 20 |
from spacy.cli.download import download
|
|
@@ -232,7 +232,7 @@ class BM25:
|
|
| 232 |
|
| 233 |
def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
|
| 234 |
#print(in_file)
|
| 235 |
-
ensure_output_folder_exists()
|
| 236 |
|
| 237 |
if not in_file:
|
| 238 |
print("No input file found. Please load in at least one file.")
|
|
@@ -327,9 +327,9 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
|
|
| 327 |
if return_intermediate_files == "Yes":
|
| 328 |
|
| 329 |
if clean == "Yes":
|
| 330 |
-
tokenised_data_file_name =
|
| 331 |
else:
|
| 332 |
-
tokenised_data_file_name =
|
| 333 |
|
| 334 |
pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
|
| 335 |
|
|
@@ -339,7 +339,7 @@ def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, c
|
|
| 339 |
|
| 340 |
def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
|
| 341 |
|
| 342 |
-
ensure_output_folder_exists()
|
| 343 |
|
| 344 |
# Check if the list and the dataframe have the same length
|
| 345 |
if len(prepared_text_list) != len(in_df):
|
|
@@ -347,7 +347,7 @@ def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_col
|
|
| 347 |
|
| 348 |
file_end = ".parquet"
|
| 349 |
|
| 350 |
-
file_name =
|
| 351 |
|
| 352 |
new_text_column = in_bm25_column + "_cleaned"
|
| 353 |
prepared_text_df = pd.DataFrame(data={new_text_column:prepared_text_list})
|
|
@@ -547,10 +547,10 @@ def bm25_search(free_text_query, in_no_search_results, original_data, searched_d
|
|
| 547 |
results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
|
| 548 |
|
| 549 |
# Out file
|
| 550 |
-
ensure_output_folder_exists()
|
| 551 |
|
| 552 |
query_str_file = ("_").join(token_query)
|
| 553 |
-
results_df_name = "
|
| 554 |
|
| 555 |
print("Saving search file output")
|
| 556 |
progress(0.7, desc = "Saving search output to file")
|
|
|
|
| 14 |
today_rev = datetime.now().strftime("%Y%m%d")
|
| 15 |
|
| 16 |
from search_funcs.clean_funcs import initial_clean # get_lemma_tokens, stem_sentence
|
| 17 |
+
from search_funcs.helper_functions import get_file_path_end_with_ext, get_file_path_end, create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
|
| 18 |
|
| 19 |
# Load the SpaCy model
|
| 20 |
from spacy.cli.download import download
|
|
|
|
| 232 |
|
| 233 |
def prepare_bm25_input_data(in_file, text_column, data_state, tokenised_state, clean="No", return_intermediate_files = "No", progress=gr.Progress(track_tqdm=True)):
|
| 234 |
#print(in_file)
|
| 235 |
+
ensure_output_folder_exists(output_folder)
|
| 236 |
|
| 237 |
if not in_file:
|
| 238 |
print("No input file found. Please load in at least one file.")
|
|
|
|
| 327 |
if return_intermediate_files == "Yes":
|
| 328 |
|
| 329 |
if clean == "Yes":
|
| 330 |
+
tokenised_data_file_name = output_folder + data_file_out_name_no_ext + "_cleaned_tokenised.parquet"
|
| 331 |
else:
|
| 332 |
+
tokenised_data_file_name = output_folder + data_file_out_name_no_ext + "_tokenised.parquet"
|
| 333 |
|
| 334 |
pd.DataFrame(data={"Corpus":corpus}).to_parquet(tokenised_data_file_name)
|
| 335 |
|
|
|
|
| 339 |
|
| 340 |
def save_prepared_bm25_data(in_file_name, prepared_text_list, in_df, in_bm25_column, progress=gr.Progress(track_tqdm=True)):
|
| 341 |
|
| 342 |
+
ensure_output_folder_exists(output_folder)
|
| 343 |
|
| 344 |
# Check if the list and the dataframe have the same length
|
| 345 |
if len(prepared_text_list) != len(in_df):
|
|
|
|
| 347 |
|
| 348 |
file_end = ".parquet"
|
| 349 |
|
| 350 |
+
file_name = output_folder + get_file_path_end(in_file_name) + "_cleaned" + file_end
|
| 351 |
|
| 352 |
new_text_column = in_bm25_column + "_cleaned"
|
| 353 |
prepared_text_df = pd.DataFrame(data={new_text_column:prepared_text_list})
|
|
|
|
| 547 |
results_df_out = results_df_out.sort_values(['search_score_abs', "search_text"], ascending=False)
|
| 548 |
|
| 549 |
# Out file
|
| 550 |
+
ensure_output_folder_exists(output_folder)
|
| 551 |
|
| 552 |
query_str_file = ("_").join(token_query)
|
| 553 |
+
results_df_name = output_folder + "keyword_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
|
| 554 |
|
| 555 |
print("Saving search file output")
|
| 556 |
progress(0.7, desc = "Saving search output to file")
|
search_funcs/helper_functions.py
CHANGED
|
@@ -19,6 +19,24 @@ megabyte = 1024 * 1024 # Bytes in a megabyte
|
|
| 19 |
file_size_mb = 500 # Size in megabytes
|
| 20 |
file_size_bytes_500mb = megabyte * file_size_mb
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Attempt to delete content of gradio temp folder
|
| 23 |
def get_temp_folder_path():
|
| 24 |
username = getpass.getuser()
|
|
@@ -58,17 +76,17 @@ def get_file_path_end_with_ext(file_path):
|
|
| 58 |
|
| 59 |
return filename_end
|
| 60 |
|
| 61 |
-
def ensure_output_folder_exists():
|
| 62 |
-
"""Checks if the
|
| 63 |
|
| 64 |
-
folder_name =
|
| 65 |
|
| 66 |
if not os.path.exists(folder_name):
|
| 67 |
# Create the folder if it doesn't exist
|
| 68 |
os.makedirs(folder_name)
|
| 69 |
-
print(f"Created the
|
| 70 |
else:
|
| 71 |
-
print(f"The
|
| 72 |
|
| 73 |
def detect_file_type(filename):
|
| 74 |
"""Detect the file type based on its extension."""
|
|
|
|
| 19 |
file_size_mb = 500 # Size in megabytes
|
| 20 |
file_size_bytes_500mb = megabyte * file_size_mb
|
| 21 |
|
| 22 |
+
def get_or_create_env_var(var_name, default_value):
    """Return the value of an environment variable, creating it if unset.

    Args:
        var_name: Name of the environment variable to look up.
        default_value: Value stored in the environment and returned when the
            variable is not set. Must be a ``str``, because ``os.environ``
            only accepts string values.

    Returns:
        The existing value of the variable, or ``default_value`` when the
        variable had to be created.
    """
    # setdefault performs the "read, and write the default if missing" step in
    # one call: an existing value (even an empty string) is returned untouched.
    return os.environ.setdefault(var_name, default_value)
|
| 32 |
+
|
| 33 |
+
# Retrieving or setting output folder
|
| 34 |
+
env_var_name = 'GRADIO_OUTPUT_FOLDER'
|
| 35 |
+
default_value = 'output/'
|
| 36 |
+
|
| 37 |
+
output_folder = get_or_create_env_var(env_var_name, default_value)
|
| 38 |
+
print(f'The value of {env_var_name} is {output_folder}')
|
| 39 |
+
|
| 40 |
# Attempt to delete content of gradio temp folder
|
| 41 |
def get_temp_folder_path():
|
| 42 |
username = getpass.getuser()
|
|
|
|
| 76 |
|
| 77 |
return filename_end
|
| 78 |
|
| 79 |
+
def ensure_output_folder_exists(output_folder):
    """Create the output folder if it does not exist yet.

    Prints a message saying whether the folder was created or was already
    present.

    Args:
        output_folder: Path of the folder that output files are written to.

    Returns:
        None.
    """
    folder_name = output_folder

    if not os.path.exists(folder_name):
        # exist_ok=True keeps this from raising FileExistsError if the folder
        # is created by something else between the check above and this call.
        os.makedirs(folder_name, exist_ok=True)
        print("Created the output folder:", folder_name)
    else:
        print("The output folder already exists:", folder_name)
|
| 90 |
|
| 91 |
def detect_file_type(filename):
|
| 92 |
"""Detect the file type based on its extension."""
|
search_funcs/semantic_functions.py
CHANGED
|
@@ -25,7 +25,7 @@ else:
|
|
| 25 |
|
| 26 |
print("Device used is: ", torch_device)
|
| 27 |
|
| 28 |
-
from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists
|
| 29 |
|
| 30 |
PandasDataFrame = Type[pd.DataFrame]
|
| 31 |
|
|
@@ -70,7 +70,7 @@ def docs_to_bge_embed_np_array(docs_out, in_file, embeddings_state, output_file_
|
|
| 70 |
Takes a Langchain document class and saves it into a Numpy array.
|
| 71 |
'''
|
| 72 |
|
| 73 |
-
ensure_output_folder_exists()
|
| 74 |
|
| 75 |
if not in_file:
|
| 76 |
out_message = "No input file found. Please load in at least one file."
|
|
@@ -232,7 +232,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
|
|
| 232 |
# print("vectorstore loaded: ", vectorstore)
|
| 233 |
progress(0, desc = "Conducting semantic search")
|
| 234 |
|
| 235 |
-
ensure_output_folder_exists()
|
| 236 |
|
| 237 |
print("Searching")
|
| 238 |
|
|
@@ -297,7 +297,7 @@ def bge_simple_retrieval(query_str:str, vectorstore, docs, orig_df_col:str, k_va
|
|
| 297 |
|
| 298 |
query_str_file = query_str.replace(" ", "_")
|
| 299 |
|
| 300 |
-
results_df_name = "
|
| 301 |
|
| 302 |
print("Saving search output to file")
|
| 303 |
progress(0.7, desc = "Saving search output to file")
|
|
@@ -594,7 +594,7 @@ def chroma_retrieval_deprecated(query_str:str, vectorstore, docs, orig_df_col:st
|
|
| 594 |
|
| 595 |
results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
|
| 596 |
|
| 597 |
-
results_df_name = "
|
| 598 |
results_df_out.to_csv(results_df_name, index= None)
|
| 599 |
results_first_text = results_df_out[orig_df_col].iloc[0]
|
| 600 |
|
|
|
|
| 25 |
|
| 26 |
print("Device used is: ", torch_device)
|
| 27 |
|
| 28 |
+
from search_funcs.helper_functions import create_highlighted_excel_wb, ensure_output_folder_exists, output_folder
|
| 29 |
|
| 30 |
PandasDataFrame = Type[pd.DataFrame]
|
| 31 |
|
|
|
|
| 70 |
Takes a Langchain document class and saves it into a Numpy array.
|
| 71 |
'''
|
| 72 |
|
| 73 |
+
ensure_output_folder_exists(output_folder)
|
| 74 |
|
| 75 |
if not in_file:
|
| 76 |
out_message = "No input file found. Please load in at least one file."
|
|
|
|
| 232 |
# print("vectorstore loaded: ", vectorstore)
|
| 233 |
progress(0, desc = "Conducting semantic search")
|
| 234 |
|
| 235 |
+
ensure_output_folder_exists(output_folder)
|
| 236 |
|
| 237 |
print("Searching")
|
| 238 |
|
|
|
|
| 297 |
|
| 298 |
query_str_file = query_str.replace(" ", "_")
|
| 299 |
|
| 300 |
+
results_df_name = output_folder + "semantic_search_result_" + today_rev + "_" + query_str_file + ".xlsx"
|
| 301 |
|
| 302 |
print("Saving search output to file")
|
| 303 |
progress(0.7, desc = "Saving search output to file")
|
|
|
|
| 594 |
|
| 595 |
results_df_out = process_data_from_scores_df(df_docs, in_join_file, out_passages, vec_score_cut_off, vec_weight, orig_df_col, in_join_column, search_df_join_column)
|
| 596 |
|
| 597 |
+
results_df_name = output_folder + "semantic_search_result.csv"
|
| 598 |
results_df_out.to_csv(results_df_name, index= None)
|
| 599 |
results_first_text = results_df_out[orig_df_col].iloc[0]
|
| 600 |
|
search_funcs/semantic_ingest_functions.py
CHANGED
|
@@ -32,7 +32,7 @@ chunk_overlap = 0
|
|
| 32 |
start_index = True
|
| 33 |
|
| 34 |
from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
|
| 35 |
-
from search_funcs.bm25_functions import save_prepared_bm25_data
|
| 36 |
from search_funcs.clean_funcs import initial_clean
|
| 37 |
|
| 38 |
def parse_file_not_used(file_paths, text_column='text'):
|
|
@@ -198,7 +198,7 @@ def parse_metadata(row):
|
|
| 198 |
def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
|
| 199 |
"""Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
|
| 200 |
|
| 201 |
-
ensure_output_folder_exists()
|
| 202 |
output_list = []
|
| 203 |
|
| 204 |
if not in_file:
|
|
@@ -305,7 +305,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
|
|
| 305 |
|
| 306 |
if clean == "No":
|
| 307 |
#pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
|
| 308 |
-
out_doc_file_name =
|
| 309 |
with gzip.open(out_doc_file_name, 'wb') as file:
|
| 310 |
pickle.dump(doc_sections, file)
|
| 311 |
|
|
@@ -313,7 +313,7 @@ def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_interm
|
|
| 313 |
elif clean == "Yes":
|
| 314 |
#pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
|
| 315 |
|
| 316 |
-
out_doc_file_name =
|
| 317 |
with gzip.open(out_doc_file_name, 'wb') as file:
|
| 318 |
pickle.dump(doc_sections, file)
|
| 319 |
|
|
|
|
| 32 |
start_index = True
|
| 33 |
|
| 34 |
from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end, ensure_output_folder_exists
|
| 35 |
+
from search_funcs.bm25_functions import save_prepared_bm25_data, output_folder
|
| 36 |
from search_funcs.clean_funcs import initial_clean
|
| 37 |
|
| 38 |
def parse_file_not_used(file_paths, text_column='text'):
|
|
|
|
| 198 |
def csv_excel_text_to_docs(df, in_file, text_column, clean = "No", return_intermediate_files = "No", chunk_size=None, progress=gr.Progress(track_tqdm=True)) -> List[Document]:
|
| 199 |
"""Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""
|
| 200 |
|
| 201 |
+
ensure_output_folder_exists(output_folder)
|
| 202 |
output_list = []
|
| 203 |
|
| 204 |
if not in_file:
|
|
|
|
| 305 |
|
| 306 |
if clean == "No":
|
| 307 |
#pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")
|
| 308 |
+
out_doc_file_name = output_folder + file_name + "_prepared_docs.pkl.gz"
|
| 309 |
with gzip.open(out_doc_file_name, 'wb') as file:
|
| 310 |
pickle.dump(doc_sections, file)
|
| 311 |
|
|
|
|
| 313 |
elif clean == "Yes":
|
| 314 |
#pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")
|
| 315 |
|
| 316 |
+
out_doc_file_name = output_folder + file_name + "_cleaned_prepared_docs.pkl.gz"
|
| 317 |
with gzip.open(out_doc_file_name, 'wb') as file:
|
| 318 |
pickle.dump(doc_sections, file)
|
| 319 |
|