Spaces:
Running
Running
Commit
·
cc495e1
1
Parent(s):
49e0db8
Rearranged functions for embeddings creation to be compatible with zero GPU space. Updated packages.
Browse files- README.md +1 -1
- app.py +1 -1
- funcs/clean_funcs.py +10 -4
- funcs/embeddings.py +36 -7
- funcs/topic_core_funcs.py +68 -90
- requirements.txt +8 -6
- requirements_aws.txt +1 -1
- requirements_gpu.txt +3 -4
README.md
CHANGED
|
@@ -4,7 +4,7 @@ emoji: 🚀
|
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 5.
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
license: apache-2.0
|
|
|
|
| 4 |
colorFrom: red
|
| 5 |
colorTo: yellow
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 5.8.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: true
|
| 10 |
license: apache-2.0
|
app.py
CHANGED
|
@@ -76,7 +76,7 @@ with app:
|
|
| 76 |
|
| 77 |
with gr.Accordion("Clean data", open = False):
|
| 78 |
with gr.Row():
|
| 79 |
-
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII,
|
| 80 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
| 81 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
|
| 82 |
#with gr.Row():
|
|
|
|
| 76 |
|
| 77 |
with gr.Accordion("Clean data", open = False):
|
| 78 |
with gr.Row():
|
| 79 |
+
clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, large numbers, emails, postcodes (UK).")
|
| 80 |
drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
|
| 81 |
anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
|
| 82 |
#with gr.Row():
|
funcs/clean_funcs.py
CHANGED
|
@@ -2,6 +2,7 @@ import re
|
|
| 2 |
import string
|
| 3 |
import unicodedata
|
| 4 |
import polars as pl
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
|
| 7 |
# Adding custom words to the stopwords
|
|
@@ -15,15 +16,18 @@ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
|
| 15 |
non_ascii_pattern = r'[^\x00-\x7F]+'
|
| 16 |
email_pattern_regex = r'\S*@\S*\s?'
|
| 17 |
num_pattern_regex = r'[0-9]+'
|
| 18 |
-
|
|
|
|
|
|
|
| 19 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
| 20 |
multiple_spaces_regex = r'\s{2,}'
|
| 21 |
multiple_new_lines_regex = r'(\r\n|\n)+'
|
|
|
|
| 22 |
|
| 23 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
| 24 |
|
| 25 |
for text in texts:
|
| 26 |
-
if not text:
|
| 27 |
text = ""
|
| 28 |
|
| 29 |
# Normalize unicode characters to decompose any special forms
|
|
@@ -53,10 +57,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
|
| 53 |
(html_start_pattern_end_dots_regex, ' '),
|
| 54 |
(non_ascii_pattern, ' '),
|
| 55 |
(email_pattern_regex, ' '),
|
| 56 |
-
(
|
| 57 |
(postcode_pattern_regex, ' '),
|
| 58 |
(multiple_spaces_regex, ' '),
|
| 59 |
-
(
|
|
|
|
|
|
|
| 60 |
]
|
| 61 |
|
| 62 |
# Apply each regex replacement
|
|
|
|
| 2 |
import string
|
| 3 |
import unicodedata
|
| 4 |
import polars as pl
|
| 5 |
+
import pandas as pd
|
| 6 |
import gradio as gr
|
| 7 |
|
| 8 |
# Adding custom words to the stopwords
|
|
|
|
| 16 |
non_ascii_pattern = r'[^\x00-\x7F]+'
|
| 17 |
email_pattern_regex = r'\S*@\S*\s?'
|
| 18 |
num_pattern_regex = r'[0-9]+'
|
| 19 |
+
and_sign_regex = r'&'
|
| 20 |
+
forward_slash_regex = r'/'
|
| 21 |
+
nums_five_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{5,}\b|\b[0-9]+\s[0-9]+\b' # Should match five digit numbers or more, and also if there are full stops or commas in between
|
| 22 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
| 23 |
multiple_spaces_regex = r'\s{2,}'
|
| 24 |
multiple_new_lines_regex = r'(\r\n|\n)+'
|
| 25 |
+
multiple_punctuation_regex = r"(\p{P})\p{P}+"
|
| 26 |
|
| 27 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
| 28 |
|
| 29 |
for text in texts:
|
| 30 |
+
if not text or pd.isnull(text):
|
| 31 |
text = ""
|
| 32 |
|
| 33 |
# Normalize unicode characters to decompose any special forms
|
|
|
|
| 57 |
(html_start_pattern_end_dots_regex, ' '),
|
| 58 |
(non_ascii_pattern, ' '),
|
| 59 |
(email_pattern_regex, ' '),
|
| 60 |
+
(nums_five_more_regex, ' '),
|
| 61 |
(postcode_pattern_regex, ' '),
|
| 62 |
(multiple_spaces_regex, ' '),
|
| 63 |
+
(multiple_punctuation_regex, "${1}"),
|
| 64 |
+
(and_sign_regex, 'and')#,
|
| 65 |
+
#(forward_slash_regex, 'or')
|
| 66 |
]
|
| 67 |
|
| 68 |
# Apply each regex replacement
|
funcs/embeddings.py
CHANGED
|
@@ -1,7 +1,12 @@
|
|
| 1 |
import time
|
| 2 |
import numpy as np
|
| 3 |
import os
|
|
|
|
| 4 |
from torch import cuda, backends, version
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
# Check for torch cuda
|
| 7 |
# If you want to disable cuda for testing purposes
|
|
@@ -18,11 +23,9 @@ else:
|
|
| 18 |
torch_device = "cpu"
|
| 19 |
high_quality_mode = "No"
|
| 20 |
|
| 21 |
-
print("Device used is: ", torch_device)
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embedding_model, embeddings_super_compress: str, high_quality_mode_opt: str) -> np.ndarray:
|
| 26 |
"""
|
| 27 |
Create or load embeddings for the given documents.
|
| 28 |
|
|
@@ -30,7 +33,6 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
|
|
| 30 |
docs (list): List of documents to embed.
|
| 31 |
file_list (list): List of file names to check for existing embeddings.
|
| 32 |
embeddings_out (np.ndarray): Array to store the embeddings.
|
| 33 |
-
embedding_model: Model used to generate embeddings.
|
| 34 |
embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
|
| 35 |
high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
|
| 36 |
|
|
@@ -38,6 +40,33 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
|
|
| 38 |
np.ndarray: The generated or loaded embeddings.
|
| 39 |
"""
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
# If no embeddings found, make or load in
|
| 42 |
if embeddings_out.size == 0:
|
| 43 |
print("Embeddings not found. Loading or generating new ones.")
|
|
@@ -84,9 +113,9 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
|
|
| 84 |
embeddings_out = np.round(embeddings_out, 3)
|
| 85 |
embeddings_out *= 100
|
| 86 |
|
| 87 |
-
return embeddings_out
|
| 88 |
|
| 89 |
else:
|
| 90 |
print("Found pre-loaded embeddings.")
|
| 91 |
|
| 92 |
-
return embeddings_out
|
|
|
|
| 1 |
import time
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
+
import spaces
|
| 5 |
from torch import cuda, backends, version
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
from sklearn.pipeline import make_pipeline
|
| 8 |
+
from sklearn.decomposition import TruncatedSVD
|
| 9 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 10 |
|
| 11 |
# Check for torch cuda
|
| 12 |
# If you want to disable cuda for testing purposes
|
|
|
|
| 23 |
torch_device = "cpu"
|
| 24 |
high_quality_mode = "No"
|
| 25 |
|
|
|
|
| 26 |
|
| 27 |
+
@spaces.GPU
|
| 28 |
+
def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1") -> np.ndarray:
|
|
|
|
| 29 |
"""
|
| 30 |
Create or load embeddings for the given documents.
|
| 31 |
|
|
|
|
| 33 |
docs (list): List of documents to embed.
|
| 34 |
file_list (list): List of file names to check for existing embeddings.
|
| 35 |
embeddings_out (np.ndarray): Array to store the embeddings.
|
|
|
|
| 36 |
embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
|
| 37 |
high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
|
| 38 |
|
|
|
|
| 40 |
np.ndarray: The generated or loaded embeddings.
|
| 41 |
"""
|
| 42 |
|
| 43 |
+
if high_quality_mode_opt == "Yes":
|
| 44 |
+
# Define a list of possible local locations to search for the model
|
| 45 |
+
local_embeddings_locations = [
|
| 46 |
+
"model/embed/", # Potential local location
|
| 47 |
+
"/model/embed/", # Potential location in Docker container
|
| 48 |
+
"/home/user/app/model/embed/" # This is inside a Docker container
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
# Attempt to load the model from each local location
|
| 52 |
+
for location in local_embeddings_locations:
|
| 53 |
+
try:
|
| 54 |
+
embedding_model = SentenceTransformer(location)#, truncate_dim=512)
|
| 55 |
+
print(f"Found local model installation at: {location}")
|
| 56 |
+
break # Exit the loop if the model is found
|
| 57 |
+
except Exception as e:
|
| 58 |
+
print(f"Failed to load model from {location}: {e}")
|
| 59 |
+
continue
|
| 60 |
+
else:
|
| 61 |
+
# If the loop completes without finding the model in any local location
|
| 62 |
+
embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
|
| 63 |
+
print("Could not find local model installation. Downloading from Huggingface")
|
| 64 |
+
else:
|
| 65 |
+
embedding_model = make_pipeline(
|
| 66 |
+
TfidfVectorizer(),
|
| 67 |
+
TruncatedSVD(100, random_state=random_seed)
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
# If no embeddings found, make or load in
|
| 71 |
if embeddings_out.size == 0:
|
| 72 |
print("Embeddings not found. Loading or generating new ones.")
|
|
|
|
| 113 |
embeddings_out = np.round(embeddings_out, 3)
|
| 114 |
embeddings_out *= 100
|
| 115 |
|
| 116 |
+
return embeddings_out, embedding_model
|
| 117 |
|
| 118 |
else:
|
| 119 |
print("Found pre-loaded embeddings.")
|
| 120 |
|
| 121 |
+
return embeddings_out, embedding_model
|
funcs/topic_core_funcs.py
CHANGED
|
@@ -7,6 +7,7 @@ import pandas as pd
|
|
| 7 |
import numpy as np
|
| 8 |
import time
|
| 9 |
from bertopic import BERTopic
|
|
|
|
| 10 |
|
| 11 |
from typing import List, Type, Union
|
| 12 |
PandasDataFrame = Type[pd.DataFrame]
|
|
@@ -17,13 +18,7 @@ from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder
|
|
| 17 |
from funcs.embeddings import make_or_load_embeddings, torch_device
|
| 18 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
| 19 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
|
| 20 |
-
|
| 21 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 22 |
-
|
| 23 |
-
from sentence_transformers import SentenceTransformer
|
| 24 |
-
from sklearn.pipeline import make_pipeline
|
| 25 |
-
from sklearn.decomposition import TruncatedSVD
|
| 26 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 27 |
import funcs.anonymiser as anon
|
| 28 |
from umap import UMAP
|
| 29 |
|
|
@@ -96,84 +91,88 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
|
|
| 96 |
output_list = []
|
| 97 |
#file_list = [string.name for string in in_files]
|
| 98 |
|
| 99 |
-
in_colnames_list_first
|
| 100 |
|
| 101 |
-
|
| 102 |
-
if not "original_index" in data.columns:
|
| 103 |
-
data = data.reset_index(names="original_index")
|
| 104 |
|
| 105 |
-
|
| 106 |
-
clean_tic = time.perf_counter()
|
| 107 |
-
print("Starting data clean.")
|
| 108 |
|
| 109 |
-
|
|
|
|
|
|
|
| 110 |
|
| 111 |
-
if
|
| 112 |
-
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
| 116 |
-
print(clean_time_out)
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
|
| 126 |
-
|
| 127 |
-
|
|
|
|
| 128 |
|
| 129 |
-
|
|
|
|
|
|
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
| 134 |
-
data = data[data[in_colnames_list_first].str.len() >= 50]
|
| 135 |
-
data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
|
| 136 |
-
|
| 137 |
-
#print("Data shape after duplicate/null removal: ", data.shape)
|
| 138 |
|
| 139 |
-
|
| 140 |
-
progress(0.4, desc= "Anonymising data")
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
|
| 148 |
|
| 149 |
-
|
|
|
|
| 150 |
|
| 151 |
-
|
|
|
|
|
|
|
| 152 |
|
| 153 |
-
|
| 154 |
-
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
| 155 |
|
| 156 |
-
|
| 157 |
|
| 158 |
-
|
| 159 |
-
|
| 160 |
|
| 161 |
-
|
| 162 |
-
data_file_name_no_ext = data_file_name_no_ext + "_split"
|
| 163 |
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
|
| 174 |
-
|
| 175 |
|
| 176 |
-
|
| 177 |
|
| 178 |
out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
|
| 179 |
data.to_csv(out_data_name)
|
|
@@ -299,27 +298,6 @@ def extract_topics(
|
|
| 299 |
if high_quality_mode == "Yes":
|
| 300 |
print("Using high quality embedding model")
|
| 301 |
|
| 302 |
-
# Define a list of possible local locations to search for the model
|
| 303 |
-
local_embeddings_locations = [
|
| 304 |
-
"model/embed/", # Potential local location
|
| 305 |
-
"/model/embed/", # Potential location in Docker container
|
| 306 |
-
"/home/user/app/model/embed/" # This is inside a Docker container
|
| 307 |
-
]
|
| 308 |
-
|
| 309 |
-
# Attempt to load the model from each local location
|
| 310 |
-
for location in local_embeddings_locations:
|
| 311 |
-
try:
|
| 312 |
-
embedding_model = SentenceTransformer(location)#, truncate_dim=512)
|
| 313 |
-
print(f"Found local model installation at: {location}")
|
| 314 |
-
break # Exit the loop if the model is found
|
| 315 |
-
except Exception as e:
|
| 316 |
-
print(f"Failed to load model from {location}: {e}")
|
| 317 |
-
continue
|
| 318 |
-
else:
|
| 319 |
-
# If the loop completes without finding the model in any local location
|
| 320 |
-
embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
|
| 321 |
-
print("Could not find local model installation. Downloading from Huggingface")
|
| 322 |
-
|
| 323 |
#embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
| 324 |
|
| 325 |
# If tfidf embeddings currently exist, wipe these empty
|
|
@@ -329,15 +307,15 @@ def extract_topics(
|
|
| 329 |
embeddings_type_state = "large"
|
| 330 |
|
| 331 |
# UMAP model uses Bertopic defaults
|
| 332 |
-
umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
|
| 333 |
|
| 334 |
else:
|
| 335 |
print("Choosing low resource TF-IDF model.")
|
| 336 |
|
| 337 |
-
embedding_model = make_pipeline(
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
|
| 342 |
# If large embeddings currently exist, wipe these empty, then rename embeddings type
|
| 343 |
if embeddings_type_state == "large":
|
|
@@ -346,10 +324,10 @@ def extract_topics(
|
|
| 346 |
embeddings_type_state = "tfidf"
|
| 347 |
|
| 348 |
#umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
| 349 |
-
|
| 350 |
-
|
| 351 |
|
| 352 |
-
embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out,
|
| 353 |
|
| 354 |
# If you want to save your embedding files
|
| 355 |
if return_intermediate_files == "Yes":
|
|
|
|
| 7 |
import numpy as np
|
| 8 |
import time
|
| 9 |
from bertopic import BERTopic
|
| 10 |
+
import spaces
|
| 11 |
|
| 12 |
from typing import List, Type, Union
|
| 13 |
PandasDataFrame = Type[pd.DataFrame]
|
|
|
|
| 18 |
from funcs.embeddings import make_or_load_embeddings, torch_device
|
| 19 |
from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
|
| 20 |
from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
|
|
|
|
| 21 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
import funcs.anonymiser as anon
|
| 23 |
from umap import UMAP
|
| 24 |
|
|
|
|
| 91 |
output_list = []
|
| 92 |
#file_list = [string.name for string in in_files]
|
| 93 |
|
| 94 |
+
for in_colnames_list_first in in_colnames:
|
| 95 |
|
| 96 |
+
print("Cleaning column:", in_colnames_list_first)
|
|
|
|
|
|
|
| 97 |
|
| 98 |
+
#in_colnames_list_first = in_colnames[0]
|
|
|
|
|
|
|
| 99 |
|
| 100 |
+
# Reset original index to a new column so you can link it to data outputted from cleaning
|
| 101 |
+
if not "original_index" in data.columns:
|
| 102 |
+
data = data.reset_index(names="original_index")
|
| 103 |
|
| 104 |
+
if clean_text == "Yes":
|
| 105 |
+
clean_tic = time.perf_counter()
|
| 106 |
+
print("Starting data clean.")
|
| 107 |
|
| 108 |
+
data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
|
|
|
|
|
|
|
| 109 |
|
| 110 |
+
if '_clean' not in data_file_name_no_ext:
|
| 111 |
+
data_file_name_no_ext = data_file_name_no_ext + "_clean"
|
|
|
|
| 112 |
|
| 113 |
+
clean_toc = time.perf_counter()
|
| 114 |
+
clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
|
| 115 |
+
print(clean_time_out)
|
| 116 |
|
| 117 |
+
# Clean custom regex if exists
|
| 118 |
+
if not custom_regex.empty:
|
| 119 |
+
data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
|
| 120 |
|
| 121 |
+
if '_clean' not in data_file_name_no_ext:
|
| 122 |
+
data_file_name_no_ext = data_file_name_no_ext + "_clean"
|
| 123 |
+
|
| 124 |
|
| 125 |
+
if drop_duplicate_text == "Yes":
|
| 126 |
+
progress(0.3, desc= "Drop duplicates - remove short texts")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
+
data_file_name_no_ext = data_file_name_no_ext + "_dedup"
|
|
|
|
| 129 |
|
| 130 |
+
#print("Removing duplicates and short entries from data")
|
| 131 |
+
#print("Data shape before: ", data.shape)
|
| 132 |
+
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
| 133 |
+
data = data[data[in_colnames_list_first].str.len() >= 50]
|
| 134 |
+
data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
|
| 135 |
+
|
| 136 |
+
#print("Data shape after duplicate/null removal: ", data.shape)
|
| 137 |
|
| 138 |
+
if anonymise_drop == "Yes":
|
| 139 |
+
progress(0.4, desc= "Anonymising data")
|
|
|
|
| 140 |
|
| 141 |
+
if '_anon' not in data_file_name_no_ext:
|
| 142 |
+
data_file_name_no_ext = data_file_name_no_ext + "_anon"
|
| 143 |
|
| 144 |
+
anon_tic = time.perf_counter()
|
| 145 |
+
|
| 146 |
+
data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
|
| 147 |
|
| 148 |
+
data[in_colnames_list_first] = data_anon_col
|
|
|
|
| 149 |
|
| 150 |
+
print(anonymisation_success)
|
| 151 |
|
| 152 |
+
anon_toc = time.perf_counter()
|
| 153 |
+
time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
|
| 154 |
|
| 155 |
+
print(time_out)
|
|
|
|
| 156 |
|
| 157 |
+
if sentence_split_drop == "Yes":
|
| 158 |
+
progress(0.6, desc= "Splitting text into sentences")
|
| 159 |
+
|
| 160 |
+
if '_split' not in data_file_name_no_ext:
|
| 161 |
+
data_file_name_no_ext = data_file_name_no_ext + "_split"
|
| 162 |
+
|
| 163 |
+
anon_tic = time.perf_counter()
|
| 164 |
+
|
| 165 |
+
data = expand_sentences_spacy(data, in_colnames_list_first)
|
| 166 |
+
data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows with at more than 5 characters
|
| 167 |
+
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
| 168 |
+
data.reset_index(inplace=True, drop=True)
|
| 169 |
|
| 170 |
+
anon_toc = time.perf_counter()
|
| 171 |
+
time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
|
| 172 |
|
| 173 |
+
print(time_out)
|
| 174 |
|
| 175 |
+
data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
|
| 176 |
|
| 177 |
out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev + ".csv"
|
| 178 |
data.to_csv(out_data_name)
|
|
|
|
| 298 |
if high_quality_mode == "Yes":
|
| 299 |
print("Using high quality embedding model")
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
#embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
|
| 302 |
|
| 303 |
# If tfidf embeddings currently exist, wipe these empty
|
|
|
|
| 307 |
embeddings_type_state = "large"
|
| 308 |
|
| 309 |
# UMAP model uses Bertopic defaults
|
| 310 |
+
#umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
|
| 311 |
|
| 312 |
else:
|
| 313 |
print("Choosing low resource TF-IDF model.")
|
| 314 |
|
| 315 |
+
# embedding_model = make_pipeline(
|
| 316 |
+
# TfidfVectorizer(),
|
| 317 |
+
# TruncatedSVD(100, random_state=random_seed)
|
| 318 |
+
# )
|
| 319 |
|
| 320 |
# If large embeddings currently exist, wipe these empty, then rename embeddings type
|
| 321 |
if embeddings_type_state == "large":
|
|
|
|
| 324 |
embeddings_type_state = "tfidf"
|
| 325 |
|
| 326 |
#umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
|
| 327 |
+
# UMAP model uses Bertopic defaults
|
| 328 |
+
umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=True, random_state=random_seed)
|
| 329 |
|
| 330 |
+
embeddings_out, embedding_model = make_or_load_embeddings(docs, file_list, embeddings_out, embeddings_super_compress, high_quality_mode, embeddings_name)
|
| 331 |
|
| 332 |
# If you want to save your embedding files
|
| 333 |
if return_intermediate_files == "Yes":
|
requirements.txt
CHANGED
|
@@ -3,11 +3,10 @@ pandas==2.2.3
|
|
| 3 |
plotly==5.24.1
|
| 4 |
scikit-learn==1.5.2
|
| 5 |
umap-learn==0.5.7
|
| 6 |
-
gradio==5.
|
| 7 |
-
boto3==1.35.
|
| 8 |
transformers==4.46.3
|
| 9 |
accelerate==1.1.1
|
| 10 |
-
torch==2.5.1
|
| 11 |
bertopic==0.16.4
|
| 12 |
spacy==3.8.0
|
| 13 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
|
@@ -18,6 +17,9 @@ presidio_analyzer==2.2.355
|
|
| 18 |
presidio_anonymizer==2.2.355
|
| 19 |
scipy
|
| 20 |
polars
|
| 21 |
-
sentence-transformers==3.
|
| 22 |
-
|
| 23 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
plotly==5.24.1
|
| 4 |
scikit-learn==1.5.2
|
| 5 |
umap-learn==0.5.7
|
| 6 |
+
gradio==5.8.0
|
| 7 |
+
boto3==1.35.71
|
| 8 |
transformers==4.46.3
|
| 9 |
accelerate==1.1.1
|
|
|
|
| 10 |
bertopic==0.16.4
|
| 11 |
spacy==3.8.0
|
| 12 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
|
|
|
| 17 |
presidio_anonymizer==2.2.355
|
| 18 |
scipy
|
| 19 |
polars
|
| 20 |
+
sentence-transformers==3.3.1
|
| 21 |
+
torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
|
| 22 |
+
#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
| 23 |
+
# Specify exact llama_cpp wheel for huggingface compatibility
|
| 24 |
+
https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
|
| 25 |
+
numpy==1.26.4
|
requirements_aws.txt
CHANGED
|
@@ -6,7 +6,7 @@ umap-learn==0.5.7
|
|
| 6 |
boto3==1.35.64
|
| 7 |
spacy==3.8.0
|
| 8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
| 9 |
-
gradio==5.
|
| 10 |
pyarrow
|
| 11 |
openpyxl
|
| 12 |
Faker
|
|
|
|
| 6 |
boto3==1.35.64
|
| 7 |
spacy==3.8.0
|
| 8 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
| 9 |
+
gradio==5.8.0
|
| 10 |
pyarrow
|
| 11 |
openpyxl
|
| 12 |
Faker
|
requirements_gpu.txt
CHANGED
|
@@ -18,8 +18,7 @@ presidio_analyzer==2.2.355
|
|
| 18 |
presidio_anonymizer==2.2.355
|
| 19 |
scipy
|
| 20 |
polars
|
| 21 |
-
llama-cpp-python==0.
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
#numpy==1.26.4
|
| 25 |
|
|
|
|
| 18 |
presidio_anonymizer==2.2.355
|
| 19 |
scipy
|
| 20 |
polars
|
| 21 |
+
llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
|
| 22 |
+
sentence-transformers==3.3.1
|
| 23 |
+
numpy==1.26.4
|
|
|
|
| 24 |
|