Spaces:

seanpedrickcase
/

topic_modelling

Running

App Files Files Community

seanpedrickcase committed on Dec 12, 2024

Commit

cc495e1

1 Parent(s): 49e0db8

Rearranged the embedding-creation functions to be compatible with a Hugging Face ZeroGPU Space. Updated packages.

Browse files

Files changed (8) hide show

README.md +1 -1
app.py +1 -1
funcs/clean_funcs.py +10 -4
funcs/embeddings.py +36 -7
funcs/topic_core_funcs.py +68 -90
requirements.txt +8 -6
requirements_aws.txt +1 -1
requirements_gpu.txt +3 -4

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🚀
 colorFrom: red
 colorTo: yellow
 sdk: gradio
-sdk_version: 5.6.0
 app_file: app.py
 pinned: true
 license: apache-2.0

 colorFrom: red
 colorTo: yellow
 sdk: gradio
+sdk_version: 5.8.0
 app_file: app.py
 pinned: true
 license: apache-2.0

app.py CHANGED Viewed

@@ -76,7 +76,7 @@ with app:
         with gr.Accordion("Clean data", open = False):
             with gr.Row():
-                clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, multiple digits, emails, postcodes (UK).")
                 drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
                 anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
                 #with gr.Row():

         with gr.Accordion("Clean data", open = False):
             with gr.Row():
+                clean_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove html, URLs, non-ASCII, large numbers, emails, postcodes (UK).")
                 drop_duplicate_text = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Remove duplicate text, drop < 50 character strings.")
                 anonymise_drop = gr.Dropdown(value = "No", choices=["Yes", "No"], multiselect=False, label="Redact personal information - not 100% effective and slow!")
                 #with gr.Row():

funcs/clean_funcs.py CHANGED Viewed

@@ -2,6 +2,7 @@ import re
 import string
 import unicodedata
 import polars as pl
 import gradio as gr
 # Adding custom words to the stopwords
@@ -15,15 +16,18 @@ html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
 non_ascii_pattern = r'[^\x00-\x7F]+'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
-nums_two_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b' # Should match two digit numbers or more, and also if there are full stops or commas in between
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
 multiple_new_lines_regex = r'(\r\n|\n)+'
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
     for text in texts:
-        if not text:
             text = ""
         # Normalize unicode characters to decompose any special forms
@@ -53,10 +57,12 @@ def initial_clean(texts, custom_regex, progress=gr.Progress()):
         (html_start_pattern_end_dots_regex, ' '),
         (non_ascii_pattern, ' '),
         (email_pattern_regex, ' '),
-        (nums_two_more_regex, ' '),
         (postcode_pattern_regex, ' '),
         (multiple_spaces_regex, ' '),
-        (r"(\p{P})\p{P}+", "${1}")
     ]
     # Apply each regex replacement

 import string
 import unicodedata
 import polars as pl
+import pandas as pd
 import gradio as gr
 # Adding custom words to the stopwords
 non_ascii_pattern = r'[^\x00-\x7F]+'
 email_pattern_regex = r'\S*@\S*\s?'
 num_pattern_regex = r'[0-9]+'
+and_sign_regex = r'&'
+forward_slash_regex = r'/'
+nums_five_more_regex = r'\b\d+[\.|\,]\d+\b|\b[0-9]{5,}\b|\b[0-9]+\s[0-9]+\b' # Should match five digit numbers or more, and also if there are full stops or commas in between
 postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
 multiple_spaces_regex = r'\s{2,}'
 multiple_new_lines_regex = r'(\r\n|\n)+'
+multiple_punctuation_regex = r"(\p{P})\p{P}+"
 def initial_clean(texts, custom_regex, progress=gr.Progress()):
     for text in texts:
+        if not text or pd.isnull(text):
             text = ""
         # Normalize unicode characters to decompose any special forms
         (html_start_pattern_end_dots_regex, ' '),
         (non_ascii_pattern, ' '),
         (email_pattern_regex, ' '),
+        (nums_five_more_regex, ' '),
         (postcode_pattern_regex, ' '),
         (multiple_spaces_regex, ' '),
+        (multiple_punctuation_regex, "${1}"),
+        (and_sign_regex, 'and')#,
+        #(forward_slash_regex, 'or')
     ]
     # Apply each regex replacement

funcs/embeddings.py CHANGED Viewed

@@ -1,7 +1,12 @@
 import time
 import numpy as np
 import os
 from torch import cuda, backends, version
 # Check for torch cuda
 # If you want to disable cuda for testing purposes
@@ -18,11 +23,9 @@ else:
     torch_device =  "cpu"
     high_quality_mode = "No"
-print("Device used is: ", torch_device)
-def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embedding_model, embeddings_super_compress: str, high_quality_mode_opt: str) -> np.ndarray:
     """
     Create or load embeddings for the given documents.
@@ -30,7 +33,6 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
         docs (list): List of documents to embed.
         file_list (list): List of file names to check for existing embeddings.
         embeddings_out (np.ndarray): Array to store the embeddings.
-        embedding_model: Model used to generate embeddings.
         embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
         high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
@@ -38,6 +40,33 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
         np.ndarray: The generated or loaded embeddings.
     """
     # If no embeddings found, make or load in
     if embeddings_out.size == 0:
         print("Embeddings not found. Loading or generating new ones.")
@@ -84,9 +113,9 @@ def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndar
                 embeddings_out = np.round(embeddings_out, 3)
                 embeddings_out *= 100
-        return embeddings_out
     else:
         print("Found pre-loaded embeddings.")
-        return embeddings_out

 import time
 import numpy as np
 import os
+import spaces
 from torch import cuda, backends, version
+from sentence_transformers import SentenceTransformer
+from sklearn.pipeline import make_pipeline
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
 # Check for torch cuda
 # If you want to disable cuda for testing purposes
     torch_device =  "cpu"
     high_quality_mode = "No"
+@spaces.GPU
+def make_or_load_embeddings(docs: list, file_list: list, embeddings_out: np.ndarray, embeddings_super_compress: str, high_quality_mode_opt: str, embeddings_name:str="mixedbread-ai/mxbai-embed-xsmall-v1") -> np.ndarray:
     """
     Create or load embeddings for the given documents.
         docs (list): List of documents to embed.
         file_list (list): List of file names to check for existing embeddings.
         embeddings_out (np.ndarray): Array to store the embeddings.
         embeddings_super_compress (str): Option to super compress embeddings ("Yes" or "No").
         high_quality_mode_opt (str): Option for high quality mode ("Yes" or "No").
         np.ndarray: The generated or loaded embeddings.
     """
+    if high_quality_mode_opt == "Yes":
+    # Define a list of possible local locations to search for the model
+        local_embeddings_locations = [
+            "model/embed/", # Potential local location
+            "/model/embed/", # Potential location in Docker container
+            "/home/user/app/model/embed/" # This is inside a Docker container
+        ]
+        # Attempt to load the model from each local location
+        for location in local_embeddings_locations:
+            try:
+                embedding_model = SentenceTransformer(location)#, truncate_dim=512)
+                print(f"Found local model installation at: {location}")
+                break  # Exit the loop if the model is found
+            except Exception as e:
+                print(f"Failed to load model from {location}: {e}")
+                continue
+        else:
+            # If the loop completes without finding the model in any local location
+            embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
+            print("Could not find local model installation. Downloading from Huggingface")
+    else:
+        embedding_model = make_pipeline(
+                TfidfVectorizer(),
+                TruncatedSVD(100, random_state=random_seed)
+                )
     # If no embeddings found, make or load in
     if embeddings_out.size == 0:
         print("Embeddings not found. Loading or generating new ones.")
                 embeddings_out = np.round(embeddings_out, 3)
                 embeddings_out *= 100
+        return embeddings_out, embedding_model
     else:
         print("Found pre-loaded embeddings.")
+        return embeddings_out, embedding_model

funcs/topic_core_funcs.py CHANGED Viewed

@@ -7,6 +7,7 @@ import pandas as pd
 import numpy as np
 import time
 from bertopic import BERTopic
 from typing import List, Type, Union
 PandasDataFrame = Type[pd.DataFrame]
@@ -17,13 +18,7 @@ from funcs.helper_functions import read_file, zip_folder, delete_files_in_folder
 from funcs.embeddings import make_or_load_embeddings, torch_device
 from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
 from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
 from sklearn.feature_extraction.text import CountVectorizer
-from sentence_transformers import SentenceTransformer
-from sklearn.pipeline import make_pipeline
-from sklearn.decomposition import TruncatedSVD
-from sklearn.feature_extraction.text import TfidfVectorizer
 import funcs.anonymiser as anon
 from umap import UMAP
@@ -96,84 +91,88 @@ def pre_clean(data: pd.DataFrame, in_colnames: list, data_file_name_no_ext: str,
     output_list = []
     #file_list = [string.name for string in in_files]
-    in_colnames_list_first = in_colnames[0]
-    # Reset original index to a new column so you can link it to data outputted from cleaning
-    if not "original_index" in data.columns:
-        data = data.reset_index(names="original_index")
-    if clean_text == "Yes":
-        clean_tic = time.perf_counter()
-        print("Starting data clean.")
-        data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
-        if '_clean' not in data_file_name_no_ext:
-            data_file_name_no_ext = data_file_name_no_ext + "_clean"
-        clean_toc = time.perf_counter()
-        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
-        print(clean_time_out)
-    # Clean custom regex if exists
-    if not custom_regex.empty:
-        data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
-        if '_clean' not in data_file_name_no_ext:
-            data_file_name_no_ext = data_file_name_no_ext + "_clean"
-    if drop_duplicate_text == "Yes":
-        progress(0.3, desc= "Drop duplicates - remove short texts")
-        data_file_name_no_ext = data_file_name_no_ext + "_dedup"
-        #print("Removing duplicates and short entries from data")
-        #print("Data shape before: ", data.shape)
-        data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
-        data = data[data[in_colnames_list_first].str.len() >= 50]
-        data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
-        #print("Data shape after duplicate/null removal: ", data.shape)
-    if anonymise_drop == "Yes":
-        progress(0.4, desc= "Anonymising data")
-        if '_anon' not in data_file_name_no_ext:
-            data_file_name_no_ext = data_file_name_no_ext + "_anon"
-        anon_tic = time.perf_counter()
-        data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
-        data[in_colnames_list_first] = data_anon_col
-        print(anonymisation_success)
-        anon_toc = time.perf_counter()
-        time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
-        print(time_out)
-    if sentence_split_drop == "Yes":
-        progress(0.6, desc= "Splitting text into sentences")
-        if '_split' not in data_file_name_no_ext:
-            data_file_name_no_ext = data_file_name_no_ext + "_split"
-        anon_tic = time.perf_counter()
-        data = expand_sentences_spacy(data, in_colnames_list_first)
-        data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows with at more than 5 characters
-        data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
-        data.reset_index(inplace=True, drop=True)
-        anon_toc = time.perf_counter()
-        time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
-        print(time_out)
-        data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
     out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev +  ".csv"
     data.to_csv(out_data_name)
@@ -299,27 +298,6 @@ def extract_topics(
     if high_quality_mode == "Yes":
         print("Using high quality embedding model")
-        # Define a list of possible local locations to search for the model
-        local_embeddings_locations = [
-            "model/embed/", # Potential local location
-            "/model/embed/", # Potential location in Docker container
-            "/home/user/app/model/embed/" # This is inside a Docker container
-        ]
-        # Attempt to load the model from each local location
-        for location in local_embeddings_locations:
-            try:
-                embedding_model = SentenceTransformer(location)#, truncate_dim=512)
-                print(f"Found local model installation at: {location}")
-                break  # Exit the loop if the model is found
-            except Exception as e:
-                print(f"Failed to load model from {location}: {e}")
-                continue
-        else:
-            # If the loop completes without finding the model in any local location
-            embedding_model = SentenceTransformer(embeddings_name)#, truncate_dim=512)
-            print("Could not find local model installation. Downloading from Huggingface")
         #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
         # If tfidf embeddings currently exist, wipe these empty
@@ -329,15 +307,15 @@ def extract_topics(
         embeddings_type_state = "large"
         # UMAP model uses Bertopic defaults
-        umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
     else:
         print("Choosing low resource TF-IDF model.")
-        embedding_model = make_pipeline(
-                TfidfVectorizer(),
-                TruncatedSVD(100, random_state=random_seed)
-                )
         # If large embeddings currently exist, wipe these empty, then rename embeddings type
         if embeddings_type_state == "large":
@@ -346,10 +324,10 @@ def extract_topics(
         embeddings_type_state = "tfidf"
         #umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
-        # UMAP model uses Bertopic defaults
-        umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=True, random_state=random_seed)
-    embeddings_out = make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, high_quality_mode)
      # If you want to save your embedding files
     if return_intermediate_files == "Yes":

 import numpy as np
 import time
 from bertopic import BERTopic
+import spaces
 from typing import List, Type, Union
 PandasDataFrame = Type[pd.DataFrame]
 from funcs.embeddings import make_or_load_embeddings, torch_device
 from funcs.bertopic_vis_documents import visualize_documents_custom, visualize_hierarchical_documents_custom, hierarchical_topics_custom, visualize_hierarchy_custom
 from funcs.representation_model import create_representation_model, llm_config, chosen_start_tag, random_seed, RUNNING_ON_AWS
 from sklearn.feature_extraction.text import CountVectorizer
 import funcs.anonymiser as anon
 from umap import UMAP
     output_list = []
     #file_list = [string.name for string in in_files]
+    for in_colnames_list_first in in_colnames:
+        print("Cleaning column:", in_colnames_list_first)
+        #in_colnames_list_first = in_colnames[0]
+        # Reset original index to a new column so you can link it to data outputted from cleaning
+        if not "original_index" in data.columns:
+            data = data.reset_index(names="original_index")
+        if clean_text == "Yes":
+            clean_tic = time.perf_counter()
+            print("Starting data clean.")
+            data[in_colnames_list_first] = initial_clean(data[in_colnames_list_first], [])
+            if '_clean' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_clean"
+            clean_toc = time.perf_counter()
+            clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
+            print(clean_time_out)
+        # Clean custom regex if exists
+        if not custom_regex.empty:
+            data[in_colnames_list_first] = regex_clean(data[in_colnames_list_first], custom_regex.iloc[:, 0].to_list())
+            if '_clean' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_clean"
+        if drop_duplicate_text == "Yes":
+            progress(0.3, desc= "Drop duplicates - remove short texts")
+            data_file_name_no_ext = data_file_name_no_ext + "_dedup"
+            #print("Removing duplicates and short entries from data")
+            #print("Data shape before: ", data.shape)
+            data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
+            data = data[data[in_colnames_list_first].str.len() >= 50]
+            data = data.drop_duplicates(subset = in_colnames_list_first).dropna(subset= in_colnames_list_first).reset_index()
+            #print("Data shape after duplicate/null removal: ", data.shape)
+        if anonymise_drop == "Yes":
+            progress(0.4, desc= "Anonymising data")
+            if '_anon' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_anon"
+            anon_tic = time.perf_counter()
+            data_anon_col, anonymisation_success = anon.anonymise_script(data, in_colnames_list_first, anon_strat="redact")
+            data[in_colnames_list_first] = data_anon_col
+            print(anonymisation_success)
+            anon_toc = time.perf_counter()
+            time_out = f"Anonymising text took {anon_toc - anon_tic:0.1f} seconds"
+            print(time_out)
+        if sentence_split_drop == "Yes":
+            progress(0.6, desc= "Splitting text into sentences")
+            if '_split' not in data_file_name_no_ext:
+                data_file_name_no_ext = data_file_name_no_ext + "_split"
+            anon_tic = time.perf_counter()
+            data = expand_sentences_spacy(data, in_colnames_list_first)
+            data = data[data[in_colnames_list_first].str.len() > min_sentence_length] # Keep only rows with at more than 5 characters
+            data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
+            data.reset_index(inplace=True, drop=True)
+            anon_toc = time.perf_counter()
+            time_out = f"Splitting text took {anon_toc - anon_tic:0.1f} seconds"
+            print(time_out)
+            data[in_colnames_list_first] = data[in_colnames_list_first].str.strip()
     out_data_name = output_folder + data_file_name_no_ext + "_" + today_rev +  ".csv"
     data.to_csv(out_data_name)
     if high_quality_mode == "Yes":
         print("Using high quality embedding model")
         #embedding_model = SentenceTransformer(embeddings_name, truncate_dim=512)
         # If tfidf embeddings currently exist, wipe these empty
         embeddings_type_state = "large"
         # UMAP model uses Bertopic defaults
+        #umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=False, random_state=random_seed)
     else:
         print("Choosing low resource TF-IDF model.")
+        # embedding_model = make_pipeline(
+        #         TfidfVectorizer(),
+        #         TruncatedSVD(100, random_state=random_seed)
+        #         )
         # If large embeddings currently exist, wipe these empty, then rename embeddings type
         if embeddings_type_state == "large":
         embeddings_type_state = "tfidf"
         #umap_model = TruncatedSVD(n_components=5, random_state=random_seed)
+    # UMAP model uses Bertopic defaults
+    umap_model = UMAP(n_neighbors=umap_n_neighbours, n_components=5, min_dist=umap_min_dist, metric=umap_metric, low_memory=True, random_state=random_seed)
+    embeddings_out, embedding_model = make_or_load_embeddings(docs, file_list, embeddings_out, embeddings_super_compress, high_quality_mode, embeddings_name)
      # If you want to save your embedding files
     if return_intermediate_files == "Yes":

requirements.txt CHANGED Viewed

@@ -3,11 +3,10 @@ pandas==2.2.3
 plotly==5.24.1
 scikit-learn==1.5.2
 umap-learn==0.5.7
-gradio==5.6.0
-boto3==1.35.64
 transformers==4.46.3
 accelerate==1.1.1
-torch==2.5.1
 bertopic==0.16.4
 spacy==3.8.0
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
@@ -18,6 +17,9 @@ presidio_analyzer==2.2.355
 presidio_anonymizer==2.2.355
 scipy
 polars
-sentence-transformers==3.2.0
-llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-#numpy==1.26.4

 plotly==5.24.1
 scikit-learn==1.5.2
 umap-learn==0.5.7
+gradio==5.8.0
+boto3==1.35.71
 transformers==4.46.3
 accelerate==1.1.1
 bertopic==0.16.4
 spacy==3.8.0
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
 presidio_anonymizer==2.2.355
 scipy
 polars
+sentence-transformers==3.3.1
+torch==2.4.1 --extra-index-url https://download.pytorch.org/whl/cu121
+#llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+# Specify exact llama_cpp wheel for huggingface compatibility
+https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu121/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
+numpy==1.26.4

requirements_aws.txt CHANGED Viewed

@@ -6,7 +6,7 @@ umap-learn==0.5.7
 boto3==1.35.64
 spacy==3.8.0
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.6.0
 pyarrow
 openpyxl
 Faker

 boto3==1.35.64
 spacy==3.8.0
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==5.8.0
 pyarrow
 openpyxl
 Faker

requirements_gpu.txt CHANGED Viewed

@@ -18,8 +18,7 @@ presidio_analyzer==2.2.355
 presidio_anonymizer==2.2.355
 scipy
 polars
-llama-cpp-python==0.3.2 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
-torch --index-url https://download.pytorch.org/whl/cu121
-sentence-transformers==3.2.0
-#numpy==1.26.4

 presidio_anonymizer==2.2.355
 scipy
 polars
+llama-cpp-python==0.2.90 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+sentence-transformers==3.3.1
+numpy==1.26.4