Spaces:
Runtime error
Runtime error
Enable spaces
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
|
| 2 |
import requests
|
| 3 |
import logging
|
| 4 |
import duckdb
|
|
@@ -57,7 +57,7 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
| 57 |
offload_folder="offload", # Offloading part of the model to CPU to save GPU memory
|
| 58 |
)
|
| 59 |
|
| 60 |
-
# Enable gradient checkpointing for memory efficiency during backprop
|
| 61 |
model.gradient_checkpointing_enable()
|
| 62 |
|
| 63 |
generator = pipeline(
|
|
@@ -122,13 +122,13 @@ def get_docs_from_parquet(parquet_urls, column, offset, limit):
|
|
| 122 |
return df[column].tolist()
|
| 123 |
|
| 124 |
|
| 125 |
-
|
| 126 |
# TODO: Modify batch size to reduce memory consumption during embedding calculation, which value is better?
|
| 127 |
def calculate_embeddings(docs):
|
| 128 |
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
| 129 |
|
| 130 |
|
| 131 |
-
|
| 132 |
def fit_model(base_model, docs, embeddings):
|
| 133 |
new_model = BERTopic(
|
| 134 |
"english",
|
|
@@ -195,12 +195,11 @@ def generate_topics(dataset, config, split, column, nested_column):
|
|
| 195 |
all_docs.extend(docs)
|
| 196 |
|
| 197 |
topics_info = base_model.get_topic_info()
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
topic_plot = base_model.visualize_barchart()
|
| 204 |
|
| 205 |
logging.info(f"Topics: {repr_model_topics}")
|
| 206 |
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
import requests
|
| 3 |
import logging
|
| 4 |
import duckdb
|
|
|
|
| 57 |
offload_folder="offload", # Offloading part of the model to CPU to save GPU memory
|
| 58 |
)
|
| 59 |
|
| 60 |
+
# Enable gradient checkpointing for memory efficiency during backprop?
|
| 61 |
model.gradient_checkpointing_enable()
|
| 62 |
|
| 63 |
generator = pipeline(
|
|
|
|
| 122 |
return df[column].tolist()
|
| 123 |
|
| 124 |
|
| 125 |
+
@spaces.GPU
|
| 126 |
# TODO: Modify batch size to reduce memory consumption during embedding calculation, which value is better?
|
| 127 |
def calculate_embeddings(docs):
|
| 128 |
return sentence_model.encode(docs, show_progress_bar=True, batch_size=32)
|
| 129 |
|
| 130 |
|
| 131 |
+
@spaces.GPU
|
| 132 |
def fit_model(base_model, docs, embeddings):
|
| 133 |
new_model = BERTopic(
|
| 134 |
"english",
|
|
|
|
| 195 |
all_docs.extend(docs)
|
| 196 |
|
| 197 |
topics_info = base_model.get_topic_info()
|
| 198 |
+
topic_plot = base_model.visualize_documents(
|
| 199 |
+
all_docs,
|
| 200 |
+
reduced_embeddings=np.vstack(reduced_embeddings_list),
|
| 201 |
+
custom_labels=True,
|
| 202 |
+
)
|
|
|
|
| 203 |
|
| 204 |
logging.info(f"Topics: {repr_model_topics}")
|
| 205 |
|