Topic modeling and code restructuring
Browse files- app.py +30 -8
- src/__init__.py +0 -0
- src/embedding-chunks.py +0 -10
- src/{llm.py → scripts/llm.py} +25 -20
- src/scripts/nlp_processing.py +46 -0
- src/scripts/topic_modeling.py +90 -0
- src/topic-modeling.py +0 -29
- src/utils/constants.py +2 -0
- src/utils/utils.py +30 -0
app.py
CHANGED
|
@@ -1,13 +1,35 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
from
|
| 5 |
-
from cuml.preprocessing import normalize
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
return "Hello " + name + "!!"
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
|
|
|
| 5 |
|
| 6 |
+
from src.scripts.nlp_processing import embed_splitted_docs, split_corpus
|
| 7 |
+
from src.scripts.topic_modeling import topic_modeling
|
| 8 |
+
from src.utils.constants import EMBEDDING_MODEL_NAME
|
| 9 |
+
from src.utils.utils import extract_corpus
|
| 10 |
|
| 11 |
+
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
|
|
|
|
| 12 |
|
| 13 |
+
|
| 14 |
+
def greet(fileobj):
    """Run the full topic-modeling pipeline on an uploaded file.

    Parameters
    ----------
    fileobj : File-like object
        The uploaded file whose text content is analysed.

    Returns
    -------
    tuple of (matplotlib.figure.Figure, pandas.DataFrame)
        The topic datamap figure and the topic-information dataframe.
    """
    # Load the raw text lines from the uploaded file.
    corpus = extract_corpus(fileobj)

    # Chunk the corpus into fixed-size documents.
    docs = split_corpus(corpus)

    # Compute one embedding per chunk with the module-level model.
    doc_embeddings = embed_splitted_docs(docs, embedding_model)

    # Cluster the chunks into topics and build the visualisation.
    fig, topic_df = topic_modeling(docs, doc_embeddings, embedding_model)

    return (fig, topic_df)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Gradio 4.x removed the `gr.outputs` namespace, so `gr.outputs.Image()` /
# `gr.outputs.Dataframe()` raise AttributeError on current Gradio; output
# components are constructed directly. The first value returned by `greet`
# is a matplotlib Figure, which `gr.Image` does not accept — `gr.Plot` is
# the component designed for Figure objects.
demo = gr.Interface(
    fn=greet,
    inputs="file",
    outputs=[gr.Plot(), gr.Dataframe()],
)
demo.launch()
|
src/__init__.py
DELETED
|
File without changes
|
src/embedding-chunks.py
DELETED
|
@@ -1,10 +0,0 @@
|
|
| 1 |
-
import spaces
|
| 2 |
-
from sentence_transformers import SentenceTransformer
|
| 3 |
-
|
| 4 |
-
embedding_model_name = "BAAI/bge-small-en"
|
| 5 |
-
embedding_model = SentenceTransformer(embedding_model_name)
|
| 6 |
-
|
| 7 |
-
@spaces.GPU()
|
| 8 |
-
def embed_splitted_docs(splitted_docs):
|
| 9 |
-
embeddings = embedding_model.encode(splitted_docs, show_progress_bar=True)
|
| 10 |
-
return embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{llm.py → scripts/llm.py}
RENAMED
|
@@ -29,60 +29,65 @@ h3 {
|
|
| 29 |
}
|
| 30 |
"""
|
| 31 |
|
| 32 |
-
device = "cuda"
|
| 33 |
|
| 34 |
quantization_config = BitsAndBytesConfig(
|
| 35 |
load_in_4bit=True,
|
| 36 |
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 37 |
bnb_4bit_use_double_quant=True,
|
| 38 |
-
bnb_4bit_quant_type=
|
|
|
|
| 39 |
|
| 40 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
| 41 |
model = AutoModelForCausalLM.from_pretrained(
|
| 42 |
MODEL,
|
| 43 |
torch_dtype=torch.bfloat16,
|
| 44 |
device_map="auto",
|
| 45 |
-
quantization_config=quantization_config
|
|
|
|
|
|
|
| 46 |
|
| 47 |
@spaces.GPU()
|
| 48 |
def chat(
|
| 49 |
-
message: str,
|
| 50 |
history: list,
|
| 51 |
system_prompt: str,
|
| 52 |
-
temperature: float = 0.8,
|
| 53 |
-
max_new_tokens: int = 1024,
|
| 54 |
-
top_p: float = 1.0,
|
| 55 |
-
top_k: int = 20,
|
| 56 |
penalty: float = 1.2,
|
| 57 |
):
|
| 58 |
-
print(f
|
| 59 |
-
print(f
|
| 60 |
|
| 61 |
# Construct the conversation context
|
| 62 |
-
conversation = [
|
| 63 |
-
{"role": "system", "content": system_prompt}
|
| 64 |
-
]
|
| 65 |
for prompt, answer in history:
|
| 66 |
-
conversation.extend(
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
|
| 71 |
conversation.append({"role": "user", "content": message})
|
| 72 |
|
| 73 |
# Tokenize the conversation input
|
| 74 |
-
input_ids = tokenizer.apply_chat_template(
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# Define the generation parameters
|
| 77 |
generate_kwargs = dict(
|
| 78 |
-
input_ids=input_ids,
|
| 79 |
max_new_tokens=max_new_tokens,
|
| 80 |
do_sample=False if temperature == 0 else True,
|
| 81 |
top_p=top_p,
|
| 82 |
top_k=top_k,
|
| 83 |
temperature=temperature,
|
| 84 |
repetition_penalty=penalty,
|
| 85 |
-
eos_token_id=[128001,128008,128009], # Define the end-of-sequence token
|
| 86 |
)
|
| 87 |
|
| 88 |
# Generate the output
|
|
|
|
| 29 |
}
|
| 30 |
"""
|
| 31 |
|
| 32 |
+
device = "cuda" # for GPU usage or "cpu" for CPU usage
|
| 33 |
|
| 34 |
quantization_config = BitsAndBytesConfig(
|
| 35 |
load_in_4bit=True,
|
| 36 |
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 37 |
bnb_4bit_use_double_quant=True,
|
| 38 |
+
bnb_4bit_quant_type="nf4",
|
| 39 |
+
)
|
| 40 |
|
| 41 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
| 42 |
model = AutoModelForCausalLM.from_pretrained(
|
| 43 |
MODEL,
|
| 44 |
torch_dtype=torch.bfloat16,
|
| 45 |
device_map="auto",
|
| 46 |
+
quantization_config=quantization_config,
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
|
| 50 |
@spaces.GPU()
|
| 51 |
def chat(
|
| 52 |
+
message: str,
|
| 53 |
history: list,
|
| 54 |
system_prompt: str,
|
| 55 |
+
temperature: float = 0.8,
|
| 56 |
+
max_new_tokens: int = 1024,
|
| 57 |
+
top_p: float = 1.0,
|
| 58 |
+
top_k: int = 20,
|
| 59 |
penalty: float = 1.2,
|
| 60 |
):
|
| 61 |
+
print(f"message: {message}")
|
| 62 |
+
print(f"history: {history}")
|
| 63 |
|
| 64 |
# Construct the conversation context
|
| 65 |
+
conversation = [{"role": "system", "content": system_prompt}]
|
|
|
|
|
|
|
| 66 |
for prompt, answer in history:
|
| 67 |
+
conversation.extend(
|
| 68 |
+
[
|
| 69 |
+
{"role": "user", "content": prompt},
|
| 70 |
+
{"role": "assistant", "content": answer},
|
| 71 |
+
]
|
| 72 |
+
)
|
| 73 |
|
| 74 |
conversation.append({"role": "user", "content": message})
|
| 75 |
|
| 76 |
# Tokenize the conversation input
|
| 77 |
+
input_ids = tokenizer.apply_chat_template(
|
| 78 |
+
conversation, add_generation_prompt=True, return_tensors="pt"
|
| 79 |
+
).to(model.device)
|
| 80 |
|
| 81 |
# Define the generation parameters
|
| 82 |
generate_kwargs = dict(
|
| 83 |
+
input_ids=input_ids,
|
| 84 |
max_new_tokens=max_new_tokens,
|
| 85 |
do_sample=False if temperature == 0 else True,
|
| 86 |
top_p=top_p,
|
| 87 |
top_k=top_k,
|
| 88 |
temperature=temperature,
|
| 89 |
repetition_penalty=penalty,
|
| 90 |
+
eos_token_id=[128001, 128008, 128009], # Define the end-of-sequence token
|
| 91 |
)
|
| 92 |
|
| 93 |
# Generate the output
|
src/scripts/nlp_processing.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
+
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@spaces.GPU()
def embed_splitted_docs(splitted_docs, embedding_model):
    """
    Compute dense vector embeddings for a list of document chunks.

    Parameters
    ----------
    splitted_docs : List of str
        The document chunks to embed.
    embedding_model : SentenceTransformer
        The model whose ``encode`` method produces the embeddings.

    Returns
    -------
    List of numpy.ndarray
        One embedding vector per input chunk.
    """
    # Delegate directly to the model; the progress bar gives feedback on
    # long corpora.
    return embedding_model.encode(splitted_docs, show_progress_bar=True)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def split_corpus(corpus, chunk_size=1000, chunk_overlap=0):
    """
    Split a given corpus into chunks of a given size.

    Parameters
    ----------
    corpus : List of str
        The corpus to be split.
    chunk_size : int, default=1000
        The maximum size (in characters) of each chunk.
    chunk_overlap : int, default=0
        Number of characters shared between consecutive chunks. The
        default of 0 preserves the previous non-overlapping behaviour.

    Returns
    -------
    List of str
        The list of chunks (splitted documents) from the corpus.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
    )
    # `create_documents` returns Document objects; keep only the raw text.
    return [doc.page_content for doc in splitter.create_documents(corpus)]
|
src/scripts/topic_modeling.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import numpy as np
|
| 5 |
+
import spaces
|
| 6 |
+
from bertopic import BERTopic
|
| 7 |
+
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
|
| 8 |
+
from cuml.cluster import HDBSCAN
|
| 9 |
+
from cuml.manifold import UMAP
|
| 10 |
+
from cuml.preprocessing import normalize
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@spaces.GPU()
def topic_modeling(
    docs,
    embeddings,
    embedding_model,
    n_gram_range=(3, 6),
    mmr_diversity=1,
    mmr_top_n_words=30,
    keybert_top_n_words=50,
    random_state=42,
    min_cluster_size=15,
):
    """
    Perform topic modeling on a list of documents and their embeddings.

    Parameters
    ----------
    docs : List of str
        The documents to be topic modeled.
    embeddings : List of numpy.ndarray
        Precomputed embeddings, one per document.
    embedding_model : SentenceTransformer
        The embedding model used to generate the embeddings.
    n_gram_range : Tuple of int, optional
        The range of n-grams to consider. Defaults to (3, 6).
    mmr_diversity : float, optional
        Diversity value of the MMR representation. Defaults to 1.
    mmr_top_n_words : int, optional
        Number of top words for the MMR representation. Defaults to 30.
    keybert_top_n_words : int, optional
        Number of top words for the KeyBERT representation. Defaults to 50.
    random_state : int, optional
        Random seed for reproducibility. Defaults to 42.
    min_cluster_size : int, optional
        Minimum cluster size for a topic. Defaults to 15.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The datamap visualisation of the topics.
    topic_info_df : pandas.DataFrame
        The topic information dataframe.
    """
    # Dimensionality reduction applied to the embeddings before clustering.
    reducer = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        low_memory=False,
        random_state=random_state,
    )

    # Density-based clustering of the reduced embeddings.
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    # Two-stage topic representation: KeyBERT-style keywords refined by MMR.
    representations = [
        KeyBERTInspired(top_n_words=keybert_top_n_words, random_state=random_state),
        MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=mmr_top_n_words),
    ]

    model = BERTopic(
        embedding_model=embedding_model,
        representation_model=representations,
        n_gram_range=n_gram_range,
        hdbscan_model=clusterer,
        umap_model=reducer,
        verbose=True,
    ).fit(docs, embeddings=embeddings)

    # NOTE(review): visualize_document_datamap needs the optional
    # `datamapplot` package — confirm it is installed in the deployment.
    fig = model.visualize_document_datamap(docs=docs)

    return fig, model.get_topic_info()
|
src/topic-modeling.py
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
|
| 3 |
-
import matplotlib.pyplot as plt
|
| 4 |
-
import numpy as np
|
| 5 |
-
from bertopic import BERTopic
|
| 6 |
-
from bertopic.representation import KeyBERTInspired, LangChain, MaximalMarginalRelevance
|
| 7 |
-
from cuml.cluster import HDBSCAN
|
| 8 |
-
from cuml.manifold import UMAP
|
| 9 |
-
from cuml.preprocessing import normalize
|
| 10 |
-
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
|
| 11 |
-
|
| 12 |
-
n_gram_range = (3,6)
|
| 13 |
-
mmr_diversity = 1
|
| 14 |
-
mmr_top_n_words = 30
|
| 15 |
-
keybert_top_n_words = 50
|
| 16 |
-
random_state = 42
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
representation_model = [
|
| 20 |
-
KeyBERTInspired(top_n_words=keybert_top_n_words, random_state=random_state),
|
| 21 |
-
MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words = mmr_top_n_words),
|
| 22 |
-
]
|
| 23 |
-
|
| 24 |
-
chunk_size = 1000
|
| 25 |
-
|
| 26 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, add_start_index=True)
|
| 27 |
-
splitted_docs = splitter.create_documents(corpus)
|
| 28 |
-
splitted_docs = list(map(lambda x: x.page_content, splitted_docs))
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/utils/constants.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import tempfile

# Directory used for temporary copies of uploaded files. A portable
# replacement for the previous hard-coded "/home/ubuntu/temps/" path,
# which only existed on one specific machine. The trailing separator is
# kept because callers build paths by string concatenation (`TEMP_FOLDER + name`).
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), "topic_modeling_uploads") + os.sep

# Sentence-transformers model used to embed document chunks.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"
|
src/utils/utils.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
|
| 4 |
+
from src.utils.constants import TEMP_FOLDER
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def extract_corpus(fileobj):
    """
    Reads a file object and returns its contents as a list of strings.

    Copies the file to a temporary location on disk, then reads it line by
    line into a list.

    Parameters
    ----------
    fileobj : File-like object or str
        The uploaded file to read. Gradio passes a path-like object that
        exposes the original path via a ``name`` attribute; a plain path
        string is also accepted.

    Returns
    -------
    lines : List of str
        The contents of the file as a list of strings, one per line
        (newline characters preserved, as with ``readlines``).
    """
    # The original code relied on Gradio's NamedString being both a str
    # (for basename) and having a .name attribute (for copyfile); make the
    # source path explicit so plain strings also work.
    src_path = getattr(fileobj, "name", fileobj)

    # Ensure the temp directory exists before copying into it — otherwise
    # the first run on a fresh machine fails with FileNotFoundError.
    os.makedirs(TEMP_FOLDER, exist_ok=True)

    path = TEMP_FOLDER + os.path.basename(src_path)
    shutil.copyfile(src_path, path)

    with open(path, "r") as f:
        lines = f.readlines()

    return lines
|