Topic modeling and code restructuring
Browse files- app.py +30 -8
- src/__init__.py +0 -0
- src/embedding-chunks.py +0 -10
- src/{llm.py → scripts/llm.py} +25 -20
- src/scripts/nlp_processing.py +46 -0
- src/scripts/topic_modeling.py +90 -0
- src/topic-modeling.py +0 -29
- src/utils/constants.py +2 -0
- src/utils/utils.py +30 -0
app.py
CHANGED
|
@@ -1,13 +1,35 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
from
|
| 5 |
-
from cuml.preprocessing import normalize
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
return "Hello " + name + "!!"
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
import matplotlib.pyplot as plt
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sentence_transformers import SentenceTransformer
|
|
|
|
| 5 |
|
| 6 |
+
from src.scripts.nlp_processing import embed_splitted_docs, split_corpus
|
| 7 |
+
from src.scripts.topic_modeling import topic_modeling
|
| 8 |
+
from src.utils.constants import EMBEDDING_MODEL_NAME
|
| 9 |
+
from src.utils.utils import extract_corpus
|
| 10 |
|
| 11 |
+
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
|
|
|
|
| 12 |
|
| 13 |
+
|
| 14 |
+
def greet(fileobj):
    """Run the full topic-modeling pipeline on an uploaded file.

    Parameters
    ----------
    fileobj : File-like object
        The uploaded file whose text content is analysed.

    Returns
    -------
    tuple of (matplotlib.figure.Figure, pandas.DataFrame)
        The topic datamap figure and the topic-information dataframe.
    """
    # Load the raw text lines from the uploaded file.
    corpus = extract_corpus(fileobj)

    # Chunk the corpus into fixed-size documents.
    docs = split_corpus(corpus)

    # Compute one embedding per chunk with the module-level model.
    doc_embeddings = embed_splitted_docs(docs, embedding_model)

    # Cluster the chunks into topics and build the visualisation.
    fig, topic_df = topic_modeling(docs, doc_embeddings, embedding_model)

    return (fig, topic_df)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Gradio 4.x removed the `gr.outputs` namespace, so `gr.outputs.Image()` /
# `gr.outputs.Dataframe()` raise AttributeError on current Gradio; output
# components are constructed directly. The first value returned by `greet`
# is a matplotlib Figure, which `gr.Image` does not accept — `gr.Plot` is
# the component designed for Figure objects.
demo = gr.Interface(
    fn=greet,
    inputs="file",
    outputs=[gr.Plot(), gr.Dataframe()],
)
demo.launch()
|
src/__init__.py
DELETED
|
File without changes
|
src/embedding-chunks.py
DELETED
|
@@ -1,10 +0,0 @@
|
|
| 1 |
-
import spaces
|
| 2 |
-
from sentence_transformers import SentenceTransformer
|
| 3 |
-
|
| 4 |
-
embedding_model_name = "BAAI/bge-small-en"
|
| 5 |
-
embedding_model = SentenceTransformer(embedding_model_name)
|
| 6 |
-
|
| 7 |
-
@spaces.GPU()
|
| 8 |
-
def embed_splitted_docs(splitted_docs):
|
| 9 |
-
embeddings = embedding_model.encode(splitted_docs, show_progress_bar=True)
|
| 10 |
-
return embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{llm.py → scripts/llm.py}
RENAMED
|
@@ -29,60 +29,65 @@ h3 {
|
|
| 29 |
}
|
| 30 |
"""
|
| 31 |
|
| 32 |
-
device = "cuda"
|
| 33 |
|
| 34 |
quantization_config = BitsAndBytesConfig(
|
| 35 |
load_in_4bit=True,
|
| 36 |
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 37 |
bnb_4bit_use_double_quant=True,
|
| 38 |
-
bnb_4bit_quant_type=
|
|
|
|
| 39 |
|
| 40 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
| 41 |
model = AutoModelForCausalLM.from_pretrained(
|
| 42 |
MODEL,
|
| 43 |
torch_dtype=torch.bfloat16,
|
| 44 |
device_map="auto",
|
| 45 |
-
quantization_config=quantization_config
|
|
|
|
|
|
|
| 46 |
|
| 47 |
@spaces.GPU()
|
| 48 |
def chat(
|
| 49 |
-
message: str,
|
| 50 |
history: list,
|
| 51 |
system_prompt: str,
|
| 52 |
-
temperature: float = 0.8,
|
| 53 |
-
max_new_tokens: int = 1024,
|
| 54 |
-
top_p: float = 1.0,
|
| 55 |
-
top_k: int = 20,
|
| 56 |
penalty: float = 1.2,
|
| 57 |
):
|
| 58 |
-
print(f
|
| 59 |
-
print(f
|
| 60 |
|
| 61 |
# Construct the conversation context
|
| 62 |
-
conversation = [
|
| 63 |
-
{"role": "system", "content": system_prompt}
|
| 64 |
-
]
|
| 65 |
for prompt, answer in history:
|
| 66 |
-
conversation.extend(
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
| 70 |
|
| 71 |
conversation.append({"role": "user", "content": message})
|
| 72 |
|
| 73 |
# Tokenize the conversation input
|
| 74 |
-
input_ids = tokenizer.apply_chat_template(
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# Define the generation parameters
|
| 77 |
generate_kwargs = dict(
|
| 78 |
-
input_ids=input_ids,
|
| 79 |
max_new_tokens=max_new_tokens,
|
| 80 |
do_sample=False if temperature == 0 else True,
|
| 81 |
top_p=top_p,
|
| 82 |
top_k=top_k,
|
| 83 |
temperature=temperature,
|
| 84 |
repetition_penalty=penalty,
|
| 85 |
-
eos_token_id=[128001,128008,128009], # Define the end-of-sequence token
|
| 86 |
)
|
| 87 |
|
| 88 |
# Generate the output
|
|
|
|
| 29 |
}
|
| 30 |
"""
|
| 31 |
|
| 32 |
+
device = "cuda" # for GPU usage or "cpu" for CPU usage
|
| 33 |
|
| 34 |
quantization_config = BitsAndBytesConfig(
|
| 35 |
load_in_4bit=True,
|
| 36 |
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 37 |
bnb_4bit_use_double_quant=True,
|
| 38 |
+
bnb_4bit_quant_type="nf4",
|
| 39 |
+
)
|
| 40 |
|
| 41 |
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
| 42 |
model = AutoModelForCausalLM.from_pretrained(
|
| 43 |
MODEL,
|
| 44 |
torch_dtype=torch.bfloat16,
|
| 45 |
device_map="auto",
|
| 46 |
+
quantization_config=quantization_config,
|
| 47 |
+
)
|
| 48 |
+
|
| 49 |
|
| 50 |
@spaces.GPU()
|
| 51 |
def chat(
|
| 52 |
+
message: str,
|
| 53 |
history: list,
|
| 54 |
system_prompt: str,
|
| 55 |
+
temperature: float = 0.8,
|
| 56 |
+
max_new_tokens: int = 1024,
|
| 57 |
+
top_p: float = 1.0,
|
| 58 |
+
top_k: int = 20,
|
| 59 |
penalty: float = 1.2,
|
| 60 |
):
|
| 61 |
+
print(f"message: {message}")
|
| 62 |
+
print(f"history: {history}")
|
| 63 |
|
| 64 |
# Construct the conversation context
|
| 65 |
+
conversation = [{"role": "system", "content": system_prompt}]
|
|
|
|
|
|
|
| 66 |
for prompt, answer in history:
|
| 67 |
+
conversation.extend(
|
| 68 |
+
[
|
| 69 |
+
{"role": "user", "content": prompt},
|
| 70 |
+
{"role": "assistant", "content": answer},
|
| 71 |
+
]
|
| 72 |
+
)
|
| 73 |
|
| 74 |
conversation.append({"role": "user", "content": message})
|
| 75 |
|
| 76 |
# Tokenize the conversation input
|
| 77 |
+
input_ids = tokenizer.apply_chat_template(
|
| 78 |
+
conversation, add_generation_prompt=True, return_tensors="pt"
|
| 79 |
+
).to(model.device)
|
| 80 |
|
| 81 |
# Define the generation parameters
|
| 82 |
generate_kwargs = dict(
|
| 83 |
+
input_ids=input_ids,
|
| 84 |
max_new_tokens=max_new_tokens,
|
| 85 |
do_sample=False if temperature == 0 else True,
|
| 86 |
top_p=top_p,
|
| 87 |
top_k=top_k,
|
| 88 |
temperature=temperature,
|
| 89 |
repetition_penalty=penalty,
|
| 90 |
+
eos_token_id=[128001, 128008, 128009], # Define the end-of-sequence token
|
| 91 |
)
|
| 92 |
|
| 93 |
# Generate the output
|
src/scripts/nlp_processing.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spaces
|
| 2 |
+
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@spaces.GPU()
def embed_splitted_docs(splitted_docs, embedding_model):
    """
    Compute dense vector embeddings for a list of document chunks.

    Parameters
    ----------
    splitted_docs : List of str
        The document chunks to embed.
    embedding_model : SentenceTransformer
        The model whose ``encode`` method produces the embeddings.

    Returns
    -------
    List of numpy.ndarray
        One embedding vector per input chunk.
    """
    # Delegate directly to the model; the progress bar gives feedback on
    # long corpora.
    return embedding_model.encode(splitted_docs, show_progress_bar=True)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def split_corpus(corpus, chunk_size=1000, chunk_overlap=0):
    """
    Split a given corpus into chunks of a given size.

    Parameters
    ----------
    corpus : List of str
        The corpus to be split.
    chunk_size : int, default=1000
        The maximum size (in characters) of each chunk.
    chunk_overlap : int, default=0
        Number of characters shared between consecutive chunks. The
        default of 0 preserves the previous non-overlapping behaviour.

    Returns
    -------
    List of str
        The list of chunks (splitted documents) from the corpus.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, add_start_index=True
    )
    # `create_documents` returns Document objects; keep only the raw text.
    return [doc.page_content for doc in splitter.create_documents(corpus)]
|
src/scripts/topic_modeling.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import numpy as np
|
| 5 |
+
import spaces
|
| 6 |
+
from bertopic import BERTopic
|
| 7 |
+
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
|
| 8 |
+
from cuml.cluster import HDBSCAN
|
| 9 |
+
from cuml.manifold import UMAP
|
| 10 |
+
from cuml.preprocessing import normalize
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@spaces.GPU()
def topic_modeling(
    docs,
    embeddings,
    embedding_model,
    n_gram_range=(3, 6),
    mmr_diversity=1,
    mmr_top_n_words=30,
    keybert_top_n_words=50,
    random_state=42,
    min_cluster_size=15,
):
    """
    Perform topic modeling on a list of documents and their embeddings.

    Parameters
    ----------
    docs : List of str
        The documents to be topic modeled.
    embeddings : List of numpy.ndarray
        Precomputed embeddings, one per document.
    embedding_model : SentenceTransformer
        The embedding model used to generate the embeddings.
    n_gram_range : Tuple of int, optional
        The range of n-grams to consider. Defaults to (3, 6).
    mmr_diversity : float, optional
        Diversity value of the MMR representation. Defaults to 1.
    mmr_top_n_words : int, optional
        Number of top words for the MMR representation. Defaults to 30.
    keybert_top_n_words : int, optional
        Number of top words for the KeyBERT representation. Defaults to 50.
    random_state : int, optional
        Random seed for reproducibility. Defaults to 42.
    min_cluster_size : int, optional
        Minimum cluster size for a topic. Defaults to 15.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The datamap visualisation of the topics.
    topic_info_df : pandas.DataFrame
        The topic information dataframe.
    """
    # Dimensionality reduction applied to the embeddings before clustering.
    reducer = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        low_memory=False,
        random_state=random_state,
    )

    # Density-based clustering of the reduced embeddings.
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    # Two-stage topic representation: KeyBERT-style keywords refined by MMR.
    representations = [
        KeyBERTInspired(top_n_words=keybert_top_n_words, random_state=random_state),
        MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=mmr_top_n_words),
    ]

    model = BERTopic(
        embedding_model=embedding_model,
        representation_model=representations,
        n_gram_range=n_gram_range,
        hdbscan_model=clusterer,
        umap_model=reducer,
        verbose=True,
    ).fit(docs, embeddings=embeddings)

    # NOTE(review): visualize_document_datamap needs the optional
    # `datamapplot` package — confirm it is installed in the deployment.
    fig = model.visualize_document_datamap(docs=docs)

    return fig, model.get_topic_info()
|
src/topic-modeling.py
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
|
| 3 |
-
import matplotlib.pyplot as plt
|
| 4 |
-
import numpy as np
|
| 5 |
-
from bertopic import BERTopic
|
| 6 |
-
from bertopic.representation import KeyBERTInspired, LangChain, MaximalMarginalRelevance
|
| 7 |
-
from cuml.cluster import HDBSCAN
|
| 8 |
-
from cuml.manifold import UMAP
|
| 9 |
-
from cuml.preprocessing import normalize
|
| 10 |
-
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
|
| 11 |
-
|
| 12 |
-
n_gram_range = (3,6)
|
| 13 |
-
mmr_diversity = 1
|
| 14 |
-
mmr_top_n_words = 30
|
| 15 |
-
keybert_top_n_words = 50
|
| 16 |
-
random_state = 42
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
representation_model = [
|
| 20 |
-
KeyBERTInspired(top_n_words=keybert_top_n_words, random_state=random_state),
|
| 21 |
-
MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words = mmr_top_n_words),
|
| 22 |
-
]
|
| 23 |
-
|
| 24 |
-
chunk_size = 1000
|
| 25 |
-
|
| 26 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, add_start_index=True)
|
| 27 |
-
splitted_docs = splitter.create_documents(corpus)
|
| 28 |
-
splitted_docs = list(map(lambda x: x.page_content, splitted_docs))
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/utils/constants.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import tempfile

# Directory used for temporary copies of uploaded files. A portable
# replacement for the previous hard-coded "/home/ubuntu/temps/" path,
# which only existed on one specific machine. The trailing separator is
# kept because callers build paths by string concatenation (`TEMP_FOLDER + name`).
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), "topic_modeling_uploads") + os.sep

# Sentence-transformers model used to embed document chunks.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"
|
src/utils/utils.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
|
| 4 |
+
from src.utils.constants import TEMP_FOLDER
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def extract_corpus(fileobj):
    """
    Reads a file object and returns its contents as a list of strings.

    Copies the file to a temporary location on disk, then reads it line by
    line into a list.

    Parameters
    ----------
    fileobj : File-like object or str
        The uploaded file to read. Gradio passes a path-like object that
        exposes the original path via a ``name`` attribute; a plain path
        string is also accepted.

    Returns
    -------
    lines : List of str
        The contents of the file as a list of strings, one per line
        (newline characters preserved, as with ``readlines``).
    """
    # The original code relied on Gradio's NamedString being both a str
    # (for basename) and having a .name attribute (for copyfile); make the
    # source path explicit so plain strings also work.
    src_path = getattr(fileobj, "name", fileobj)

    # Ensure the temp directory exists before copying into it — otherwise
    # the first run on a fresh machine fails with FileNotFoundError.
    os.makedirs(TEMP_FOLDER, exist_ok=True)

    path = TEMP_FOLDER + os.path.basename(src_path)
    shutil.copyfile(src_path, path)

    with open(path, "r") as f:
        lines = f.readlines()

    return lines
|