Dopler47 committed on
Commit ff54a63 · 1 Parent(s): 8e9ebaf

Topic modeling and code restructuring

app.py CHANGED
@@ -1,13 +1,35 @@
  import gradio as gr
- from bertopic import BERTopic
- from cuml.cluster import HDBSCAN
- from cuml.manifold import UMAP
- from cuml.preprocessing import normalize
-
- # embeddings = normalize(embeddings)
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+ import matplotlib.pyplot as plt
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer
+
+ from src.scripts.nlp_processing import embed_splitted_docs, split_corpus
+ from src.scripts.topic_modeling import topic_modeling
+ from src.utils.constants import EMBEDDING_MODEL_NAME
+ from src.utils.utils import extract_corpus
+
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
+
+
+ def greet(fileobj):
+
+     # Read the uploaded file into a corpus
+     corpus = extract_corpus(fileobj)
+
+     # Split the corpus into chunks
+     splitted_docs = split_corpus(corpus)
+
+     # Embed the split documents
+     embeddings = embed_splitted_docs(splitted_docs, embedding_model)
+
+     # Run the topic modeling
+     fig, df = topic_modeling(splitted_docs, embeddings, embedding_model)
+
+     # Return the figure and the topic dataframe
+     return (fig, df)
+
+
+ demo = gr.Interface(
+     fn=greet, inputs="file", outputs=[gr.outputs.Image(), gr.outputs.Dataframe()]
+ )
  demo.launch()
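
The new app.py wires the pipeline end to end: extract the corpus from the uploaded file, split it into chunks, embed the chunks, and fit the topic model. For reference, a minimal sketch of driving the same pipeline outside Gradio, assuming the src package is importable; "corpus.txt" is a hypothetical input file, not part of this commit:

from sentence_transformers import SentenceTransformer

from src.scripts.nlp_processing import embed_splitted_docs, split_corpus
from src.scripts.topic_modeling import topic_modeling
from src.utils.constants import EMBEDDING_MODEL_NAME

model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# "corpus.txt" is a hypothetical stand-in for the uploaded file
with open("corpus.txt") as f:
    corpus = f.readlines()

chunks = split_corpus(corpus)                 # character-level chunks
vectors = embed_splitted_docs(chunks, model)  # sentence-transformer embeddings
fig, topic_df = topic_modeling(chunks, vectors, model)
fig.savefig("datamap.png")  # per the docstring, fig is a matplotlib Figure
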
src/__init__.py DELETED
File without changes
src/embedding-chunks.py DELETED
@@ -1,10 +0,0 @@
- import spaces
- from sentence_transformers import SentenceTransformer
-
- embedding_model_name = "BAAI/bge-small-en"
- embedding_model = SentenceTransformer(embedding_model_name)
-
- @spaces.GPU()
- def embed_splitted_docs(splitted_docs):
-     embeddings = embedding_model.encode(splitted_docs, show_progress_bar=True)
-     return embeddings
src/{llm.py → scripts/llm.py} RENAMED
@@ -29,60 +29,65 @@ h3 {
  }
  """

- device = "cuda" # for GPU usage or "cpu" for CPU usage
+ device = "cuda"  # for GPU usage or "cpu" for CPU usage

  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
      bnb_4bit_use_double_quant=True,
-     bnb_4bit_quant_type= "nf4")
+     bnb_4bit_quant_type="nf4",
+ )

  tokenizer = AutoTokenizer.from_pretrained(MODEL)
  model = AutoModelForCausalLM.from_pretrained(
      MODEL,
      torch_dtype=torch.bfloat16,
      device_map="auto",
-     quantization_config=quantization_config)
+     quantization_config=quantization_config,
+ )
+

  @spaces.GPU()
  def chat(
-     message: str,
+     message: str,
      history: list,
      system_prompt: str,
-     temperature: float = 0.8,
-     max_new_tokens: int = 1024,
-     top_p: float = 1.0,
-     top_k: int = 20,
+     temperature: float = 0.8,
+     max_new_tokens: int = 1024,
+     top_p: float = 1.0,
+     top_k: int = 20,
      penalty: float = 1.2,
  ):
-     print(f'message: {message}')
-     print(f'history: {history}')
+     print(f"message: {message}")
+     print(f"history: {history}")

      # Construct the conversation context
-     conversation = [
-         {"role": "system", "content": system_prompt}
-     ]
+     conversation = [{"role": "system", "content": system_prompt}]
      for prompt, answer in history:
-         conversation.extend([
-             {"role": "user", "content": prompt},
-             {"role": "assistant", "content": answer},
-         ])
+         conversation.extend(
+             [
+                 {"role": "user", "content": prompt},
+                 {"role": "assistant", "content": answer},
+             ]
+         )

      conversation.append({"role": "user", "content": message})

      # Tokenize the conversation input
-     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
+     input_ids = tokenizer.apply_chat_template(
+         conversation, add_generation_prompt=True, return_tensors="pt"
+     ).to(model.device)

      # Define the generation parameters
      generate_kwargs = dict(
-         input_ids=input_ids,
+         input_ids=input_ids,
          max_new_tokens=max_new_tokens,
          do_sample=False if temperature == 0 else True,
          top_p=top_p,
          top_k=top_k,
          temperature=temperature,
          repetition_penalty=penalty,
-         eos_token_id=[128001,128008,128009], # Define the end-of-sequence token
+         eos_token_id=[128001, 128008, 128009],  # Define the end-of-sequence token
      )

      # Generate the output
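
The hunk ends at the comment before the generation call, so the rest of chat() is not shown in this diff. For context only, a hedged sketch of how such a function commonly finishes, streaming tokens from model.generate on a background thread; none of this is taken from the commit:

from threading import Thread

from transformers import TextIteratorStreamer

# Sketch of a typical continuation inside chat(); assumes the tokenizer,
# model, and generate_kwargs defined in the diff above.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs["streamer"] = streamer

# Run generation in a worker thread so tokens can be consumed as they arrive.
thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

response = ""
for new_text in streamer:  # yields decoded text as generation proceeds
    response += new_text
    yield response
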
src/scripts/nlp_processing.py ADDED
@@ -0,0 +1,48 @@
+ import spaces
+ from langchain_text_splitters.character import RecursiveCharacterTextSplitter
+
+
+ @spaces.GPU()
+ def embed_splitted_docs(splitted_docs, embedding_model):
+     """
+     Encode the given list of documents using the specified embedding model.
+
+     Parameters
+     ----------
+     splitted_docs : List of str
+         The list of documents to be embedded.
+     embedding_model : SentenceTransformer
+         The model used to encode the documents.
+
+     Returns
+     -------
+     embeddings : List of numpy.ndarray
+         The embeddings of the given documents.
+     """
+     embeddings = embedding_model.encode(splitted_docs, show_progress_bar=True)
+     return embeddings
+
+
+ def split_corpus(corpus, chunk_size=1000):
+     """
+     Split a given corpus into chunks of a given size.
+
+     Parameters
+     ----------
+     corpus : List of str
+         The corpus to be split.
+     chunk_size : int, default=1000
+         The size of the chunks to be split from the corpus.
+
+     Returns
+     -------
+     List of str
+         The list of chunks (split documents) from the corpus.
+     """
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size, chunk_overlap=0, add_start_index=True
+     )
+     splitted_docs = splitter.create_documents(corpus)
+     splitted_docs = list(map(lambda x: x.page_content, splitted_docs))
+
+     return splitted_docs
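
A quick usage sketch of the two helpers on a toy corpus, assuming the spaces package is available (it is on Hugging Face Spaces, where the @spaces.GPU decorator applies); the dimension in the comment assumes BAAI/bge-small-en, which produces 384-dimensional vectors:

from sentence_transformers import SentenceTransformer

from src.scripts.nlp_processing import embed_splitted_docs, split_corpus

model = SentenceTransformer("BAAI/bge-small-en")
corpus = ["First toy document ...", "Second toy document ..."]  # hypothetical input

chunks = split_corpus(corpus, chunk_size=1000)  # <=1000-character chunks, no overlap
vectors = embed_splitted_docs(chunks, model)    # array of shape (len(chunks), 384)
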
src/scripts/topic_modeling.py ADDED
@@ -0,0 +1,90 @@
+ import os
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import spaces
+ from bertopic import BERTopic
+ from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
+ from cuml.cluster import HDBSCAN
+ from cuml.manifold import UMAP
+ from cuml.preprocessing import normalize
+
+
+ @spaces.GPU()
+ def topic_modeling(
+     docs,
+     embeddings,
+     embedding_model,
+     n_gram_range=(3, 6),
+     mmr_diversity=1,
+     mmr_top_n_words=30,
+     keybert_top_n_words=50,
+     random_state=42,
+     min_cluster_size=15,
+ ):
+     """
+     Perform topic modeling on a list of documents and their embeddings.
+
+     Parameters
+     ----------
+     docs : List of str
+         The list of documents to be topic modeled.
+     embeddings : List of numpy.ndarray
+         The list of embeddings of the given documents.
+     embedding_model : SentenceTransformer
+         The embedding model used to generate the embeddings.
+     n_gram_range : Tuple of int, optional
+         The range of n-grams to be considered. Defaults to (3, 6).
+     mmr_diversity : float, optional
+         The diversity value of the MMR model. Defaults to 1.
+     mmr_top_n_words : int, optional
+         The number of top words to be considered in the MMR model. Defaults to 30.
+     keybert_top_n_words : int, optional
+         The number of top words to be considered in the KeyBERT model. Defaults to 50.
+     random_state : int, optional
+         The random seed for reproducibility. Defaults to 42.
+     min_cluster_size : int, optional
+         The minimum size of a cluster to be considered as a topic. Defaults to 15.
+
+     Returns
+     -------
+     fig : matplotlib.figure.Figure
+         The document datamap figure of the topics.
+     topic_info_df : pandas.DataFrame
+         The topic information dataframe.
+     """
+     representation_model = [
+         KeyBERTInspired(top_n_words=keybert_top_n_words, random_state=random_state),
+         MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words=mmr_top_n_words),
+     ]
+
+     hdbscan_model = HDBSCAN(
+         min_cluster_size=min_cluster_size,
+         metric="euclidean",
+         cluster_selection_method="eom",
+         prediction_data=True,
+     )
+
+     umap_model = UMAP(
+         n_neighbors=15,
+         n_components=5,
+         min_dist=0.0,
+         metric="cosine",
+         low_memory=False,
+         random_state=random_state,
+     )
+
+     topic_model = BERTopic(
+         embedding_model=embedding_model,
+         representation_model=representation_model,
+         n_gram_range=n_gram_range,
+         hdbscan_model=hdbscan_model,
+         umap_model=umap_model,
+         verbose=True,
+     ).fit(docs, embeddings=embeddings)
+
+     fig = topic_model.visualize_document_datamap(docs=docs)
+
+     topic_info_df = topic_model.get_topic_info()
+
+     return fig, topic_info_df
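
The cuML imports tie this function to a CUDA GPU. On a CPU-only machine the reference libraries mirror the cuML API for every argument used here, so a sketch of the swap (an assumption, not part of this commit) is:

# CPU fallback sketch: umap-learn and hdbscan expose the same constructors
# and arguments as the cuML versions imported above.
from hdbscan import HDBSCAN
from umap import UMAP

Continuing the usage sketch from nlp_processing.py above:

fig, topic_df = topic_modeling(chunks, vectors, model, min_cluster_size=10)
print(topic_df.head())  # one row per topic: Topic, Count, Name, ...
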
src/topic-modeling.py DELETED
@@ -1,29 +0,0 @@
- import os
-
- import matplotlib.pyplot as plt
- import numpy as np
- from bertopic import BERTopic
- from bertopic.representation import KeyBERTInspired, LangChain, MaximalMarginalRelevance
- from cuml.cluster import HDBSCAN
- from cuml.manifold import UMAP
- from cuml.preprocessing import normalize
- from langchain_text_splitters.character import RecursiveCharacterTextSplitter
-
- n_gram_range = (3,6)
- mmr_diversity = 1
- mmr_top_n_words = 30
- keybert_top_n_words = 50
- random_state = 42
-
-
- representation_model = [
-     KeyBERTInspired(top_n_words=keybert_top_n_words, random_state=random_state),
-     MaximalMarginalRelevance(diversity=mmr_diversity, top_n_words = mmr_top_n_words),
- ]
-
- chunk_size = 1000
-
- splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0, add_start_index=True)
- splitted_docs = splitter.create_documents(corpus)
- splitted_docs = list(map(lambda x: x.page_content, splitted_docs))
-
src/utils/constants.py ADDED
@@ -0,0 +1,2 @@
+ TEMP_FOLDER = "/home/ubuntu/temps/"
+ EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"
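
TEMP_FOLDER is a hard-coded absolute path that must already exist on the host. A portable alternative (a sketch of an assumption, not this commit's choice) derives it from the system temp directory:

# Hypothetical portable variant of TEMP_FOLDER
import os
import tempfile

TEMP_FOLDER = os.path.join(tempfile.gettempdir(), "temps") + os.sep
os.makedirs(TEMP_FOLDER, exist_ok=True)  # ensure the folder exists before use
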
src/utils/utils.py ADDED
@@ -0,0 +1,30 @@
+ import os
+ import shutil
+
+ from src.utils.constants import TEMP_FOLDER
+
+
+ def extract_corpus(fileobj):
+     """
+     Read a file object and return its contents as a list of strings.
+
+     Copies the file to a temporary location on disk, then reads it line by line
+     into a list.
+
+     Parameters
+     ----------
+     fileobj : File-like object
+         The file to read.
+
+     Returns
+     -------
+     lines : List of str
+         The contents of the file as a list of strings.
+     """
+     path = TEMP_FOLDER + os.path.basename(fileobj.name)
+     shutil.copyfile(fileobj.name, path)
+
+     with open(path, "r") as f:
+         lines = f.readlines()
+
+     return lines
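
A usage sketch for extract_corpus outside Gradio: a plain file handle can stand in for the uploaded file object, since the function only uses its .name attribute. This assumes TEMP_FOLDER exists; "sample.txt" is a hypothetical input file:

from src.utils.utils import extract_corpus

with open("sample.txt") as fileobj:  # hypothetical input file
    lines = extract_corpus(fileobj)

print(f"read {len(lines)} lines")
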