Dopler47 committed on
Commit
c6607a8
·
1 Parent(s): f2f35ab

debug and testing error

Browse files
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import matplotlib.pyplot as plt
3
  import pandas as pd
 
4
  from sentence_transformers import SentenceTransformer
5
 
6
  from src.scripts.nlp_processing import embed_splitted_docs, split_corpus
@@ -11,7 +12,17 @@ from src.utils.utils import extract_corpus
11
  embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
12
 
13
 
 
 
 
 
 
 
 
 
 
14
  def greet(fileobj):
 
15
 
16
  # Read the file
17
  corpus = extract_corpus(fileobj)
@@ -20,10 +31,10 @@ def greet(fileobj):
20
  splitted_docs = split_corpus(corpus)
21
 
22
  # Embed the splitted documents
23
- embeddings = embed_splitted_docs(splitted_docs, embedding_model)
24
 
25
  # Topic modeling
26
- fig, df = topic_modeling(splitted_docs, embeddings, embedding_model)
27
 
28
  # Save the figure
29
  return (fig, df)
 
1
  import gradio as gr
2
  import matplotlib.pyplot as plt
3
  import pandas as pd
4
+ import spaces
5
  from sentence_transformers import SentenceTransformer
6
 
7
  from src.scripts.nlp_processing import embed_splitted_docs, split_corpus
 
12
  embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
13
 
14
 
15
+ @spaces.GPU()
16
+ def test():
17
+ embeddings = embedding_model.encode(
18
+ ["Test1", "Test2", "Test3"], show_progress_bar=True
19
+ )
20
+ print(":" * 10 + " TEST " + "*" * 10)
21
+ print(embeddings)
22
+
23
+
24
  def greet(fileobj):
25
+ test()
26
 
27
  # Read the file
28
  corpus = extract_corpus(fileobj)
 
31
  splitted_docs = split_corpus(corpus)
32
 
33
  # Embed the splitted documents
34
+ embeddings = embed_splitted_docs(splitted_docs)
35
 
36
  # Topic modeling
37
+ fig, df = topic_modeling(splitted_docs, embeddings)
38
 
39
  # Save the figure
40
  return (fig, df)
src/scripts/nlp_processing.py CHANGED
@@ -1,8 +1,13 @@
1
  import spaces
2
  from langchain_text_splitters.character import RecursiveCharacterTextSplitter
 
3
 
 
 
4
 
5
- def embed_splitted_docs(splitted_docs, embedding_model):
 
 
6
  """
7
  Encode the given list of documents using the specified embedding model.
8
 
 
1
  import spaces
2
  from langchain_text_splitters.character import RecursiveCharacterTextSplitter
3
+ from sentence_transformers import SentenceTransformer
4
 
5
+ EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"
6
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
7
 
8
+
9
+ @spaces.GPU()
10
+ def embed_splitted_docs(splitted_docs):
11
  """
12
  Encode the given list of documents using the specified embedding model.
13
 
src/scripts/topic_modeling.py CHANGED
@@ -1,20 +1,18 @@
1
- import os
2
-
3
- import matplotlib.pyplot as plt
4
- import numpy as np
5
  import spaces
6
  from bertopic import BERTopic
7
  from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
8
  from cuml.cluster import HDBSCAN
9
  from cuml.manifold import UMAP
10
- from cuml.preprocessing import normalize
 
 
 
11
 
12
 
13
  @spaces.GPU()
14
  def topic_modeling(
15
  docs,
16
  embeddings,
17
- embedding_model,
18
  n_gram_range=(3, 6),
19
  mmr_diversity=1,
20
  mmr_top_n_words=30,
 
 
 
 
 
1
  import spaces
2
  from bertopic import BERTopic
3
  from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
4
  from cuml.cluster import HDBSCAN
5
  from cuml.manifold import UMAP
6
+ from sentence_transformers import SentenceTransformer
7
+
8
+ EMBEDDING_MODEL_NAME = "BAAI/bge-small-en"
9
+ embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
10
 
11
 
12
  @spaces.GPU()
13
  def topic_modeling(
14
  docs,
15
  embeddings,
 
16
  n_gram_range=(3, 6),
17
  mmr_diversity=1,
18
  mmr_top_n_words=30,