demomodels committed on
Commit
bef26f3
·
verified ·
1 Parent(s): 9ba2a1c

Added summarization

Browse files
Files changed (1) hide show
  1. app.py +108 -2
app.py CHANGED
@@ -8,6 +8,15 @@ from transformers.pipelines.audio_utils import ffmpeg_read
8
  import tempfile
9
  import os
10
 
 
 
 
 
 
 
 
 
 
11
  MODEL_NAME = "openai/whisper-large-v3"
12
  BATCH_SIZE = 8
13
  FILE_LIMIT_MB = 1000
@@ -22,13 +31,110 @@ pipe = pipeline(
22
  device=device,
23
  )
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def transcribe(inputs, task):
27
  if inputs is None:
28
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
29
 
30
  text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
31
- return text
32
 
33
 
34
  def _return_yt_html_embed(yt_url):
@@ -85,7 +191,7 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
85
 
86
  text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
87
 
88
- return html_embed_str, text
89
 
90
 
91
  demo = gr.Blocks()
 
8
  import tempfile
9
  import os
10
 
11
+ import numpy as np
12
+ from gensim.models import Word2Vec
13
+ from sklearn.cluster import KMeans
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+ from collections import defaultdict
16
+ import spacy
17
+ from transformers import pipeline
18
+ from sklearn.metrics import davies_bouldin_score
19
+
20
  MODEL_NAME = "openai/whisper-large-v3"
21
  BATCH_SIZE = 8
22
  FILE_LIMIT_MB = 1000
 
31
  device=device,
32
  )
33
 
34
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
35
+
36
+ nlp = spacy.load("en_core_web_sm")
37
+
38
def summarize(text, max_length=1000):
    """Condense *text* using the BART summarization pipeline.

    The requested max_length is capped at the character length of the
    input so that very short inputs do not ask for an impossible
    summary size. Returns the summary string only.
    """
    cap = min(max_length, len(text))
    output = summarizer(text, max_length=cap, min_length=1, do_sample=False)
    return output[0]["summary_text"]
40
+
41
+
42
def segment_sentences(text):
    """Split *text* into sentences with the spaCy pipeline."""
    parsed = nlp(text)
    sentences = []
    for span in parsed.sents:
        sentences.append(span.text)
    return sentences
48
+
49
+
50
def preprocess_sentences(sentences):
    """Lowercase, tokenize and lemmatize each sentence.

    Stop words and non-alphabetic tokens are discarded; each input
    sentence becomes a list of lemmas.
    """
    return [
        [tok.lemma_ for tok in nlp(sent.lower()) if not tok.is_stop and tok.is_alpha]
        for sent in sentences
    ]
60
+
61
+
62
def embedding(preprocessed_sentences):
    """Mean-pool Word2Vec word vectors into one vector per sentence.

    A fresh Word2Vec model is trained on the (small) input corpus.
    Sentences whose tokens are all missing from the vocabulary get a
    zero vector as a placeholder.
    """
    w2v = Word2Vec(preprocessed_sentences, vector_size=100, window=5, min_count=1, sg=1)

    vectors = []
    for tokens in preprocessed_sentences:
        known = [w2v.wv[tok] for tok in tokens if tok in w2v.wv]
        if known:
            vectors.append(np.mean(known, axis=0))
        else:
            # No in-vocabulary token in this sentence: zero placeholder.
            vectors.append(np.zeros(w2v.vector_size))

    return vectors
75
+
76
+
77
def optimal_n_clusters(sentence_embeddings):
    """Choose the KMeans cluster count via the Davies-Bouldin index.

    Builds the pairwise cosine-similarity matrix, clusters it for every
    candidate k in [2, n_sentences), and keeps the k with the lowest
    (best) Davies-Bouldin score.

    Returns:
        (cosine_sim_matrix, best_k). With fewer than 3 embeddings there
        is no candidate k to score — the original code crashed on
        np.argmin of an empty list — so fall back to a single cluster.
    """
    cosine_sim_matrix = cosine_similarity(sentence_embeddings)

    n = len(sentence_embeddings)
    if n < 3:
        # range(2, n) would be empty; use one cluster for tiny inputs.
        return cosine_sim_matrix, 1

    db_scores = []
    for k in range(2, n):
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        cluster_labels = kmeans.fit_predict(cosine_sim_matrix)
        db_scores.append(davies_bouldin_score(cosine_sim_matrix, cluster_labels))

    # argmin indexes the k=2..n-1 candidates; shift back by 2.
    return cosine_sim_matrix, int(np.argmin(db_scores)) + 2
89
+
90
+
91
def cluster_assignments(cosine_sim_matrix, optimal_n_clusters):
    """Assign each sentence to one of the chosen clusters via KMeans."""
    model = KMeans(n_clusters=optimal_n_clusters, n_init=10, random_state=42)
    model.fit(cosine_sim_matrix)
    return model.labels_
95
+
96
+
97
def clusters(sentences, cluster_assignments):
    """Group sentences by cluster label and give each group a title.

    Each cluster's sentences are joined and summarized into a short
    title; the result maps title -> list of member sentences. (If two
    clusters produce the same title their members are merged.)
    """
    # Renamed from `clusters` to avoid shadowing this function's name.
    grouped = defaultdict(list)
    for sentence, label in zip(sentences, cluster_assignments):
        grouped[label].append(sentence)

    result = defaultdict(list)
    # Iterate the labels actually present (sorted for stable output)
    # instead of range(len(...)): the original silently materialized
    # empty clusters on the defaultdict for any label gap.
    for label in sorted(grouped):
        members = grouped[label]
        title = summarize(' '.join(members), 10)
        result[title].extend(members)

    return result
110
+
111
+
112
def format_as_bullet_points(dictionary):
    """Render {heading: [items, ...]} as a nested bullet-point string."""
    parts = []
    for heading, items in dictionary.items():
        parts.append(f"- {heading}:\n")
        for item in items:
            parts.append(f" - {item}\n")
    # Join once, then strip the trailing newline (as the original did).
    return "".join(parts).strip()
119
+
120
+
121
def final_result(input):
    """Full pipeline: summarize, segment, embed, cluster, and format.

    The transcript is condensed, split into sentences, embedded with
    Word2Vec, clustered into topics, and rendered as titled bullet
    points.

    NOTE(review): the parameter shadows the builtin `input`; kept
    unchanged so existing callers (positional or keyword) still work.
    """
    text = summarize(input)
    sentences = segment_sentences(text)

    # Too few sentences to cluster (candidate k needs 2 <= k < n);
    # the original pipeline crashed here. Put everything under one
    # summarized title instead.
    if len(sentences) < 3:
        return format_as_bullet_points({summarize(text, 10): sentences})

    preprocessed = preprocess_sentences(sentences)
    embeddings = embedding(preprocessed)
    sim_matrix, n_clusters = optimal_n_clusters(embeddings)
    assignments = cluster_assignments(sim_matrix, n_clusters)
    return format_as_bullet_points(clusters(sentences, assignments))
130
+
131
 
132
def transcribe(inputs, task):
    """Run Whisper on an audio input and return the summarized outline.

    Raises gr.Error when no audio file was provided.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return final_result(result["text"])
138
 
139
 
140
  def _return_yt_html_embed(yt_url):
 
191
 
192
  text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
193
 
194
+ return html_embed_str, final_result(text)
195
 
196
 
197
  demo = gr.Blocks()