Spaces:
Sleeping
Sleeping
Added summarization
Browse files
app.py
CHANGED
|
@@ -8,6 +8,15 @@ from transformers.pipelines.audio_utils import ffmpeg_read
|
|
| 8 |
import tempfile
|
| 9 |
import os
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
MODEL_NAME = "openai/whisper-large-v3"
|
| 12 |
BATCH_SIZE = 8
|
| 13 |
FILE_LIMIT_MB = 1000
|
|
@@ -22,13 +31,110 @@ pipe = pipeline(
|
|
| 22 |
device=device,
|
| 23 |
)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def transcribe(inputs, task):
|
| 27 |
if inputs is None:
|
| 28 |
raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
|
| 29 |
|
| 30 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
|
| 31 |
-
return
|
| 32 |
|
| 33 |
|
| 34 |
def _return_yt_html_embed(yt_url):
|
|
@@ -85,7 +191,7 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
|
|
| 85 |
|
| 86 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
|
| 87 |
|
| 88 |
-
return html_embed_str, text
|
| 89 |
|
| 90 |
|
| 91 |
demo = gr.Blocks()
|
|
|
|
| 8 |
import tempfile
|
| 9 |
import os
|
| 10 |
|
| 11 |
+
import numpy as np
|
| 12 |
+
from gensim.models import Word2Vec
|
| 13 |
+
from sklearn.cluster import KMeans
|
| 14 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
import spacy
|
| 17 |
+
from transformers import pipeline
|
| 18 |
+
from sklearn.metrics import davies_bouldin_score
|
| 19 |
+
|
| 20 |
MODEL_NAME = "openai/whisper-large-v3"
|
| 21 |
BATCH_SIZE = 8
|
| 22 |
FILE_LIMIT_MB = 1000
|
|
|
|
| 31 |
device=device,
|
| 32 |
)
|
| 33 |
|
| 34 |
+
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 35 |
+
|
| 36 |
+
nlp = spacy.load("en_core_web_sm")
|
| 37 |
+
|
| 38 |
+
def summarize(text, max_length=1000):
    """Summarize *text* with the module-level BART pipeline.

    The effective cap is min(max_length, len(text)).
    NOTE(review): len(text) counts characters while the pipeline's
    max_length counts tokens — presumably a rough upper bound; confirm.
    """
    capped = min(max_length, len(text))
    outputs = summarizer(text, max_length=capped, min_length=1, do_sample=False)
    return outputs[0]["summary_text"]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def segment_sentences(text):
    """Split *text* into sentences using the module-level spaCy pipeline.

    Returns a list of sentence strings in document order.
    """
    return [sentence.text for sentence in nlp(text).sents]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def preprocess_sentences(sentences):
    """Lowercase, tokenize and lemmatize each sentence with spaCy.

    Stop words and non-alphabetic tokens are dropped; the result is one
    list of lemmas per input sentence (possibly empty).
    """
    return [
        [tok.lemma_ for tok in nlp(sent.lower()) if not tok.is_stop and tok.is_alpha]
        for sent in sentences
    ]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def embedding(preprocessed_sentences):
    """Embed each tokenized sentence as the mean of its word vectors.

    Trains a skip-gram Word2Vec (100-d, window 5, min_count 1) on the
    token lists themselves, then mean-pools the vectors of each
    sentence's in-vocabulary words. Sentences with no in-vocabulary
    words fall back to a zero vector so every input gets an embedding.
    """
    model = Word2Vec(preprocessed_sentences, vector_size=100, window=5, min_count=1, sg=1)

    vectors = []
    for tokens in preprocessed_sentences:
        known = [model.wv[tok] for tok in tokens if tok in model.wv]
        # Zero vector as placeholder when nothing is in the vocabulary.
        vectors.append(np.mean(known, axis=0) if known else np.zeros(model.vector_size))
    return vectors
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def optimal_n_clusters(sentence_embeddings):
    """Choose the cluster count that minimizes the Davies-Bouldin index.

    KMeans is run on the pairwise cosine-similarity matrix (each row is
    treated as a feature vector) for every k in 2 .. n-1, where n is the
    number of embeddings, and the k with the lowest Davies-Bouldin score
    wins.

    Returns:
        (cosine_sim_matrix, best_k) — the similarity matrix is returned
        too so the caller can reuse it for the final clustering.

    Fix: the original raised ValueError (np.argmin over an empty list)
    when fewer than three embeddings were supplied, because the k sweep
    range(2, n) was empty; such inputs now degrade gracefully to one
    cluster per sentence.
    """
    cosine_sim_matrix = cosine_similarity(sentence_embeddings)

    n = len(sentence_embeddings)
    if n <= 2:
        # Too few points to sweep k: put each sentence in its own cluster.
        return (cosine_sim_matrix, max(1, n))

    db_scores = []
    for k in range(2, n):
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        cluster_labels = kmeans.fit_predict(cosine_sim_matrix)
        db_scores.append(davies_bouldin_score(cosine_sim_matrix, cluster_labels))

    # argmin is offset by 2 because the sweep starts at k=2 (k=1 is skipped).
    return (cosine_sim_matrix, np.argmin(db_scores) + 2)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def cluster_assignments(cosine_sim_matrix, optimal_n_clusters):
    """Cluster the similarity-matrix rows with KMeans at the chosen k.

    Returns the per-sentence cluster label array (seeded for
    reproducibility, matching the sweep in optimal_n_clusters).
    """
    model = KMeans(n_clusters=optimal_n_clusters, n_init=10, random_state=42)
    labels = model.fit_predict(cosine_sim_matrix)
    return labels
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def clusters(sentences, cluster_assignments):
    """Group sentences by cluster label and title each group.

    The title is a short (max 10) summary of the cluster's concatenated
    sentences; groups that happen to share a title are merged under it.
    Returns a defaultdict mapping title -> list of member sentences.
    """
    grouped = defaultdict(list)
    for label, sentence in zip(cluster_assignments, sentences):
        grouped[label].append(sentence)

    titled = defaultdict(list)
    # KMeans labels are contiguous 0..k-1, so indexing by range is safe here.
    for label in range(len(grouped)):
        members = grouped[label]
        title = summarize(' '.join(members), 10)
        titled[title].extend(members)

    return titled
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def format_as_bullet_points(dictionary):
    """Render a {heading: [items]} mapping as nested bullet-point text.

    Each key becomes a "- key:" line and each of its items an indented
    "  - item" line; surrounding whitespace is stripped from the result.
    """
    pieces = []
    for heading, items in dictionary.items():
        pieces.append(f"- {heading}:\n")
        pieces.extend(f"  - {item}\n" for item in items)
    return "".join(pieces).strip()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def final_result(input):
    """Run the full text-organization pipeline on raw text.

    Summarize, split into sentences, embed, pick a cluster count,
    cluster, and render the titled clusters as bullet points.

    NOTE(review): the parameter shadows the builtin `input`; kept
    unchanged for interface compatibility with existing callers.
    """
    summary = summarize(input)
    sentences = segment_sentences(summary)
    tokenized = preprocess_sentences(sentences)
    vectors = embedding(tokenized)
    sim_matrix, best_k = optimal_n_clusters(vectors)
    labels = cluster_assignments(sim_matrix, best_k)
    grouped = clusters(sentences, labels)
    return format_as_bullet_points(grouped)
|
| 130 |
+
|
| 131 |
|
| 132 |
def transcribe(inputs, task):
    """Transcribe (or translate) audio with Whisper, then return the
    clustered bullet-point organization of the transcript.

    Raises gr.Error when no audio file was provided.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    transcript = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return final_result(transcript)
|
| 138 |
|
| 139 |
|
| 140 |
def _return_yt_html_embed(yt_url):
|
|
|
|
| 191 |
|
| 192 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
|
| 193 |
|
| 194 |
+
return html_embed_str, final_result(text)
|
| 195 |
|
| 196 |
|
| 197 |
demo = gr.Blocks()
|