Spaces:
Sleeping
Sleeping
Added summarization
Browse files
app.py
CHANGED
|
@@ -8,6 +8,15 @@ from transformers.pipelines.audio_utils import ffmpeg_read
|
|
| 8 |
import tempfile
|
| 9 |
import os
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
MODEL_NAME = "openai/whisper-large-v3"
|
| 12 |
BATCH_SIZE = 8
|
| 13 |
FILE_LIMIT_MB = 1000
|
|
@@ -22,13 +31,110 @@ pipe = pipeline(
|
|
| 22 |
device=device,
|
| 23 |
)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def transcribe(inputs, task):
|
| 27 |
if inputs is None:
|
| 28 |
raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
|
| 29 |
|
| 30 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
|
| 31 |
-
return
|
| 32 |
|
| 33 |
|
| 34 |
def _return_yt_html_embed(yt_url):
|
|
@@ -85,7 +191,7 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
|
|
| 85 |
|
| 86 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
|
| 87 |
|
| 88 |
-
return html_embed_str, text
|
| 89 |
|
| 90 |
|
| 91 |
demo = gr.Blocks()
|
|
|
|
| 8 |
import tempfile
|
| 9 |
import os
|
| 10 |
|
| 11 |
+
import numpy as np
|
| 12 |
+
from gensim.models import Word2Vec
|
| 13 |
+
from sklearn.cluster import KMeans
|
| 14 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
import spacy
|
| 17 |
+
from transformers import pipeline
|
| 18 |
+
from sklearn.metrics import davies_bouldin_score
|
| 19 |
+
|
| 20 |
MODEL_NAME = "openai/whisper-large-v3"
|
| 21 |
BATCH_SIZE = 8
|
| 22 |
FILE_LIMIT_MB = 1000
|
|
|
|
| 31 |
device=device,
|
| 32 |
)
|
| 33 |
|
| 34 |
+
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
| 35 |
+
|
| 36 |
+
nlp = spacy.load("en_core_web_sm")
|
| 37 |
+
|
| 38 |
+
def summarize(text, max_length=1000):
    """Summarize *text* with the module-level BART pipeline.

    The effective cap is min(max_length, len(text)).
    NOTE(review): len(text) counts characters while the pipeline's
    max_length counts tokens — presumably a rough upper bound; confirm.
    """
    capped = min(max_length, len(text))
    outputs = summarizer(text, max_length=capped, min_length=1, do_sample=False)
    return outputs[0]["summary_text"]
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def segment_sentences(text):
    """Split *text* into sentences using the module-level spaCy pipeline.

    Returns a list of sentence strings in document order.
    """
    return [sentence.text for sentence in nlp(text).sents]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def preprocess_sentences(sentences):
    """Lowercase, tokenize and lemmatize each sentence with spaCy.

    Stop words and non-alphabetic tokens are dropped; the result is one
    list of lemmas per input sentence (possibly empty).
    """
    return [
        [tok.lemma_ for tok in nlp(sent.lower()) if not tok.is_stop and tok.is_alpha]
        for sent in sentences
    ]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def embedding(preprocessed_sentences):
    """Embed each tokenized sentence as the mean of its word vectors.

    Trains a skip-gram Word2Vec (100-d, window 5, min_count 1) on the
    token lists themselves, then mean-pools the vectors of each
    sentence's in-vocabulary words. Sentences with no in-vocabulary
    words fall back to a zero vector so every input gets an embedding.
    """
    model = Word2Vec(preprocessed_sentences, vector_size=100, window=5, min_count=1, sg=1)

    vectors = []
    for tokens in preprocessed_sentences:
        known = [model.wv[tok] for tok in tokens if tok in model.wv]
        # Zero vector as placeholder when nothing is in the vocabulary.
        vectors.append(np.mean(known, axis=0) if known else np.zeros(model.vector_size))
    return vectors
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def optimal_n_clusters(sentence_embeddings):
    """Choose the cluster count that minimizes the Davies-Bouldin index.

    KMeans is run on the pairwise cosine-similarity matrix (each row is
    treated as a feature vector) for every k in 2 .. n-1, where n is the
    number of embeddings, and the k with the lowest Davies-Bouldin score
    wins.

    Returns:
        (cosine_sim_matrix, best_k) — the similarity matrix is returned
        too so the caller can reuse it for the final clustering.

    Fix: the original raised ValueError (np.argmin over an empty list)
    when fewer than three embeddings were supplied, because the k sweep
    range(2, n) was empty; such inputs now degrade gracefully to one
    cluster per sentence.
    """
    cosine_sim_matrix = cosine_similarity(sentence_embeddings)

    n = len(sentence_embeddings)
    if n <= 2:
        # Too few points to sweep k: put each sentence in its own cluster.
        return (cosine_sim_matrix, max(1, n))

    db_scores = []
    for k in range(2, n):
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
        cluster_labels = kmeans.fit_predict(cosine_sim_matrix)
        db_scores.append(davies_bouldin_score(cosine_sim_matrix, cluster_labels))

    # argmin is offset by 2 because the sweep starts at k=2 (k=1 is skipped).
    return (cosine_sim_matrix, np.argmin(db_scores) + 2)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def cluster_assignments(cosine_sim_matrix, optimal_n_clusters):
    """Cluster the similarity-matrix rows with KMeans at the chosen k.

    Returns the per-sentence cluster label array (seeded for
    reproducibility, matching the sweep in optimal_n_clusters).
    """
    model = KMeans(n_clusters=optimal_n_clusters, n_init=10, random_state=42)
    labels = model.fit_predict(cosine_sim_matrix)
    return labels
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def clusters(sentences, cluster_assignments):
    """Group sentences by cluster label and title each group.

    The title is a short (max 10) summary of the cluster's concatenated
    sentences; groups that happen to share a title are merged under it.
    Returns a defaultdict mapping title -> list of member sentences.
    """
    grouped = defaultdict(list)
    for label, sentence in zip(cluster_assignments, sentences):
        grouped[label].append(sentence)

    titled = defaultdict(list)
    # KMeans labels are contiguous 0..k-1, so indexing by range is safe here.
    for label in range(len(grouped)):
        members = grouped[label]
        title = summarize(' '.join(members), 10)
        titled[title].extend(members)

    return titled
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def format_as_bullet_points(dictionary):
    """Render a {heading: [items]} mapping as nested bullet-point text.

    Each key becomes a "- key:" line and each of its items an indented
    "  - item" line; surrounding whitespace is stripped from the result.
    """
    pieces = []
    for heading, items in dictionary.items():
        pieces.append(f"- {heading}:\n")
        pieces.extend(f"  - {item}\n" for item in items)
    return "".join(pieces).strip()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def final_result(input):
    """Run the full text-organization pipeline on raw text.

    Summarize, split into sentences, embed, pick a cluster count,
    cluster, and render the titled clusters as bullet points.

    NOTE(review): the parameter shadows the builtin `input`; kept
    unchanged for interface compatibility with existing callers.
    """
    summary = summarize(input)
    sentences = segment_sentences(summary)
    tokenized = preprocess_sentences(sentences)
    vectors = embedding(tokenized)
    sim_matrix, best_k = optimal_n_clusters(vectors)
    labels = cluster_assignments(sim_matrix, best_k)
    grouped = clusters(sentences, labels)
    return format_as_bullet_points(grouped)
|
| 130 |
+
|
| 131 |
|
| 132 |
def transcribe(inputs, task):
    """Transcribe (or translate) audio with Whisper, then return the
    clustered bullet-point organization of the transcript.

    Raises gr.Error when no audio file was provided.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    transcript = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return final_result(transcript)
|
| 138 |
|
| 139 |
|
| 140 |
def _return_yt_html_embed(yt_url):
|
|
|
|
| 191 |
|
| 192 |
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
|
| 193 |
|
| 194 |
+
return html_embed_str, final_result(text)
|
| 195 |
|
| 196 |
|
| 197 |
demo = gr.Blocks()
|