Daniel Tse
committed on
Commit
·
9bb604c
1
Parent(s):
01bea1f
Add sentence chunking
Browse files
app.py
CHANGED
|
@@ -29,12 +29,47 @@ def transcribe_audio(audiofile):
|
|
| 29 |
st.info('Done Transcription')
|
| 30 |
|
| 31 |
return transcription
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def summarize_podcast(audiotranscription):
|
| 34 |
st.info("Summarizing...")
|
| 35 |
summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)
|
| 36 |
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
st.session_state['summary'] = summarized_text
|
| 39 |
return summarized_text
|
| 40 |
|
|
|
|
| 29 |
st.info('Done Transcription')
|
| 30 |
|
| 31 |
return transcription
|
| 32 |
+
def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
    """Split ``text`` into sentence-aligned chunks that fit the model input.

    The text is split into sentences with ``sent_tokenize`` and the sentences
    are packed greedily, in order, into chunks whose token count (measured
    with the tokenizer loaded for ``model_name``) stays within
    ``tokenizer.max_len_single_sentence``.

    Args:
        text: The text to split.
        model_name: Name passed to ``AutoTokenizer.from_pretrained``; that
            tokenizer supplies both the token counts and the length limit.

    Returns:
        A list of chunk strings in input order. Each chunk is its sentences
        separated by single spaces and followed by one trailing space. Text
        with no sentences yields an empty list. A single sentence longer
        than the limit is never split; it is kept whole in its own chunk.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_tokens = tokenizer.max_len_single_sentence

    chunks = []
    current_sentences = []
    current_length = 0

    for sentence in sent_tokenize(text):
        # Tokenize once per sentence; the count is reused whether the
        # sentence joins the current chunk or opens the next one.
        sentence_length = len(tokenizer.tokenize(sentence))

        # Flush only a non-empty chunk, so an over-long first sentence does
        # not produce an empty chunk ahead of it.
        if current_sentences and current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_sentences) + " ")
            current_sentences = []
            current_length = 0

        current_sentences.append(sentence)
        current_length += sentence_length

    # Always save whatever is pending after the loop. Saving only when the
    # last sentence fits would drop a final sentence that overflowed into a
    # fresh chunk.
    if current_sentences:
        chunks.append(" ".join(current_sentences) + " ")

    return chunks
|
| 64 |
|
| 65 |
def summarize_podcast(audiotranscription):
    """Summarize a transcription chunk by chunk and store the result.

    The transcription is split with ``chunk_and_preprocess_text`` and the
    resulting list of chunks is passed to a ``"summarization"`` pipeline for
    the ``philschmid/flan-t5-base-samsum`` model. Progress messages are shown
    with ``st.info``.

    Args:
        audiotranscription: The transcription text to summarize.

    Returns:
        The pipeline's output for the list of chunks, unchanged. The same
        object is also stored in ``st.session_state['summary']``.
        NOTE(review): presumably a list with one ``{"summary_text": ...}``
        dict per chunk -- confirm against the transformers documentation.
    """
    st.info("Summarizing...")
    # NOTE(review): device=0 selects the first accelerator; this presumably
    # fails on a CPU-only host -- confirm the deployment target.
    summarizer = pipeline("summarization", model="philschmid/flan-t5-base-samsum", device=0)

    st.info("Chunking text")
    text_chunks = chunk_and_preprocess_text(audiotranscription)

    # The generation limits are named max_length / min_length; the previous
    # max_len / min_len spellings are not recognised generation arguments,
    # so the intended 50-200 token bounds were never applied.
    summarized_text = summarizer(text_chunks, max_length=200, min_length=50)
    st.session_state['summary'] = summarized_text
    return summarized_text
|
| 75 |
|