Spaces:

KevlarVK
/

content_summarizer

Runtime error

a-b-v-k

Revert "Streamlit version upgrade"

22c2f92 unverified almost 3 years ago

2.69 kB

	import re
	from urllib.parse import parse_qs, urlparse

	import nltk
	import requests
	import spacy
	import streamlit as st
	from bs4 import BeautifulSoup
	from nltk.tokenize import sent_tokenize
	from youtube_transcript_api import YouTubeTranscriptApi

	@st.cache
	def fetch_article_text(url: str):
	    """Download a web page and return its headline and paragraph text.

	    The page is fetched with ``requests`` and parsed with BeautifulSoup's
	    ``html.parser``.  The text of every ``<h1>`` and ``<p>`` element is
	    joined with single spaces, and bracketed numeric markers such as
	    ``[12]`` are removed from the result.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The extracted text as a single string.

	    Raises:
	        requests.exceptions.RequestException: If the request fails or
	            the server does not respond within the timeout.
	    """
	    # requests waits forever by default; a timeout keeps an unresponsive
	    # host from hanging the whole app.
	    response = requests.get(url, timeout=30)
	    soup = BeautifulSoup(response.text, "html.parser")
	    article = " ".join(node.text for node in soup.find_all(["h1", "p"]))
	    return re.sub(r'\[\d+\]', '', article)

	def count_tokens(text: str) -> int:
	    """Return the number of whitespace-separated words in ``text``.

	    Any run of whitespace (spaces, tabs, newlines) counts as a single
	    separator, so repeated or leading/trailing spaces do not inflate the
	    count and an empty or blank string yields 0.

	    Args:
	        text: The string to measure.

	    Returns:
	        The word count.
	    """
	    # split() with no argument collapses whitespace runs; split(" ") would
	    # emit an empty string for every extra space and report 1 for "".
	    return len(text.split())

	def _extract_video_id(url: str) -> str:
	    """Return the YouTube video id contained in ``url``.

	    Recognises the ``v`` query parameter (``watch?v=ID&t=5``), short links
	    (``youtu.be/ID``) and path-based links (``/embed/ID``, ``/shorts/ID``,
	    ``/live/ID``, ``/v/ID``).

	    Raises:
	        ValueError: If no video id can be found in ``url``.
	    """
	    # Prefix scheme-less input with "//" so urlparse still separates the
	    # host from the path.
	    parsed = urlparse(url if "://" in url else "//" + url)

	    query_ids = parse_qs(parsed.query).get("v")
	    if query_ids:
	        return query_ids[0]

	    segments = [part for part in parsed.path.split("/") if part]
	    host = parsed.hostname or ""
	    if segments and (host == "youtu.be" or host.endswith(".youtu.be")):
	        return segments[0]
	    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
	        return segments[1]

	    raise ValueError(f"Could not find a YouTube video id in URL: {url!r}")


	@st.cache
	def get_text_from_youtube_url(url: str):
	    """Fetch a YouTube video's transcript and return it as punctuated text.

	    The video id is extracted from ``url`` and the transcript is requested
	    through ``YouTubeTranscriptApi``.  Each entry's ``"text"`` is
	    lower-cased and followed by a space; entries equal to ``'[Music]'``
	    are skipped.  The combined text is passed through ``add_punctuation``.

	    Args:
	        url: Link to the YouTube video.

	    Returns:
	        The transcript text as returned by ``add_punctuation``.

	    Raises:
	        ValueError: If no video id can be found in ``url``.
	        Exception: Whatever the transcript API raises when neither lookup
	            succeeds.
	    """
	    video_id = _extract_video_id(url)

	    try:
	        transcript = YouTubeTranscriptApi.get_transcript(video_id)
	    except Exception:
	        # find_transcript is a method of the transcript list, not of the
	        # API class, so the list has to be requested for this video first.
	        # NOTE(review): this may repeat the default English lookup above;
	        # confirm whether other language codes should be tried here.
	        transcript = (
	            YouTubeTranscriptApi.list_transcripts(video_id)
	            .find_transcript(["en"])
	            .fetch()
	        )

	    script = "".join(
	        entry["text"].lower() + " "
	        for entry in transcript
	        if entry["text"] != '[Music]'
	    )

	    return add_punctuation(script)

	def add_punctuation(text: str):
	    """Split ``text`` into sentences and give each a capital and terminator.

	    The text is parsed with the spaCy ``en_core_web_sm`` pipeline.  For
	    every sentence spaCy detects, all trailing punctuation tokens are
	    removed, a ``?`` is appended when the last remaining token is tagged
	    ``VERB`` and a ``.`` otherwise, and the first character is
	    upper-cased.  Sentences that consist only of punctuation are dropped.

	    Args:
	        text: Unpunctuated (or loosely punctuated) text.

	    Returns:
	        The rewritten sentences joined by single spaces; an empty string
	        when no sentence is found.

	    Raises:
	        OSError: If the ``en_core_web_sm`` model is not installed.
	    """
	    # The model must be installed beforehand, e.g. with
	    # `python -m spacy download en_core_web_sm`.
	    nlp = spacy.load("en_core_web_sm")
	    doc = nlp(text)
	    punctuation = {".", ",", ";", ":", "?", "!"}

	    sentences = []
	    for span in doc.sents:
	        # Walk back over every trailing punctuation token, not just one,
	        # so input like "what?!" does not end up as "what?.".
	        end = len(span)
	        while end > 0 and span[end - 1].text in punctuation:
	            end -= 1
	        if end == 0:
	            # Nothing but punctuation: indexing the last word would fail.
	            continue
	        span = span[:end]

	        # Heuristic kept from the original: a sentence ending in a verb is
	        # treated as a question, everything else as a statement.
	        terminator = "?" if span[-1].pos_ == "VERB" else "."
	        sentence = span.text + terminator
	        sentences.append(sentence[0].upper() + sentence[1:])

	    return " ".join(sentences)


	def get_input_chunks(text: str, max_length: int = 500):
	    """Group the sentences of ``text`` into chunks below a token budget.

	    Bracketed numeric markers such as ``[3]`` are removed, the text is
	    split into sentences with NLTK's ``sent_tokenize``, and sentences that
	    are blank or have four or fewer tokens (per ``count_tokens``) are
	    discarded.  The remaining sentences are packed in order into chunks
	    whose combined token count stays below ``max_length``.  A single
	    sentence that reaches the budget on its own becomes its own chunk.

	    Args:
	        text: The text to split.
	        max_length: Exclusive upper bound on the token count of a chunk.

	    Returns:
	        A list of chunk strings, each made of whole sentences joined by a
	        single space.  The list is empty when no sentence qualifies.
	    """
	    text = re.sub(r'\[\d+\]', '', text)

	    try:
	        sentences = sent_tokenize(text)
	    except LookupError:
	        # NLTK raises LookupError when the tokenizer data is missing;
	        # download it once and retry.
	        nltk.download('punkt')
	        sentences = sent_tokenize(text)

	    sentences = [
	        sentence
	        for sentence in sentences
	        if sentence.strip() and count_tokens(sentence) > 4
	    ]

	    input_chunks = []
	    current = []
	    tokens = 0

	    for sentence in sentences:
	        sentence_tokens = count_tokens(sentence)
	        # Flush only when something has been collected, so an oversized
	        # first sentence does not produce an empty leading chunk.
	        if current and tokens + sentence_tokens >= max_length:
	            input_chunks.append(" ".join(current))
	            current = []
	            tokens = 0
	        current.append(sentence)
	        tokens += sentence_tokens

	    if current:
	        # Joining with a space keeps adjacent sentences from running
	        # together ("One.Two.").
	        input_chunks.append(" ".join(current))

	    return input_chunks