Spaces:
Runtime error
Runtime error
| import requests | |
| from bs4 import BeautifulSoup | |
| from nltk.tokenize import sent_tokenize | |
| import nltk | |
| import re | |
| import streamlit as st | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| import spacy | |
def fetch_article_text(url: str, timeout: float = 30.0):
    """Download a web page and return the text of its headings and paragraphs.

    The text of every ``<h1>`` and ``<p>`` element is joined with single
    spaces, and bracketed numeric markers such as ``[12]`` are removed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout an unresponsive server would block the caller forever.

    Returns:
        The extracted article text as a single string.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    pieces = [node.text for node in soup.find_all(["h1", "p"])]
    article = " ".join(pieces)
    # Drop bracketed numeric markers like "[3]".
    return re.sub(r'\[\d+\]', '', article)
def count_tokens(text: str):
    """Return the number of whitespace-separated tokens in ``text``.

    Splitting on arbitrary whitespace (rather than on a single space) means
    that an empty string counts as zero tokens, and that repeated, leading or
    trailing spaces, newlines and tabs do not produce phantom empty tokens.

    Args:
        text: The string to measure.

    Returns:
        The token count as an ``int``.
    """
    return len(text.split())
def _extract_youtube_video_id(url: str):
    """Pull the video id out of the common YouTube URL shapes."""
    from urllib.parse import parse_qs, urlparse

    cleaned = url.strip()
    # urlparse only recognises the host when a scheme is present.
    parsed = urlparse(cleaned if "://" in cleaned else "https://" + cleaned)

    # https://www.youtube.com/watch?v=<id>&t=10s
    ids = parse_qs(parsed.query).get("v")
    if ids:
        return ids[0]

    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]
    # https://youtu.be/<id>
    if segments and (host == "youtu.be" or host.endswith(".youtu.be")):
        return segments[0]
    # https://www.youtube.com/embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    # Anything else keeps the historical behaviour: the text after the first
    # "=" (IndexError when the string contains no "=").
    return url.split("=")[1]


def get_text_from_youtube_url(url: str):
    """Fetch a YouTube transcript and return it as punctuated text.

    The transcript snippets are lower-cased, ``[Music]`` markers are dropped,
    and the joined text is passed through ``add_punctuation``.

    Args:
        url: A YouTube video URL (``watch?v=``, ``youtu.be/``, ``/embed/``,
            ``/shorts/`` ... forms are recognised).

    Returns:
        The transcript text as returned by ``add_punctuation``.

    Raises:
        IndexError: If no video id can be found in ``url``.
        Exception: Whatever the transcript library raises when no transcript
            can be retrieved.
    """
    video_id = _extract_youtube_video_id(url)
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception:
        # find_transcript lives on the transcript list, not on the API class.
        # Retry through it and also accept regional English variants; if this
        # fails too, the library's own error propagates to the caller.
        transcript = (
            YouTubeTranscriptApi.list_transcripts(video_id)
            .find_transcript(["en", "en-US", "en-GB"])
            .fetch()
        )

    script = "".join(
        snippet["text"].lower() + " "
        for snippet in transcript
        if snippet["text"] != '[Music]'
    )
    return add_punctuation(script)
# Loaded spaCy pipelines, keyed by model name, so each is read from disk once.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name: str = "en_core_web_sm"):
    """Load the named spaCy pipeline on first use and reuse it afterwards."""
    if name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[name] = spacy.load(name)
    return _SPACY_PIPELINES[name]


def add_punctuation(text: str):
    """Add sentence-ending punctuation and capitalisation to raw text.

    The text is split into sentences with the ``en_core_web_sm`` spaCy
    pipeline (which must already be installed). For each sentence any existing
    trailing punctuation token is removed, then a terminator is chosen with a
    simple heuristic: ``?`` when the last word is tagged as a verb, ``.``
    otherwise. The first character of each sentence is upper-cased.

    Args:
        text: Unpunctuated (or loosely punctuated) text.

    Returns:
        The rewritten sentences joined with single spaces. Empty input yields
        an empty string.
    """
    nlp = _get_spacy_pipeline()
    doc = nlp(text)
    punctuation = {".", ",", ";", ":", "?", "!"}

    sentences = []
    for sentence in doc.sents:
        if sentence[-1].text in punctuation:
            if len(sentence) == 1:
                # Nothing but a punctuation mark: there is no last word to
                # inspect, so skip it instead of indexing an empty span.
                continue
            sentence = sentence[:-1]

        terminator = "?" if sentence[-1].pos_ == "VERB" else "."
        finished = sentence.text + terminator
        sentences.append(finished[0].upper() + finished[1:])

    return " ".join(sentences)
def get_input_chunks(text: str, max_length: int = 500):
    """Split text into chunks of whole sentences under a token budget.

    Bracketed numeric markers such as ``[3]`` are removed, the text is split
    into sentences with NLTK, and sentences that are blank or have four or
    fewer tokens (per ``count_tokens``) are discarded. The remaining sentences
    are packed greedily, in order, into chunks whose token total stays below
    ``max_length``. A single sentence that on its own reaches ``max_length``
    becomes a chunk by itself.

    Args:
        text: The text to split.
        max_length: Token budget per chunk.

    Returns:
        A list of chunk strings; sentences inside a chunk are separated by a
        single space. The list is empty when no sentence survives filtering.
    """
    text = re.sub(r'\[\d+\]', '', text)
    try:
        sentences = sent_tokenize(text)
    except LookupError:
        # The tokenizer data is missing: download it once and retry.
        nltk.download('punkt')
        # Newer NLTK releases look for 'punkt_tab' instead of 'punkt';
        # requesting both keeps the retry working on either.
        nltk.download('punkt_tab')
        sentences = sent_tokenize(text)

    input_chunks = []
    current = []
    current_tokens = 0
    for sentence in sentences:
        sentence_tokens = count_tokens(sentence)
        if not sentence.strip() or sentence_tokens <= 4:
            continue
        # Flush only a non-empty chunk, so an over-long first sentence does
        # not produce an empty string in the output.
        if current and current_tokens + sentence_tokens >= max_length:
            input_chunks.append(" ".join(current))
            current = []
            current_tokens = 0
        current.append(sentence)
        current_tokens += sentence_tokens

    if current:
        input_chunks.append(" ".join(current))
    return input_chunks