Spaces:
Runtime error
Runtime error
File size: 967 Bytes
55af729 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import requests
from bs4 import BeautifulSoup
import regex as re
import streamlit as st
@st.cache_data(ttl=7200)
def scrape_aljaz(my_url, timeout=10):
    """Download a web page and return the text of its article body.

    The article body is looked up as the ``<div>`` whose ``class`` attribute
    is exactly ``"wysiwyg wysiwyg--all-content css-ibbk12"``; the stripped
    text of every ``<p>`` inside it is joined with newlines.

    Results are memoized by Streamlit's ``st.cache_data`` with ``ttl=7200``.

    Args:
        my_url: URL of the page to fetch.
        timeout: Value passed to ``requests.get(..., timeout=...)`` so a
            stalled server cannot block the app indefinitely.

    Returns:
        The article text as a single string, or the message
        ``'This is not a valid article, please choose another.'`` when the
        expected article container is not present in the page.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (connection failures, and timeouts now that one is set).
    """
    response = requests.get(my_url, timeout=timeout)
    page_soup = BeautifulSoup(response.content, "html.parser")

    # NOTE(review): the trailing "css-ibbk12" token looks like a generated
    # class name and may change when the site is redeployed - confirm.
    article = page_soup.find(
        "div", {"class": "wysiwyg wysiwyg--all-content css-ibbk12"}
    )
    if article is None:
        # find() returns None when no matching container exists.
        return 'This is not a valid article, please choose another.'

    # One paragraph per line, with no newline before the first paragraph.
    full_article = "\n".join(
        paragraph.text.strip() for paragraph in article.find_all("p")
    )

    # Collapse a newline followed by a run of whitespace into a single
    # newline, which removes blank lines between paragraphs.
    return re.sub(r'\n[\t\n\s]+\n*', r"\n", full_article)
|