File size: 967 Bytes
55af729
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import requests
from bs4 import BeautifulSoup
import regex  as  re
import streamlit as st

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so without this an unresponsive host would block forever.
_REQUEST_TIMEOUT_SECONDS = 10


@st.cache_data(ttl=7200)
def scrape_aljaz(my_url: str) -> str:
    """Return the paragraph text of the article page at *my_url*.

    The page is downloaded and parsed with BeautifulSoup's ``html.parser``.
    The text of every ``<p>`` inside the ``<div>`` whose class attribute is
    ``"wysiwyg wysiwyg--all-content css-ibbk12"`` is stripped and joined with
    newlines; a newline followed by a run of whitespace is then collapsed
    into a single newline so no blank lines remain between paragraphs.

    The result (including the fallback message below) is memoized by
    ``st.cache_data`` with ``ttl=7200``.

    Args:
        my_url: URL of the page to scrape.

    Returns:
        The cleaned article text, or the message
        ``'This is not a valid article, please choose another.'`` when the
        expected article ``<div>`` is not present in the page.

    Raises:
        requests.RequestException: propagated from ``requests.get`` (for
            example on a connection failure or when the timeout elapses).
    """
    response = requests.get(my_url, timeout=_REQUEST_TIMEOUT_SECONDS)
    page_soup = BeautifulSoup(response.content, "html.parser")

    article = page_soup.find("div", {"class": "wysiwyg wysiwyg--all-content css-ibbk12"})
    if article is None:
        # find() returns None when the container is absent, i.e. the page
        # does not have the layout this scraper expects.
        return 'This is not a valid article, please choose another.'

    # One stripped paragraph per line, with no leading newline.
    full_article = "\n".join(
        paragraph.text.strip() for paragraph in article.find_all("p")
    )

    # Remove blank lines left by empty paragraphs or by newlines embedded
    # inside a paragraph's own text.
    return re.sub(r'\n[\t\n\s]+\n*', r"\n", full_article)