import os
import streamlit as st
import requests
from streamlit_lottie import st_lottie
|
|
def load_lottieurl(lottie_url: str):
    # Fetch a Lottie animation JSON from its URL; return None on a non-200 response.
    r = requests.get(url=lottie_url)
    return r.json() if r.status_code == 200 else None

|
def fetch(url):
    # POST the article URL to the scraper API; the endpoint URL is read from
    # the environment rather than hard-coded into the app.
    try:
        result = requests.post(url=os.environ.get('scraper-api-endpoint'), json={'url': url})
        return result.json()
    except Exception:
        return {}
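
# Illustrative only: an assumed shape for the scraper API response, inferred
# from the keys read further below; the deployed Lambda may return more fields.
#
#   fetch("https://example.com/some-article")
#   => {"scraped_content": {"article_title": "...", "author": "...",
#                           "publish_date": "...", "description": "...",
#                           "article_content": "...", "article_url": "...",
#                           "canonical_url": "...", "publisher_name": "...",
#                           "image": "...", "keywords": "...",
#                           "video_url": "...", "audio_url": "..."}}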
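
# A minimal sketch of what the backend scraper presumably does, mirroring the
# "Working" steps listed in the UI below: download the HTML, extract fields
# with BeautifulSoup, arrange them, and regex-clean the text. This is an
# assumption for illustration (tag and meta-property names vary by site);
# the actual implementation lives in the GitHub repo linked in the app.
# The app itself never calls this helper.
def _scrape_article_sketch(article_url: str) -> dict:
    import re
    from bs4 import BeautifulSoup  # assumed to be installed; the app itself does not need it

    html = requests.get(article_url).text      # 1. download the article HTML
    soup = BeautifulSoup(html, "html.parser")  # 2. parse for tag/class-based extraction

    def clean(text):
        # 4. regex-based cleaning: collapse extra spaces, tabs, and newlines
        return re.sub(r"\s+", " ", text or "").strip()

    title = soup.find("meta", property="og:title")
    description = soup.find("meta", property="og:description")
    return {                                   # 3. arrange the information
        "article_title": clean(title.get("content")) if title else None,
        "description": clean(description.get("content")) if description else None,
        "article_content": clean(soup.get_text()),
    }
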
def main() -> None:
    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")

    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")

    with st.container():
        st.subheader("Article Scraper")
        st.title("A Digital News / Article Information Extraction Application")
        st.write("A portfolio project developed to showcase my ability to build information extraction services.")
        st.write("This service can be used in the data collection and curation stages of a data science workflow.")
        st.write("[My Website >](https://www.rahulnenavath.co.in/)")
        # An illustrative Lambda handler sketch appears near the end of this file.
        st.subheader('About the Article Scraper API:')
        st.write(
            """
            - The Article Scraper API is deployed on AWS Lambda using an AWS ECR container deployment
            - A CI/CD workflow implemented with GitHub Actions pushes the latest Docker build to AWS ECR
            - Check out the API codebase on [my GitHub >](https://github.com/RahulNenavath/Article-Scraper)

            API Tech Stack: Python, BeautifulSoup, AWS Lambda, AWS ECR, Docker, GitHub Actions (CI/CD)
            """
        )
|
    with st.container():
        st.write("---")
        left_col, right_col = st.columns(2)
|
        with left_col:
            st.header("How it works")
            st.write("##")
            st.write('**Input**: Article URL')
            st.write('**Output**: Extracted article information')
            # These steps are the ones illustrated by _scrape_article_sketch above.
            st.write(
                """
                **Working**:
                - Downloads the HTML content from the given article URL
                - Uses BeautifulSoup to extract content from different HTML tags and class names
                - Arranges the extracted information appropriately
                - Applies regex-based text cleaning to remove artifacts such as extra spaces, unicode characters, tabs, and newline characters
                """
            )
            st.warning('Note: Web scraping is highly dependent on the article HTML structure, so the scraped content may need further cleaning.')

        with right_col:
            if lottie_animation:  # the Lottie fetch may have failed and returned None
                st_lottie(lottie_animation, height=500)
|
    with st.form("my_form"):
        article_url = st.text_input("Article URL", value="", key="article_url")

        submitted = st.form_submit_button("Submit")
|
        if submitted:
            with st.spinner('Scraping Information ...'):
                data = fetch(url=article_url)

            if data:
                st.success("Request is Successful")
                # Fall back to an empty dict so the .get() calls below do not
                # fail if "scraped_content" is missing from the response.
                content = data.get("scraped_content") or {}
                st.write("---")
                st.subheader('Extracted Article Information')
                st.write(f"**Article Title:** {content.get('article_title')}")
                st.write(f"**Author:** {content.get('author')}")
                st.write(f"**Published Date:** {content.get('publish_date')}")
                st.write(f"**Description:** {content.get('description')}")
                st.write(f"**Content:** {content.get('article_content')}")
                st.write(f"**Article URL:** {content.get('article_url')}")
                st.write(f"**Canonical URL:** {content.get('canonical_url')}")
                st.write(f"**Publisher Name:** {content.get('publisher_name')}")
                st.write(f"**Article Image:** {content.get('image')}")
                st.write(f"**Article Keywords:** {content.get('keywords')}")
                st.write(f"**Video URL:** {content.get('video_url')}")
                st.write(f"**Audio URL:** {content.get('audio_url')}")
            else:
                st.error("Request Failed")
|
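# A minimal sketch of the AWS Lambda entry point such an API might expose,
# assuming an API Gateway-style proxy event with a JSON string body; the real
# handler in the linked repo will differ. It reuses _scrape_article_sketch above.
def _lambda_handler_sketch(event, context):
    import json
    body = json.loads(event.get("body") or "{}")
    return {
        "statusCode": 200,
        "body": json.dumps({"scraped_content": _scrape_article_sketch(body.get("url"))}),
    }
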
if __name__ == "__main__":
    main()
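
# Usage sketch (assumptions: this script is saved as app.py and the scraper API
# is reachable). The hyphenated name matches the os.environ.get() call above,
# but a plain `export` rejects hyphens in most shells; `env` can set it:
#
#   env 'scraper-api-endpoint=https://<api-url>' streamlit run app.py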