Spaces:

rahulNenavath305
/

article-scraper

Runtime error

App Files Files Community

article-scraper / app.py

rahulNenavath305

made the heading bolder

d3490a4 almost 4 years ago

raw

history blame contribute delete

4.27 kB

	import os
	import streamlit as st
	import requests
	from streamlit_lottie import st_lottie

	def main() -> None:
	    """Render the Article Scraper Streamlit page and handle URL submissions.

	    The page shows an introduction, a "how it works" section next to a
	    Lottie animation, and a form that takes an article URL. On submit the
	    URL is POSTed to the endpoint named by the ``scraper-api-endpoint``
	    environment variable and the returned ``scraped_content`` fields are
	    written to the page.
	    """

	    # ----- Loading Assets ----

	    def load_lottieurl(lottie_url: str):
	        """Return the parsed Lottie JSON at *lottie_url*, or None on any failure."""
	        try:
	            # A timeout keeps a slow asset host from hanging the whole page.
	            response = requests.get(url=lottie_url, timeout=10)
	        except requests.RequestException:
	            return None
	        if response.status_code != 200:
	            return None
	        try:
	            return response.json()
	        except ValueError:
	            # Body was not valid JSON; the animation is optional, so degrade quietly.
	            return None

	    def fetch(url):
	        """POST *url* to the scraper API and return its JSON object.

	        Returns an empty dict when the endpoint is not configured, the
	        request fails, or the response is not a JSON object, so the caller
	        can treat any falsy/incomplete result as a failed request.
	        """
	        endpoint = os.environ.get('scraper-api-endpoint')
	        if not endpoint:
	            return {}
	        try:
	            result = requests.post(url=endpoint, json={'url': url}, timeout=60)
	            payload = result.json()
	        except (requests.RequestException, ValueError):
	            return {}
	        return payload if isinstance(payload, dict) else {}

	    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")

	    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")

	    # ----- Introduction --------
	    with st.container():
	        st.subheader("Article Scraper")
	        st.title("A Digital News / Article Information Extraction Application")
	        st.write("A portfolio project developed to showcase my ability in developing Information Extraction Services")
	        st.write("This service can be utilised in the data collection / curation process of data science workflow")
	        st.write("[My Website >](https://www.rahulnenavath.co.in/)")
	        st.subheader('About Article Scraper API:')
	        st.write(
	            """
	- Article scraper API is deployed on AWS Lambda using AWS ECR container deployment
	- CI/CD workflow is implemented via GitHub Actions to push the latest docker build to AWS ECR
	- Check out the API codebase on [my GitHub >](https://github.com/RahulNenavath/Article-Scraper)
	API Tech Stack: Python, Beautifulsoup, AWS Lambda, AWS ECR, Docker, GitHub Actions (CI/CD)
	"""
	        )

	    with st.container():
	        st.write("---")
	        left_col, right_col = st.columns(2)

	        with left_col:
	            st.header("How it works?")
	            st.write("##")
	            st.write('Input: Article URL')
	            st.write('Output: Extracted Article Information')
	            st.write(
	                """
	Working:
	- Download the HTML content from the given Article URL
	- Makes use of BeautifulSoup and extracts content from different HTML tags and ClassNames
	- Arrange Information appropriately
	- Regex based text cleaning to remove characters like additional spaces, unicodes, tabs, and newline characters
	"""
	            )
	            st.warning('Note: Web scraping is highly dependent on the Article HTML structure. Hence one might have to further clean the scraped content')

	        with right_col:
	            # The animation is decorative; skip it if the download failed
	            # instead of handing None to the component.
	            if lottie_animation is not None:
	                st_lottie(lottie_animation, height=500)

	    with st.form("my_form"):
	        article_url = st.text_input("Article URL", value="", key="article_url")

	        submitted = st.form_submit_button("Submit")

	    if submitted:
	        with st.spinner('Scraping Information ...'):
	            data = fetch(url=article_url)

	        # Only report success when the payload actually carries the scraped
	        # fields; an error body without them would otherwise crash below.
	        content = data.get("scraped_content")
	        if isinstance(content, dict):
	            st.success("Request is Successful")
	            st.write("---")
	            st.subheader('Extracted Article Information')
	            # (label shown on the page, key in the API response)
	            fields = (
	                ("Article Title", "article_title"),
	                ("Author", "author"),
	                ("Published Date", "publish_date"),
	                ("Description", "description"),
	                ("Content", "article_content"),
	                ("Article URL", "article_url"),
	                ("Canonical URL", "canonical_url"),
	                ("Publisher Name", "publisher_name"),
	                ("Article Image", "image"),
	                ("Article Keywords", "keywords"),
	                ("Video URL", "video_url"),
	                ("Audio URL", "audio_url"),
	            )
	            for label, key in fields:
	                st.write(f"{label}: {content.get(key)}")
	        else:
	            st.error("Request Failed")


	# Run the Streamlit page only when this file is executed as the entry script.
	if __name__ == "__main__":
	    main()