7testi7

Runtime error

App Files Files Community

trhacknon

ismot committed on Apr 29, 2023

Commit

0b3a3af

0 Parent(s):

Duplicate from ismot/7testi7

Browse files

Co-authored-by: Ismo Talka <ismot@users.noreply.huggingface.co>

Files changed (4) hide show

.gitattributes +27 -0
README.md +14 -0
app.py +96 -0
requirements.txt +3 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,27 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Article Scraper
+emoji: 🦀
+colorFrom: gray
+colorTo: yellow
+sdk: streamlit
+sdk_version: 1.10.0
+app_file: app.py
+pinned: false
+license: mit
+duplicated_from: ismot/7testi7
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,96 @@

+import os
+import streamlit as st
+import requests
+from streamlit_lottie import st_lottie
+def main() -> None:
+    # ----- Loading Assets ----
+    def load_lottieurl(lottie_url:str):
+        r = requests.get(url=lottie_url)
+        return r.json() if r.status_code == 200 else None
+    def fetch(url):
+        try:
+            result = requests.post(url=os.environ.get('scraper-api-endpoint'), json={'url': url})
+            return result.json()
+        except Exception:
+            return {}
+    st.set_page_config(page_title="Article Scraper - Rahul Portfolio Project", page_icon=":spider:", layout="wide")
+    lottie_animation = load_lottieurl(lottie_url="https://assets3.lottiefiles.com/private_files/lf30_UaWyEa.json")
+    # ----- Introduction --------
+    with st.container():
+        st.subheader("Article Scraper")
+        st.title("A Digital News / Article Information Extraction Application")
+        st.write("A portfolio project developed to showcase my ability in developing Information Extraction Services")
+        st.write("This service can be utilised in the data collection / curation process of data science workflow")
+        st.write("[My Website >](https://www.rahulnenavath.co.in/)")
+        st.subheader(f'About Article Scraper API:')
+        st.write(
+            """
+                - Article scraper API is deployed on AWS Lambda using AWS ECR container deployment
+                - CI/CD workflow is implemented via GitHub Actions to push the latest docker build to AWS ECR
+                - Check out the API codebase on [my GitHub >](https://github.com/RahulNenavath/Article-Scraper)
+                API Tech Stack: Python, Beautifulsoup, AWS Lambda, AWS ECR, Docker, GitHub Actions (CI/CD)
+            """
+        )
+    with st.container():
+        st.write("---")
+        left_col, right_col = st.columns(2)
+        with left_col:
+            st.header("How it works?")
+            st.write("##")
+            st.write('**Input**: Article URL')
+            st.write('**Output**: Extracted Article Information')
+            st.write(
+                """
+                **Working**:
+                - Download the HTML content from the given Article URL
+                - Makes use of BeautifulSoup and extracts content from different HTML tags and ClassNames
+                - Arrange Information appropriately
+                - Regex based text cleaning to remove characters like additional spaces, unicodes, tabs, and newline characters
+                """
+            )
+            st.warning(f'Note: Web scraping is highly dependent on the Article HTML structure. Hence one might have to further clean the scraped content')
+        with right_col:
+            st_lottie(lottie_animation, height=500)
+    with st.form("my_form"):
+        article_url = st.text_input("Article URL", value="", key="article_url")
+        submitted = st.form_submit_button("Submit")
+        if submitted:
+            with st.spinner('Scraping Information ...'):
+                data = fetch(url=article_url)
+            if data:
+                st.success("Request is Successful")
+                content = data.get("scraped_content")
+                st.write("---")
+                st.subheader(f'Extracted Article Information')
+                st.write(f"**Article Title:** {content.get('article_title')}")
+                st.write(f"**Author:** {content.get('author')}")
+                st.write(f"**Published Date:** {content.get('publish_date')}")
+                st.write(f"**Description:** {content.get('description')}")
+                st.write(f"**Content:** {content.get('article_content')}")
+                st.write(f"**Article URL:** {content.get('article_url')}")
+                st.write(f"**Canonical URL:** {content.get('canonical_url')}")
+                st.write(f"**Publisher Name:** {content.get('publisher_name')}")
+                st.write(f"**Article Image:** {content.get('image')}")
+                st.write(f"**Article Keywords:** {content.get('keywords')}")
+                st.write(f"**Video URL:** {content.get('video_url')}")
+                st.write(f"**Audio URL:** {content.get('audio_url')}")
+            else:
+                st.error("Request Failed")
+# Script entry point: build the page only when this file is executed directly.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+streamlit-lottie==0.0.3
+requests
+streamlit