Spaces:
Runtime error
Runtime error
initial_commit
Browse files- README.md +36 -12
- app.py +17 -0
- multiapp.py +19 -0
- packages.txt +1 -0
- paraphraser.py +28 -0
- requirements.txt +9 -0
- scrap.py +24 -0
- summary.py +24 -0
README.md
CHANGED
|
@@ -1,12 +1,36 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# python-dev-task-summarization
|
| 2 |
+
|
| 3 |
+
The task has been completed using two methods:
|
| 4 |
+
- **using traditional Python libraries (like NLTK,Sumy)**
|
| 5 |
+
- **using pre-trained transformers model**
|
| 6 |
+
|
| 7 |
+
# Method-1
|
| 8 |
+
## using traditional Python libraries
|
| 9 |
+
#### Web Scraping Tools:
|
| 10 |
+
- Selenium
|
| 11 |
+
#### Paraphrasing Tools:
|
| 12 |
+
- used [nlpaug](https://github.com/makcedward/nlpaug) library
|
| 13 |
+
#### Summarization Tools:
|
| 14 |
+
- used [sumy](https://miso-belica.github.io/sumy/) library
|
| 15 |
+
#### System Requirements:
|
| 16 |
+
- you will find it in the _requirements.txt_ file
|
| 17 |
+
|
| 18 |
+
## How to test or run this?
|
| 19 |
+
- just open this link and follow the instructions: _**https://shamim237-python-dev-task-app-3n18pu.streamlit.app/**_
|
| 20 |
+
|
| 21 |
+
# Method-2
|
| 22 |
+
## Using pre-trained transformers model
|
| 23 |
+
#### Web Scraping Tools:
|
| 24 |
+
- ScraperAPI
|
| 25 |
+
- BeautifulSoup
|
| 26 |
+
#### Paraphrasing Tools:
|
| 27 |
+
- used **"ramsrigouthamg/t5-large-paraphraser-diverse-high-quality"** pre-trained model from HuggingFace
|
| 28 |
+
#### Summarization Tools:
|
| 29 |
+
- used **"google/pegasus-cnn_dailymail"** pre-trained model from HuggingFace
|
| 30 |
+
#### System Requirements:
|
| 31 |
+
- you will find it in the _Python_Dev_Task.ipynb_ notebook or in the below link.
|
| 32 |
+
|
| 33 |
+
## How to test or run this?
|
| 34 |
+
- Just open the **"Python_Dev_Task.ipynb"** file in Colab _or_ open this link: **_https://colab.research.google.com/drive/1wwaj0TobsnzQL5jMVsYrF5z6rc1944tE?usp=sharing_**
|
| 35 |
+
- Run all the cells
|
| 36 |
+
- The summarization output will show up in the last cell of the notebook.
|
app.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from multiapp import MultiApp
|
| 3 |
+
from apps import paraphraseApp, summarizerApp, scraperrApp
|
| 4 |
+
|
| 5 |
+
app = MultiApp()
|
| 6 |
+
|
| 7 |
+
st.title("Python Dev Task @SkyRanko")
|
| 8 |
+
st.write("==================_Completed by_ **Shamim Mahbub**==================")
|
| 9 |
+
st.markdown("This app provides three services - :red[Scraping], :orange[Paraphrasing] and :blue[Summarizing]")
|
| 10 |
+
st.caption("Note: _After scraping data from Amazon, the data has been paraphrased using a model and then Summarization has been performed on the paraphrased data._")
|
| 11 |
+
|
| 12 |
+
# Add all your application here
|
| 13 |
+
app.add_app("Scraper", scraperrApp.app)
|
| 14 |
+
app.add_app("Paraphraser", paraphraseApp.app)
|
| 15 |
+
app.add_app("Summarizer", summarizerApp.app)
|
| 16 |
+
# The main app
|
| 17 |
+
app.run()
|
multiapp.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
class MultiApp:
|
| 4 |
+
def __init__(self):
|
| 5 |
+
self.apps = []
|
| 6 |
+
|
| 7 |
+
def add_app(self, title, func):
|
| 8 |
+
self.apps.append({
|
| 9 |
+
"title": title,
|
| 10 |
+
"function": func
|
| 11 |
+
})
|
| 12 |
+
|
| 13 |
+
def run(self):
|
| 14 |
+
app = st.selectbox(
|
| 15 |
+
'Choose one',
|
| 16 |
+
self.apps,
|
| 17 |
+
format_func=lambda app: app['title'])
|
| 18 |
+
|
| 19 |
+
app['function']()
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
firefox-esr
|
paraphraser.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import nlpaug.augmenter.word as naw
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 7 |
+
|
| 8 |
+
@st.cache(allow_output_mutation=True, ttl=48*3600)
|
| 9 |
+
def load_model():
|
| 10 |
+
aug = naw.ContextualWordEmbsAug(
|
| 11 |
+
model_path='bert-base-uncased', action="insert")
|
| 12 |
+
return aug
|
| 13 |
+
|
| 14 |
+
aug = load_model()
|
| 15 |
+
|
| 16 |
+
def parphrase(passage):
|
| 17 |
+
sen = []
|
| 18 |
+
for i in passage:
|
| 19 |
+
res = len(re.findall(r'\w+', i))
|
| 20 |
+
if res == 2:
|
| 21 |
+
pass
|
| 22 |
+
else:
|
| 23 |
+
res = i.replace('"', "'").replace("\n", "")
|
| 24 |
+
sen.append(res)
|
| 25 |
+
|
| 26 |
+
pas = " ".join(sen)
|
| 27 |
+
para_text = aug.augment(pas)
|
| 28 |
+
return para_text
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
nlpaug==1.1.11
|
| 2 |
+
nltk
|
| 3 |
+
selenium==4.8.0
|
| 4 |
+
sentencepiece==0.1.97
|
| 5 |
+
streamlit==1.17.0
|
| 6 |
+
sumy==0.11.0
|
| 7 |
+
torch==1.13.1
|
| 8 |
+
transformers==4.25.1
|
| 9 |
+
webdriver-manager
|
scrap.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
from selenium import webdriver
|
| 3 |
+
from selenium.webdriver import Chrome
|
| 4 |
+
from selenium.webdriver.common.by import By
|
| 5 |
+
from selenium.webdriver.firefox.options import Options
|
| 6 |
+
from selenium.webdriver.firefox.service import Service
|
| 7 |
+
from webdriver_manager.firefox import GeckoDriverManager
|
| 8 |
+
from selenium.webdriver.common.by import By
|
| 9 |
+
|
| 10 |
+
def extract(link):
|
| 11 |
+
url = link
|
| 12 |
+
firefoxOptions = Options()
|
| 13 |
+
firefoxOptions.add_argument("--headless")
|
| 14 |
+
service = Service(GeckoDriverManager().install())
|
| 15 |
+
driver = webdriver.Firefox(
|
| 16 |
+
options=firefoxOptions,
|
| 17 |
+
service=service,
|
| 18 |
+
)
|
| 19 |
+
driver.get(url)
|
| 20 |
+
data = driver.find_element(By.ID,"aplus_feature_div")
|
| 21 |
+
data = data.text
|
| 22 |
+
data = data.split("\n")
|
| 23 |
+
time.sleep(2)
|
| 24 |
+
return data
|
summary.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from sumy.nlp.tokenizers import Tokenizer
|
| 4 |
+
from sumy.parsers.plaintext import PlaintextParser
|
| 5 |
+
from sumy.summarizers.lex_rank import LexRankSummarizer
|
| 6 |
+
|
| 7 |
+
@st.cache(allow_output_mutation=True, ttl=48*3600)
|
| 8 |
+
def dwnld_lib():
|
| 9 |
+
nltk.download('punkt')
|
| 10 |
+
|
| 11 |
+
dwnld_lib()
|
| 12 |
+
|
| 13 |
+
def text_summary(text):
|
| 14 |
+
para = " ".join(text)
|
| 15 |
+
# Create a plaintext parser and tokenizer
|
| 16 |
+
parser = PlaintextParser.from_string(para, Tokenizer("english"))
|
| 17 |
+
# Create a LexRank summarizer
|
| 18 |
+
summarizer = LexRankSummarizer()
|
| 19 |
+
# Summarize the text and print the results
|
| 20 |
+
summ = []
|
| 21 |
+
for sentence in summarizer(parser.document, 4):
|
| 22 |
+
summy = str(sentence).capitalize()
|
| 23 |
+
summ.append(summy)
|
| 24 |
+
return summ
|