diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -1,1504 +1,1504 @@ -import streamlit as st ### importing liberaries -from streamlit_extras.colored_header import colored_header -from streamlit_option_menu import option_menu -import streamlit.components.v1 as component -from streamlit_lottie import st_lottie, st_lottie_spinner -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.model_selection import train_test_split -from sklearn.naive_bayes import MultinomialNB -from sklearn.pipeline import make_pipeline -from transformers import pipeline -from transformers import AutoTokenizer , AutoModelForSeq2SeqLM -from newspaper import Article -import nltk -import nltk.downloader -nltk.download('punkt_tab') -from nltk.tokenize import word_tokenize -from cleantext import clean -from PyPDF2 import PdfReader -import pdfminer -from pdfminer.high_level import extract_text -from pdfminer.high_level import extract_pages -from pdfminer.layout import LTTextContainer, LTChar, LTTextLine -import requests -import json -import numpy as np -import pandas as pd -import random -import base64 -import lxml -import lxml_html_clean -import re -import os - - -###### main app functions - -### insert external css -def insert_css(css_file:str): - with open(css_file) as f: - st.markdown(f"",unsafe_allow_html=True) - -### insert external html file -def insert_html(html_file): - with open(html_file) as f: - return f.read() - -### insert lottie animation json files -def insert_lottie_animation(animation_file:str): - with open(animation_file, "r") as f: - return json.load(f) - -### app tutorial video function -@st.dialog("App Tutorial") -def watch_tutorial(): - st.subheader("GenAi Summarizer🤖") - video_file = open("app_tutorial.mp4", "rb") - video_bytes = video_file.read() - st.text("") - st.video( - data=video_bytes,format="video/mp4", - loop=True,autoplay=True - ) - - -def download_text(text, filename): - """ - download article text - in document format - """ - #### Convert string to bytes - b64 = base64.b64encode(text.encode()).decode() - - href = f""" - - - - """ - - st.markdown(href, unsafe_allow_html=True) - if __name__=="__main__": - insert_css("cssfiles/download-article.css") - - -def copy_text(text): - html_code = f""" - - - - - - - - - - - - -
-
-

{text}

- - - - - - """ - - component.html(html_code,height=28) - - -### copy and download button -def Copy_download_button(article_text,article_format,article_file_name): - try: - ### column for copy and download article - Copy_btn_col,download_btn_col, blank_col_copy1, blank_col_copy2= st.columns([1,3,5,5],gap="small") - - with blank_col_copy1: - st.text("") - with blank_col_copy1: - st.text("") - - with Copy_btn_col: - copy_text(article_text) - - with download_btn_col: - download_text(text=article_format,filename=article_file_name) - except Exception as e: - st.warning("Something went wrong...",e,icon="⚠️") - - -### setting page layout -st.set_page_config( - page_title="GenAi Summarizer", - page_icon="🤗", - initial_sidebar_state="collapsed", - layout="wide" -) - - -#### app settings css -if __name__=="__main__": - insert_css("cssfiles/app.css") - - -### huging face modals -Hugingface_modals = { - "google-pegasus":"google/pegasus-xsum", - "facebook-bart":"facebook/bart-large-cnn", - "t5-base":"t5-base" -} - - -### summarization modal -def Hugingface_summarization_modal(summary_text,modal_name,maximum_length): - """ - it is an text summarization modal - it use hugingface modals for summarization task. - it generates summarized text output - """ - def summarization_modal_name(modal)->str: - if modal == "google-pegasus": - return "google/pegasus-xsum" - elif modal == "facebook-bart": - return "facebook/bart-large-cnn" - elif modal == "t5-base": - return "t5-base" - try: - use_modal = summarization_modal_name(modal_name) ### modal name - - auto_tokenizer = AutoTokenizer.from_pretrained(use_modal) ### using autokenizer for pretrained modal - auto_modal = AutoModelForSeq2SeqLM.from_pretrained(use_modal) - - ### creating pipeline - summarizer = pipeline("summarization",model=auto_modal,tokenizer=auto_tokenizer) - - summarizer_text = summary_text - - summary_generate = summarizer( ### summarizer - summarizer_text,max_length=maximum_length+20, - min_length=maximum_length, - do_sample=False - ) - - return summary_generate[0]['summary_text'] - - except Exception as e: - st.warning("Something went wrong...\n\n",e,icon="⚠️") - - - - -### displaying modals -@st.cache_data -def Modal_Level(modal_text): - if modal_text == "google-pegasus": - st.markdown( - f""" -
- - Maodal- - - google/pegasus-xsum -
- """,unsafe_allow_html=True - ) - - elif modal_text == "facebook-bart": - st.markdown( - f"""
- - Maodal- - - facebook/bart-large-cnn -
- """,unsafe_allow_html=True - ) - - elif modal_text == "t5-base": - st.markdown( - f"""
- - Maodal- - - t5-base -
- """,unsafe_allow_html=True - ) - if __name__=="__main__": - insert_css("cssfiles/modal.css") - - - -#### creating sidebar -app_sidebar = st.sidebar - -with app_sidebar: - st.text("") - st.subheader("GenAi Summarizer🤖") - st.write("Developer: **Nishant Maity**") - st.text("") - st.text("") - - ### creating menu bar - Main_menu = option_menu( - menu_title="", - options=["Article Summarizer","Text Summarizer","PDF Summarizer","App Info"], - icons=["chat-dots","card-heading","file-earmark-pdf","person-circle"], - default_index=0, - key="Menu Bar" - ) - st.text("") - - ### select modal for text and article summarizer - if Main_menu == "Article Summarizer" or Main_menu == "Text Summarizer": - - Summarizer_modal = st.selectbox( - label="Select Modal", - options=np.array(list(Hugingface_modals.keys())), - index=1, - key="Modals" - ) - -#### selecting number or paragraph for article summarizer -if Main_menu == "Article Summarizer": - with app_sidebar: - st.text("") - st.text("") - - Number_of_article_paragraph = st.slider( - label="Number of paragraph", - min_value=1,max_value=10, - step=1,value=2, - key="Number of paragraph" - ) - -with app_sidebar: - st.button( - label="Watch App Tutorial", - use_container_width=True, - on_click=watch_tutorial - ) - - -##### article summarizer functions - -##### naive bayes text classification function - -def is_url(text): - url_pattern = re.compile( - r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[^\s]+') - return bool(url_pattern.match(text)) - - -# Train a model for text vs URL classification -def train_model(): - """ - this function predict the given input - is a simple text or url,link - and generate output. - """ - #### dataset (normal text and URLs) - try: - data = [ - ('This is a normal sentence.', 'text'), - ('www.google.com', 'url'), - ('Check out this website', 'text'), - ('https://www.example.com', 'url'), - ('Machine learning is fun', 'text'), - ('http://openai.com', 'url'), - ('Python is a great language', 'text'), - ] - texts = [d[0] for d in data] - labels = [1 if d[1] == 'url' else 0 for d in data] ## 1 for url, 0 for text - - ##### modal training - X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42) - - model = make_pipeline(CountVectorizer(), MultinomialNB()) - - model.fit(X_train, y_train) #### Train the model - - model.score(X_train, y_train) - model.score(X_test, y_test) - - return model - - except Exception as e: - st.error("Error...\n\n",e,icon="⚠️") - - - -############################### article summarizer - - -if Main_menu == "Article Summarizer": - - blank_article1, article_column, blank_article2 = st.columns([2,8,2],gap="small") - - with blank_article1: ### blank space - pass - with blank_article2: ### blank space - pass - - #### main app column - with article_column: - - #### app title - st.text("") - App_Title = colored_header( - label="Web Article Summarizer 📑", - color_name="blue-green-70", - description="Search or paste url" - ) - - Text_input = st.text_input( - label="Search or paste url", - placeholder="machine learning, java url- https://www.example.com" - ) - - ### max slider value - def max_length_slider_value(max_length)->int: - if max_length == 1: - return 90 - elif max_length == 2: - return 150 - elif max_length == 3: - return 250 - elif max_length == 4: - return 380 - elif max_length == 5: - return 470 - elif max_length == 6: - return 600 - elif max_length == 7: - return 750 - elif max_length == 8: - return 900 - elif max_length == 9: - return 1200 - elif max_length == 10: - return 1360 - - @st.cache_data - def Default_max_length(default_value): - if default_value == 1: - random_value = np.random.randint(30,65,6) - return random.choice(random_value) - - elif default_value == 2: - random_value = np.random.randint(50,130,6) - return random.choice(random_value) - - elif default_value == 3: - random_value = np.random.randint(70,210,6) - return random.choice(random_value) - - elif default_value == 4: - random_value = np.random.randint(140,310,6) - return random.choice(random_value) - - elif default_value == 5: - random_value = np.random.randint(200,390,6) - return random.choice(random_value) - - elif default_value == 6: - random_value = np.random.randint(230,490,6) - return random.choice(random_value) - - elif default_value == 7: - random_value = np.random.randint(280,590,6) - return random.choice(random_value) - - elif default_value == 8: - random_value = np.random.randint(350,750,6) - return random.choice(random_value) - - elif default_value == 9: - random_value = np.random.randint(450,1050,6) - return random.choice(random_value) - - elif default_value == 10: - random_value = np.random.randint(560,1100,6) - return random.choice(random_value) - - - - - Button_column, Toggle_summary_btn, Modal_display = st.columns([1,1,3],gap="small") - - - # article_summarizer(max_length) - with Button_column: - ### generate article button - Generate_btn = st.button(label="Generate Article") - - with Toggle_summary_btn: - ### if on then it generates summary - summary_on = st.toggle( - label="Summarizer", - value=False, - key="Summarizer on off" - ) - - if summary_on: - st.toast(body="Summarizer Mode on",icon="📑") - else: - st.toast(body="Scraping Mode",icon="📰") - - with Modal_display: - - if summary_on: - Modal_Level(Summarizer_modal) - else: - pass - if summary_on: - max_length_article = st.slider( - label="max length", - min_value=10,max_value=max_length_slider_value(Number_of_article_paragraph), - key="max length",value=Default_max_length(Number_of_article_paragraph) - ) - - -################################################################################################ - - - ### article scraper function - def article_scraper(article_url): - """ - this function is used to scrap - web articles and it provide - text in the clean format - """ - try: - article = Article(article_url) ### article object - article.download() - article.parse() - nltk.download("punkt") - article.nlp() - - st.markdown("

Article

",unsafe_allow_html=True) - st.text("") - st.text("") - - st.markdown( ### article title - f""" -
{article.title}
- """,unsafe_allow_html=True - ) - - article_publishdate = article.publish_date ### article publish date - if article_publishdate == None: - pass - else: - st.text("published on - "+str(article_publishdate)) - - article_authors = article.authors #### article authors - if len(article_authors) == 0: - pass - else: - autho_name_print = ", ".join(map(str, article_authors)) - st.write(autho_name_print) - - - ### generating article summary - def get_top_paragraphs(text, num_paragraphs=Number_of_article_paragraph): - """ - this function gives - top 1 - 10 paragraph of the - scrap data - """ - paragraphs = text.split('\n\n') - - valid_paragraphs = [p.strip() for p in paragraphs if len(p.strip().split()) > 12] - top_paragraphs = valid_paragraphs[:num_paragraphs] - return '\n\n'.join(top_paragraphs) - - - article_summary = article.text - - def remove_bracketed_numbers(text)->str: - pattern = r'\[\d+\]' - cleaned_text = re.sub(pattern, '', text) - return cleaned_text - - - cleaned_article_text = remove_bracketed_numbers(get_top_paragraphs(article_summary)) - - if "clean_text" not in st.session_state: - st.session_state.clean_text = "" - - st.session_state.clean_text = cleaned_article_text - - def clean_output_text(text:str)->str: - """ - it gives clean text without emojies, - no ascii values english text - """ - clean_text = clean( - text=text,fix_unicode=True, - to_ascii=True,no_emoji=True, - lang="en",no_line_breaks=False, - keep_two_line_breaks=True - ) - return clean_text - ### Print the cleaned text - st.write(clean_output_text(st.session_state.clean_text)) - st.text("") - st.text("") - - - ### copy download button - Article_filename = f"{article.title}.doc" - - Article_text_format = f""" - \n\n\n -{str(article.title)} -published on - {str(article_publishdate)} -Authors - {", ".join(map(str, article_authors))} - \n\n\n -{str(cleaned_article_text)} - """ - - - if __name__=="__main__": - Copy_download_button( - article_text=clean_output_text(cleaned_article_text), - article_format=Article_text_format, - article_file_name=Article_filename - ) - - st.text("") - - if summary_on: - st.markdown("

Article Summary

",unsafe_allow_html=True) - - #### summarization modal - - with st.spinner("Generating Summary..."): - - - if __name__=="__main__": - summarized_article_text = Hugingface_summarization_modal( - summary_text=clean_output_text(cleaned_article_text), - modal_name=Summarizer_modal, - maximum_length=max_length_article - ) - #### clean ai generated paragraph - - - st.write(summarized_article_text) - st.text("") - st.text("") - - summary_format = f""" - -\n\n -{article.title} -\n\n\n -{summarized_article_text} -""" - #### copy or download summary button - if __name__=="__main__": - Copy_download_button( - article_text=summarized_article_text, - article_file_name=f"{article.title}-summary.doc", - article_format=summary_format - ) - - if summary_on: - - ### summarization details - summarization_details = { - "Summarization Details":["Modal Name","Text Length","Summary Length","Max Tokens"], - "Output":[ - f"{Summarizer_modal}", - f"Length - {len(cleaned_article_text.split())}", - f"Length - {len(summarized_article_text.split())}", - f"Tokens Used - {max_length_article}" - ] - } - - summarization_details_df = pd.DataFrame( - data=summarization_details, - index=["Hugingface Modal","No. words","No. Words","Max Length"] - ) - - st.text("") - st.text("") - st.text("") - st.dataframe(summarization_details_df,use_container_width=True) - - - - except Exception as err: - ### 404 error animation - - Error_404_col, page_not_found_col = st.columns(2) - - with Error_404_col: - - try: - Error_404 = insert_lottie_animation("lottie_animations/error-404.json") - st_lottie( - animation_source=Error_404, - speed=1, - reverse=False,loop=True, - quality="high", - height=315, - width=400, - key="404 error" - ) - except Exception as err: - st.warning("something went wrong...",err,icon="⚠️") - - with page_not_found_col: - - try: - page_not_found = insert_lottie_animation("lottie_animations/page-not-found.json") - st_lottie( - animation_source=page_not_found, - speed=1, - reverse=False,loop=True, - quality="high", - height=265, - width=400, - key="page not found" - ) - except Exception as err: - st.warning("something went wrong...",err,icon="⚠️") - - st.warning(f"Something went wrong...\n\n{err}",icon="⚠️") - - def article_summarizer(summary_length): - st.write(summary_length) - - - def check_url_exists(url): - try: - response = requests.head(url, allow_redirects=True) - if response.status_code < 400: - return True - else: - return False - except requests.exceptions.RequestException as e: - # Handle any exception (e.g., connection error, timeout) - return False - - - ########### link classified article - def link_classified(text): - """ - it use url or link to scrap articles - provide author name, publish date, summary of - article - """ - try: - url_text = text - article_url_link = f"{url_text}" ### url to scrap - if __name__=="__main__": - article_scraper(article_url_link) - st.text("") - st.text("") - - if check_url_exists(article_url_link): - st.link_button(label="Visit Article",url=(article_url_link)) - else: - st.warning("Url does not exist...",icon="⚠️") - - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) - - except Exception as err: - st.warning(f"Something went wrong...\n\n{err}",icon="⚠️") - - - - ####$ text classified article - def text_classified(text): - """ - it use wikipedia to scrap articles - provide author name, publish date, summary of - article - """ - try: - url_text = text.replace(" ","_") - article_url = f"https://en.wikipedia.org/wiki/{url_text}" ### url to scrap - if __name__=="__main__": - article_scraper(article_url) - st.text("") - st.text("") - - if check_url_exists(article_url): - st.link_button(label="Visit Article",url=article_url) - else: - st.warning("Url does not exist...",icon="⚠️") - - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) - - - except Exception as e: - st.warning("Something went wrong...",e,icon="⚠️") - - - -############################################################################################ - - ### j query animation - if not Generate_btn or Text_input.strip() == "": - - try: - def particle(Js_file): - with open(Js_file) as f: - component.html(f"{f.read()}", height=420) - - if __name__=="__main__": - particle("animation/particles.html") - - except Exception as e: - st.error("Something went wrong...\n\n",e) - - if Generate_btn: - if Text_input.strip() != "": - st.text("") - st.text("") - - ### Function to classify the input text - def classify_input(text, model): - try: - if is_url(text): - link_classified(text) - else: - #### If it's not detected as a URL - prediction = model.predict([text])[0] - if prediction == 1: - link_classified(Text_input) - else: - text_classified(Text_input) - except Exception as e: - st.error("Error...\n\n",e,icon="⚠️") - - with st.spinner("Generating Article..."): - if __name__=="__main__": - model = train_model() - classify_input(Text_input, model) - - - -#################################################################################################### - - -################################# Text summarizer - - -if Main_menu == "Text Summarizer": - - blank_text_sum1, text_summarizer_col, blank_text_sum2 = st.columns([2,8,2],gap="small") - - ### blank columns - with blank_text_sum1: - pass - with blank_text_sum2: - pass - - ### text summarizer app column - - with text_summarizer_col: - #### app title - st.text("") - text_summarizer_Title = colored_header( - label="Text Summarizer 📄", - color_name="violet-70", - description="enter or paste text hear" - ) - - placeholder_text = """write or paste your text hear -paragraph length should be greater then 30 words -to generate output tap on screen or press ctrl+enter - """ - - ### input box - text_summarizer_input = st.text_area( - label="Enter Text Hear", - placeholder=placeholder_text, - height=340, - key="text summarizer" - ) - Modal_Level(Summarizer_modal) - - if text_summarizer_input.strip() == "": - - try: - #### writing animation - write_hear_animation = insert_lottie_animation("lottie_animations/write-hear.json") - st_lottie( - animation_source=write_hear_animation, - speed=1, - reverse=False,loop=True, - quality="medium", - height=165, - width=240, - key="write hear" - ) - except Exception as err: - st.warning("something went wrong...",err,icon="⚠️") - - ### enter paragraph length greater than 35 words - elif len(text_summarizer_input.split()) < 20: - st.warning("paragraph should be greater than 35 words",icon="✏️") - - else: - - def word_token_maxvalue(text:str)->int: - """ - converting paragraph into - tokens - """ - word_para = [] - words = word_tokenize(text) - for i in words: - word_para.append(i) - - return len(word_para) - - @st.cache_data - def random_value_text(text:str)->int: - random_value = np.random.randint( - 10,word_token_maxvalue(text),6 - ) - - return random.choice(random_value) - - def clean_data_for_summarization(text:str)->str: - clean_text = clean( - text=text,fix_unicode=True, - to_ascii=True,no_emoji=True, - lang="en",no_line_breaks=False, - keep_two_line_breaks=True - ) - return clean_text - - - - text_Max_length = st.slider( - label="Max length", - min_value=10, - max_value=word_token_maxvalue(text_summarizer_input), - key="text summarizer max length", - step=1,value=random_value_text(text_summarizer_input) - ) - - Generate_text_summary = st.button( - label="Generate summary",key="text summary" - ) - - try: - #### writing loading - writing_loading_animation = insert_lottie_animation("lottie_animations/writing-loading.json") - summary_generating_animation = st_lottie_spinner( - animation_source=writing_loading_animation, - speed=2, - reverse=False,loop=True, - quality="medium", - height=165, - width=240, - key="writing generating" - ) - except Exception as err: - st.warning("something went wrong...",err,icon="⚠️") - - - #### initilization of modal - if Generate_text_summary: - - if __name__=="__main__": - - ##### summary generation - with summary_generating_animation: - - ### modal - Text_Summary_output = Hugingface_summarization_modal( - summary_text=clean_data_for_summarization(text_summarizer_input), - modal_name=Summarizer_modal, - maximum_length=text_Max_length - ) - - ##### summary displaying and copy - st.text("") - st.text("") - st.markdown("

Generated Summary

",unsafe_allow_html=True) - st.text("") - st.write(Text_Summary_output) - st.text("") - - copy_text(Text_Summary_output) - st.text("") - st.text("") - - ###### original text desplay and copy - st.markdown("

Original Text

",unsafe_allow_html=True) - st.text("") - original_text = clean_data_for_summarization(text_summarizer_input) - st.write(original_text) - st.text("") - copy_text(original_text) - - st.text("") - st.text("") - st.text("") - - ### summarization details - text_summarization_details = { - "Summarization Details":["Modal Name","Text Length","Summary Length","Max Tokens"], - "Output":[ - f"{Summarizer_modal}", - f"Length - {len(text_summarizer_input.split())}", - f"Length - {len(Text_Summary_output.split())}", - f"Tokens Used - {text_Max_length}" - ] - } - - summarization_details_df = pd.DataFrame( - data=text_summarization_details, - index=["Hugingface Modal","No. words","No. Words","Max Length"] - ) - - st.text("") - st.text("") - st.text("") - st.dataframe(summarization_details_df,use_container_width=True) - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) - - - -############################################################################################################## - -############################## pdf summarizer - - -#### pdf and text summarizer functions - - -#### displaying uploaded pdf file -def display_pdf_file(uploaded_file): - """ - it is used to display the - file on screen - """ - #### saving the uploaded file - def save_uploadfile(save_file): - with open(os.path.join("data",save_file.name),"wb") as f: - f.write(save_file.getbuffer()) - return st.toast("file uploaded: {}".format(save_file.name)) - - try: - ### display pdf on screen - def displayPDF(pdf_file): - with open(pdf_file,"rb") as f: - base64_pdf = base64.b64encode(f.read()).decode("utf-8") - - pdf_display = f""" - - """ - - st.markdown(pdf_display,unsafe_allow_html=True) - - ### save and display file - save_uploadfile(uploaded_file) - pdf_file = "data/"+uploaded_file.name - displayPDF(pdf_file) - except Exception as e: - st.warning("Something Went wrong...\n\n",e,icon="⚠️") - - -#### Function to extract text from a specific page using pdfminer -def extract_text_pdfminer(pdf_file, page_number): - """ - this function extract pdf file - text by user input page number - """ - try: - extracted_text = '' - for i, page_layout in enumerate(extract_pages(pdf_file)): - if i == page_number - 1: - ### Extract text elements and format them as closely as possible to the original layout - for element in page_layout: - if isinstance(element, LTTextContainer): - for text_line in element: - if isinstance(text_line, LTTextLine): - line = ''.join([char.get_text() for char in text_line if isinstance(char, LTChar)]) - extracted_text += line.strip() + '\n' - return extracted_text - return st.warning("Invalid page number.",icon="⚠️") - except Exception as e: - st.warning("Something Went wrong...\n\n",e,icon="⚠️") - - -############################################### - - -##### clean text for summmarization task -def uploaded_Clean_Text_Summarization(clean_text:str)->str: - """ - it gives clean text for - summarization task - """ - try: - pattern = r'[|`~^$<>]' - cleaned_paragraph = re.sub(pattern, '', clean_text) - - ### using clean function - clean_output_para = clean( - text=cleaned_paragraph,fix_unicode=True, - to_ascii=True,no_emoji=True, - lang="en",no_line_breaks=False, - keep_two_line_breaks=True - ) - - except Exception as e: - st.warning("Something Went wrong...\n\n",e,icon="⚠️") - - return clean_output_para - - -### convert paragraph into tokens -def generate_text_para_tokens(text_para:str)->int: - """ - converting paragraph into - tokens - """ - try: - pattern = r'[|`~#^$<>]' - cleaned_paragraph = re.sub(pattern, '', text_para) - - #### using clean function - clean_para = clean( - text=cleaned_paragraph,fix_unicode=True, - to_ascii=True,no_emoji=True, - lang="en",no_line_breaks=False, - keep_two_line_breaks=True - ) - - word_tokens = [] - - for i in word_tokenize(clean_para): - word_tokens.append(i) - return len(np.array(word_tokens)) - - except Exception as e: - st.warning("Something Went wrong...\n\n",e,icon="⚠️") - - - - ### generates random value for slider -@st.cache_data -def random_text_para_value(para:str)->int: - try: - random_value = np.random.randint( - 20, generate_text_para_tokens(para), 6 - ) - return random.choice(random_value) - except Exception as e: - st.warning("Something Went wrong...\n\n",e,icon="⚠️") - - -#### PDF files summarizer -def process_pdf(file): - reader = PdfReader(file) - page_count = len(reader.pages) - - ### pdf display and information column - pdf_display_tab, pdf_summarizer_tab = st.tabs([f"Displaying {file.name}","Pdf Summarizer"]) - - ####### displaying pdf on pdf display tab - with pdf_display_tab: - st.markdown(f"

Pdf - {file.name}

",unsafe_allow_html=True) - - pdf_col, pdf_info_col = st.columns([5,3],gap="medium") - with pdf_col: - with st.spinner("Displaying file..."): - if __name__=="__main__": - display_pdf_file(file) - - with pdf_info_col: - st.write("Your File: {}".format(file.name)) - st.write(f"Number of pages: {str(page_count)}") - st.markdown(insert_html("htmlfiles/pdf-summarizer-info.html"),unsafe_allow_html=True) - - - ### pdf information and intract with pdf - with pdf_summarizer_tab: - - st.text("") - st.markdown("

Extract pdf text

",unsafe_allow_html=True) - - ### toggle button for extracting text - extract_by_page_all = st.toggle( - label="Extract whole Text",key="toggle for extract text", - value=False - ) - - ### extracting all pdf text - if extract_by_page_all: - st.write("Extract whole pdf Text") - - if st.button("Extract Whole Pdf",key="whole pdf text extract"): - - st.text("") - st.text("") - - with st.spinner("Extracting pdf..."): - whole_pdf_text = extract_text(file) - st.markdown("

Whole PDF Text

",unsafe_allow_html=True) - st.text("") - st.write(whole_pdf_text) - else: - reader = PdfReader(file) - total_pages = len(reader.pages) - st.write("Extract by page Number") - - pdf_page_no_col, pdf_page_noinfo_col = st.columns([3,5],gap="small") - - with pdf_page_no_col: - - ### input page number - Pdf_page_number_input = st.number_input( - label="Select the page number", - min_value=1, max_value=total_pages, - value=1,key="pdf page number",step=1 - ) - - with pdf_page_noinfo_col: - st.text("") - st.text("") - st.write(f"Selected page: {str(Pdf_page_number_input)}") - - Extract_page_no_button = st.button( - label="Extract Page text", - key="Extract button for page" - ) - st.text("") - st.text("") - - if Extract_page_no_button: - text_pdfminer = extract_text_pdfminer(file, Pdf_page_number_input) - st.session_state['extracted_text'] = text_pdfminer ### Store the extracted text in session state - - if 'extracted_text' in st.session_state: - Pdf_file_text = st.text_area( - label=f"Text data of {Pdf_page_number_input} page", - value= st.session_state['extracted_text'], - height=400 - ) - st.session_state['extracted_text'] = Pdf_file_text # Update the text in session state based on user's input - - #### pdf summarizer - st.text("") - Max_length_pdf_slider = st.slider( - label="Max Length",key="Pdf summarizer slider", - min_value=10,max_value=generate_text_para_tokens(Pdf_file_text), - value=random_text_para_value(Pdf_file_text) - ) - st.text("") - - upload_Pdf_summary_btn_col, upload_Pdf_print_btn_col, upload_clean_Pdf_print_btn_col, blank_Pdf_col1, blank_Pdf_col2 = st.columns( - [4,4,4,7,3],gap="small" - ) - - with blank_Pdf_col1: - pass - with blank_Pdf_col2: - pass - - with upload_Pdf_summary_btn_col: - Generate_upload_pdf_summary_btn = st.button( - label="Generate Summary", - key="Generate summary of uploaded text pdf" - ) - - with upload_clean_Pdf_print_btn_col: - Upload_clean_pdf_btn = st.button( - label="Print Clean Text", - key="Print clean pdf file" - ) - - - with upload_Pdf_print_btn_col: - upload_pdf_print_button = st.button( - label="Print Uploaded Text", - key="Print uploadded pdf" - ) - - ### clean text - if Upload_clean_pdf_btn: - with st.spinner("Generating Clean Text..."): - st.text("") - st.text("") - st.markdown("

Clean Text

",unsafe_allow_html=True) - st.text("") - st.write(uploaded_Clean_Text_Summarization(Pdf_file_text)) - st.text("") - copy_text(uploaded_Clean_Text_Summarization(Pdf_file_text)) - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) - - ### uploaded text - elif upload_pdf_print_button: - with st.spinner("Generating Uploaded Text..."): - st.text("") - st.text("") - st.markdown("

Uploaded Text

",unsafe_allow_html=True) - st.text("") - st.text(Pdf_file_text) - st.text("") - copy_text(Pdf_file_text) - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) - - ### generating summary - elif Generate_upload_pdf_summary_btn: - st.text("") - with st.spinner("Generating Summary..."): - st.text("") - if __name__=="__main__": - Uploded_Pdf_file_Summary = Hugingface_summarization_modal( - summary_text=uploaded_Clean_Text_Summarization(Pdf_file_text), - maximum_length=Max_length_pdf_slider, - modal_name="facebook-bart" - ) - st.markdown("

Summary

",unsafe_allow_html=True) - st.text("") - - st.write(Uploded_Pdf_file_Summary) - st.text("") - copy_text(Uploded_Pdf_file_Summary) - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) - - - - -################################################# - - -##### text file summarizer -def process_text(file): - text_file = file.read().decode("utf-8") - st.text("") - st.markdown("

Text file

",unsafe_allow_html=True) - - - ### displaying text you can edit also - Uploaded_text = st.text_area( - label=f"{file.name[:-4]} text data", - value=text_file,key="text file data", - height=400 - ) - st.write(f"**{file.name[:-4]}** Edit your file press ctrl+enter") - - ###3 if length is less than 20 - if len(Uploaded_text.split()) < 20: - st.warning("Summarization Task failed\nnot enough amount of text...",icon="⚠️") - - else: - st.text("") - #### max length slider - max_text_para_length = st.slider( - label="Max Length",min_value=10, - max_value=generate_text_para_tokens(Uploaded_text), - step=1,key="paragraph length", - value=random_text_para_value(Uploaded_text) - ) - st.text("") - - upload_text_summary_btn_col, upload_text_print_btn_col, upload_clean_text_print_btn_col, blank_text_col1, blank_text_col2 = st.columns( - [4,4,4,7,3],gap="small" - ) - - with blank_text_col1: - pass - with blank_text_col2: - pass - - with upload_text_summary_btn_col: - Generate_upload_text_summary_btn = st.button( - label="Generate Summary", - key="Generate summary of uploaded text" - ) - - with upload_clean_text_print_btn_col: - Upload_clean_text_btn = st.button( - label="Print Clean Text", - key="Print clean text file" - ) - - - with upload_text_print_btn_col: - upload_text_print_button = st.button( - label="Print Uploaded Text", - key="Print uploadded text" - ) - - ### clean text - if Upload_clean_text_btn: - with st.spinner("Generating Clean Text..."): - st.text("") - st.text("") - st.markdown("

Clean Text

",unsafe_allow_html=True) - st.text("") - st.write(uploaded_Clean_Text_Summarization(Uploaded_text)) - st.text("") - copy_text(uploaded_Clean_Text_Summarization(Uploaded_text)) - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
", - unsafe_allow_html=True) - - ### uploaded text - elif upload_text_print_button: - with st.spinner("Generating Uploaded Text..."): - st.text("") - st.text("") - st.markdown("

Uploaded Text

",unsafe_allow_html=True) - st.text("") - st.text(Uploaded_text) - st.text("") - copy_text(Uploaded_text) - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
", - unsafe_allow_html=True) - - - ### generating summary - elif Generate_upload_text_summary_btn: - st.text("") - with st.spinner("Generating Summary..."): - st.text("") - if __name__=="__main__": - Uploded_Text_file_Summary = Hugingface_summarization_modal( - summary_text=uploaded_Clean_Text_Summarization(Uploaded_text), - maximum_length=max_text_para_length, - modal_name="facebook-bart" - ) - st.markdown("

Summary

",unsafe_allow_html=True) - st.text("") - - st.write(Uploded_Text_file_Summary) - st.text("") - copy_text(Uploded_Text_file_Summary) - st.text("") - st.text("") - st.text("") - st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) - - - -if Main_menu == "PDF Summarizer": - - ### blank and app columns - Blank_pdf1 ,pdf_summarizer_col, Blank_pdf2 = st.columns([1,8,1],gap="small") - - with Blank_pdf1: - pass - with Blank_pdf2: - pass - - with pdf_summarizer_col: - st.text("") - st.header("PDF Summarizer") ### app heading - - ### File uploader function - app_file_upload = st.file_uploader("Upload a PDF or Text file", type=["pdf", "txt"]) - - if app_file_upload is not None: - - ### if pdf file - if app_file_upload.type == "application/pdf": - if __name__=="__main__": - process_pdf(app_file_upload) - - #### if text file - elif app_file_upload.type == "text/plain": - if __name__=="__main__": - process_text(app_file_upload) - - else: - st.info("Upload your pdf, text file") - - - #### app info -if Main_menu == "App Info": - Blank_app_info1, App_info_col, Blank_app_info2 = st.columns([2,8,2]) - - #### blank columns - with Blank_app_info1: - pass - with Blank_app_info2: - pass - - ### app info column - with App_info_col: - st.text("") - st.header("App Info") - st.text("") - - if __name__=="__main__": - st.markdown(insert_html("htmlfiles/app-info.html"), - unsafe_allow_html=True - ) - +import streamlit as st ### importing liberaries +from streamlit_extras.colored_header import colored_header +from streamlit_option_menu import option_menu +import streamlit.components.v1 as component +from streamlit_lottie import st_lottie, st_lottie_spinner +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.model_selection import train_test_split +from sklearn.naive_bayes import MultinomialNB +from sklearn.pipeline import make_pipeline +from transformers import pipeline +from transformers import AutoTokenizer , AutoModelForSeq2SeqLM +from newspaper import Article +import nltk +import nltk.downloader +nltk.download('punkt_tab') +from nltk.tokenize import word_tokenize +from cleantext import clean +from PyPDF2 import PdfReader +import pdfminer +from pdfminer.high_level import extract_text +from pdfminer.high_level import extract_pages +from pdfminer.layout import LTTextContainer, LTChar, LTTextLine +import requests +import json +import numpy as np +import pandas as pd +import random +import base64 +import lxml +import lxml_html_clean +import re +import os + + +###### main app functions + +### insert external css +def insert_css(css_file:str): + with open(css_file) as f: + st.markdown(f"",unsafe_allow_html=True) + +### insert external html file +def insert_html(html_file): + with open(html_file) as f: + return f.read() + +### insert lottie animation json files +def insert_lottie_animation(animation_file:str): + with open(animation_file, "r") as f: + return json.load(f) + +### app tutorial video function +@st.dialog("App Tutorial") +def watch_tutorial(): + st.subheader("GenAi Summarizer🤖") + video_file = open("app_tutorial.mp4", "rb") + video_bytes = video_file.read() + st.text("") + st.video( + data=video_bytes,format="video/mp4", + loop=True,autoplay=True + ) + + +def download_text(text, filename): + """ + download article text + in document format + """ + #### Convert string to bytes + b64 = base64.b64encode(text.encode()).decode() + + href = f""" + + + + """ + + st.markdown(href, unsafe_allow_html=True) + if __name__=="__main__": + insert_css("cssfiles/download-article.css") + + +def copy_text(text): + html_code = f""" + + + + + + + + + + + + +
+
+

{text}

+ + + + + + """ + + component.html(html_code,height=28) + + +### copy and download button +def Copy_download_button(article_text,article_format,article_file_name): + try: + ### column for copy and download article + Copy_btn_col,download_btn_col, blank_col_copy1, blank_col_copy2= st.columns([1,3,5,5],gap="small") + + with blank_col_copy1: + st.text("") + with blank_col_copy1: + st.text("") + + with Copy_btn_col: + copy_text(article_text) + + with download_btn_col: + download_text(text=article_format,filename=article_file_name) + except Exception as e: + st.warning("Something went wrong...",e,icon="⚠️") + + +### setting page layout +st.set_page_config( + page_title="GenAi Summarizer", + page_icon="🤗", + initial_sidebar_state="collapsed", + layout="wide" +) + + +#### app settings css +if __name__=="__main__": + insert_css("cssfiles/app.css") + + +### huging face modals +Hugingface_modals = { + "google-pegasus":"google/pegasus-xsum", + "facebook-bart":"facebook/bart-large-cnn", + "t5-base":"t5-base" +} + + +### summarization modal +def Hugingface_summarization_modal(summary_text,modal_name,maximum_length): + """ + it is an text summarization modal + it use hugingface modals for summarization task. + it generates summarized text output + """ + def summarization_modal_name(modal)->str: + if modal == "google-pegasus": + return "google/pegasus-xsum" + elif modal == "facebook-bart": + return "facebook/bart-large-cnn" + elif modal == "t5-base": + return "t5-base" + try: + use_modal = summarization_modal_name(modal_name) ### modal name + + auto_tokenizer = AutoTokenizer.from_pretrained(use_modal) ### using autokenizer for pretrained modal + auto_modal = AutoModelForSeq2SeqLM.from_pretrained(use_modal) + + ### creating pipeline + summarizer = pipeline("summarization",model=auto_modal,tokenizer=auto_tokenizer) + + summarizer_text = summary_text + + summary_generate = summarizer( ### summarizer + summarizer_text,max_length=maximum_length+20, + min_length=maximum_length, + do_sample=False + ) + + return summary_generate[0]['summary_text'] + + except Exception as e: + st.warning("Something went wrong...\n\n",e,icon="⚠️") + + + + +### displaying modals +@st.cache_data +def Modal_Level(modal_text): + if modal_text == "google-pegasus": + st.markdown( + f""" +
+ + Maodal- + + google/pegasus-xsum +
+ """,unsafe_allow_html=True + ) + + elif modal_text == "facebook-bart": + st.markdown( + f"""
+ + Maodal- + + facebook/bart-large-cnn +
+ """,unsafe_allow_html=True + ) + + elif modal_text == "t5-base": + st.markdown( + f"""
+ + Maodal- + + t5-base +
+ """,unsafe_allow_html=True + ) + if __name__=="__main__": + insert_css("cssfiles/modal.css") + + + +#### creating sidebar +app_sidebar = st.sidebar + +with app_sidebar: + st.text("") + st.subheader("GenAi Summarizer🤖") + st.write("Developer: **Nishant Maity**") + st.text("") + st.text("") + + ### creating menu bar + Main_menu = option_menu( + menu_title="", + options=["Article Summarizer","Text Summarizer","PDF Summarizer","App Info"], + icons=["chat-dots","card-heading","file-earmark-pdf","person-circle"], + default_index=0, + key="Menu Bar" + ) + st.text("") + + ### select modal for text and article summarizer + if Main_menu == "Article Summarizer" or Main_menu == "Text Summarizer": + + Summarizer_modal = st.selectbox( + label="Select Modal", + options=np.array(list(Hugingface_modals.keys())), + index=1, + key="Modals" + ) + +#### selecting number or paragraph for article summarizer +if Main_menu == "Article Summarizer": + with app_sidebar: + st.text("") + st.text("") + + Number_of_article_paragraph = st.slider( + label="Number of paragraph", + min_value=1,max_value=10, + step=1,value=2, + key="Number of paragraph" + ) + +with app_sidebar: + st.button( + label="Watch App Tutorial", + use_container_width=True, + on_click=watch_tutorial + ) + + +##### article summarizer functions + +##### naive bayes text classification function + +def is_url(text): + url_pattern = re.compile( + r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|(?:www\.)[^\s]+') + return bool(url_pattern.match(text)) + + +# Train a model for text vs URL classification +def train_model(): + """ + this function predict the given input + is a simple text or url,link + and generate output. + """ + #### dataset (normal text and URLs) + try: + data = [ + ('This is a normal sentence.', 'text'), + ('www.google.com', 'url'), + ('Check out this website', 'text'), + ('https://www.example.com', 'url'), + ('Machine learning is fun', 'text'), + ('http://openai.com', 'url'), + ('Python is a great language', 'text'), + ] + texts = [d[0] for d in data] + labels = [1 if d[1] == 'url' else 0 for d in data] ## 1 for url, 0 for text + + ##### modal training + X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42) + + model = make_pipeline(CountVectorizer(), MultinomialNB()) + + model.fit(X_train, y_train) #### Train the model + + model.score(X_train, y_train) + model.score(X_test, y_test) + + return model + + except Exception as e: + st.error("Error...\n\n",e,icon="⚠️") + + + +############################### article summarizer + + +if Main_menu == "Article Summarizer": + + blank_article1, article_column, blank_article2 = st.columns([2,8,2],gap="small") + + with blank_article1: ### blank space + pass + with blank_article2: ### blank space + pass + + #### main app column + with article_column: + + #### app title + st.text("") + App_Title = colored_header( + label="Web Article Summarizer 📑", + color_name="blue-green-70", + description="Search or paste url" + ) + + Text_input = st.text_input( + label="Search or paste url", + placeholder="machine learning, java url- https://www.example.com" + ) + + ### max slider value + def max_length_slider_value(max_length)->int: + if max_length == 1: + return 90 + elif max_length == 2: + return 150 + elif max_length == 3: + return 250 + elif max_length == 4: + return 380 + elif max_length == 5: + return 470 + elif max_length == 6: + return 600 + elif max_length == 7: + return 750 + elif max_length == 8: + return 900 + elif max_length == 9: + return 1200 + elif max_length == 10: + return 1360 + + @st.cache_data + def Default_max_length(default_value): + if default_value == 1: + random_value = np.random.randint(30,65,6) + return random.choice(random_value) + + elif default_value == 2: + random_value = np.random.randint(50,130,6) + return random.choice(random_value) + + elif default_value == 3: + random_value = np.random.randint(70,210,6) + return random.choice(random_value) + + elif default_value == 4: + random_value = np.random.randint(140,310,6) + return random.choice(random_value) + + elif default_value == 5: + random_value = np.random.randint(200,390,6) + return random.choice(random_value) + + elif default_value == 6: + random_value = np.random.randint(230,490,6) + return random.choice(random_value) + + elif default_value == 7: + random_value = np.random.randint(280,590,6) + return random.choice(random_value) + + elif default_value == 8: + random_value = np.random.randint(350,750,6) + return random.choice(random_value) + + elif default_value == 9: + random_value = np.random.randint(450,1050,6) + return random.choice(random_value) + + elif default_value == 10: + random_value = np.random.randint(560,1100,6) + return random.choice(random_value) + + + + + Button_column, Toggle_summary_btn, Modal_display = st.columns([1,1,3],gap="small") + + + # article_summarizer(max_length) + with Button_column: + ### generate article button + Generate_btn = st.button(label="Generate Article") + + with Toggle_summary_btn: + ### if on then it generates summary + summary_on = st.toggle( + label="Summarizer", + value=False, + key="Summarizer on off" + ) + + if summary_on: + st.toast(body="Summarizer Mode on",icon="📑") + else: + st.toast(body="Scraping Mode",icon="📰") + + with Modal_display: + + if summary_on: + Modal_Level(Summarizer_modal) + else: + pass + if summary_on: + max_length_article = st.slider( + label="max length", + min_value=10,max_value=max_length_slider_value(Number_of_article_paragraph), + key="max length",value=Default_max_length(Number_of_article_paragraph) + ) + + +################################################################################################ + + + ### article scraper function + def article_scraper(article_url): + """ + this function is used to scrap + web articles and it provide + text in the clean format + """ + try: + article = Article(article_url) ### article object + article.download() + article.parse() + nltk.download("punkt") + article.nlp() + + st.markdown("

Article

",unsafe_allow_html=True) + st.text("") + st.text("") + + st.markdown( ### article title + f""" +
{article.title}
+ """,unsafe_allow_html=True + ) + + article_publishdate = article.publish_date ### article publish date + if article_publishdate == None: + pass + else: + st.text("published on - "+str(article_publishdate)) + + article_authors = article.authors #### article authors + if len(article_authors) == 0: + pass + else: + autho_name_print = ", ".join(map(str, article_authors)) + st.write(autho_name_print) + + + ### generating article summary + def get_top_paragraphs(text, num_paragraphs=Number_of_article_paragraph): + """ + this function gives + top 1 - 10 paragraph of the + scrap data + """ + paragraphs = text.split('\n\n') + + valid_paragraphs = [p.strip() for p in paragraphs if len(p.strip().split()) > 12] + top_paragraphs = valid_paragraphs[:num_paragraphs] + return '\n\n'.join(top_paragraphs) + + + article_summary = article.text + + def remove_bracketed_numbers(text)->str: + pattern = r'\[\d+\]' + cleaned_text = re.sub(pattern, '', text) + return cleaned_text + + + cleaned_article_text = remove_bracketed_numbers(get_top_paragraphs(article_summary)) + + if "clean_text" not in st.session_state: + st.session_state.clean_text = "" + + st.session_state.clean_text = cleaned_article_text + + def clean_output_text(text:str)->str: + """ + it gives clean text without emojies, + no ascii values english text + """ + clean_text = clean( + text=text,fix_unicode=True, + to_ascii=True,no_emoji=True, + lang="en",no_line_breaks=False, + keep_two_line_breaks=True + ) + return clean_text + ### Print the cleaned text + st.write(clean_output_text(st.session_state.clean_text)) + st.text("") + st.text("") + + + ### copy download button + Article_filename = f"{article.title}.doc" + + Article_text_format = f""" + \n\n\n +{str(article.title)} +published on - {str(article_publishdate)} +Authors - {", ".join(map(str, article_authors))} + \n\n\n +{str(cleaned_article_text)} + """ + + + if __name__=="__main__": + Copy_download_button( + article_text=clean_output_text(cleaned_article_text), + article_format=Article_text_format, + article_file_name=Article_filename + ) + + st.text("") + + if summary_on: + st.markdown("

Article Summary

",unsafe_allow_html=True) + + #### summarization modal + + with st.spinner("Generating Summary..."): + + + if __name__=="__main__": + summarized_article_text = Hugingface_summarization_modal( + summary_text=clean_output_text(cleaned_article_text), + modal_name=Summarizer_modal, + maximum_length=max_length_article + ) + #### clean ai generated paragraph + + + st.write(summarized_article_text) + st.text("") + st.text("") + + summary_format = f""" + +\n\n +{article.title} +\n\n\n +{summarized_article_text} +""" + #### copy or download summary button + if __name__=="__main__": + Copy_download_button( + article_text=summarized_article_text, + article_file_name=f"{article.title}-summary.doc", + article_format=summary_format + ) + + if summary_on: + + ### summarization details + summarization_details = { + "Summarization Details":["Modal Name","Text Length","Summary Length","Max Tokens"], + "Output":[ + f"{Summarizer_modal}", + f"Length - {len(cleaned_article_text.split())}", + f"Length - {len(summarized_article_text.split())}", + f"Tokens Used - {max_length_article}" + ] + } + + summarization_details_df = pd.DataFrame( + data=summarization_details, + index=["Hugingface Modal","No. words","No. Words","Max Length"] + ) + + st.text("") + st.text("") + st.text("") + st.dataframe(summarization_details_df,use_container_width=True) + + + + except Exception as err: + ### 404 error animation + + Error_404_col, page_not_found_col = st.columns(2) + + with Error_404_col: + + try: + Error_404 = insert_lottie_animation("lottie_animations/error-404.json") + st_lottie( + animation_source=Error_404, + speed=1, + reverse=False,loop=True, + quality="high", + height=315, + width=400, + key="404 error" + ) + except Exception as err: + st.warning("something went wrong...",err,icon="⚠️") + + with page_not_found_col: + + try: + page_not_found = insert_lottie_animation("lottie_animations/page-not-found.json") + st_lottie( + animation_source=page_not_found, + speed=1, + reverse=False,loop=True, + quality="high", + height=265, + width=400, + key="page not found" + ) + except Exception as err: + st.warning("something went wrong...",err,icon="⚠️") + + st.warning(f"Something went wrong...\n\n{err}",icon="⚠️") + + def article_summarizer(summary_length): + st.write(summary_length) + + + def check_url_exists(url): + try: + response = requests.head(url, allow_redirects=True) + if response.status_code < 400: + return True + else: + return False + except requests.exceptions.RequestException as e: + # Handle any exception (e.g., connection error, timeout) + return False + + + ########### link classified article + def link_classified(text): + """ + it use url or link to scrap articles + provide author name, publish date, summary of + article + """ + try: + url_text = text + article_url_link = f"{url_text}" ### url to scrap + if __name__=="__main__": + article_scraper(article_url_link) + st.text("") + st.text("") + + if check_url_exists(article_url_link): + st.link_button(label="Visit Article",url=(article_url_link)) + else: + st.warning("Url does not exist...",icon="⚠️") + + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) + + except Exception as err: + st.warning(f"Something went wrong...\n\n{err}",icon="⚠️") + + + + ####$ text classified article + def text_classified(text): + """ + it use wikipedia to scrap articles + provide author name, publish date, summary of + article + """ + try: + url_text = text.replace(" ","_") + article_url = f"https://en.wikipedia.org/wiki/{url_text}" ### url to scrap + if __name__=="__main__": + article_scraper(article_url) + st.text("") + st.text("") + + if check_url_exists(article_url): + st.link_button(label="Visit Article",url=article_url) + else: + st.warning("Url does not exist...",icon="⚠️") + + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) + + + except Exception as e: + st.warning("Something went wrong...",e,icon="⚠️") + + + +############################################################################################ + + ### j query animation + if not Generate_btn or Text_input.strip() == "": + + try: + def particle(Js_file): + with open(Js_file) as f: + component.html(f"{f.read()}", height=420) + + if __name__=="__main__": + particle("animation/particles.html") + + except Exception as e: + st.error("Something went wrong...\n\n",e) + + if Generate_btn: + if Text_input.strip() != "": + st.text("") + st.text("") + + ### Function to classify the input text + def classify_input(text, model): + try: + if is_url(text): + link_classified(text) + else: + #### If it's not detected as a URL + prediction = model.predict([text])[0] + if prediction == 1: + link_classified(Text_input) + else: + text_classified(Text_input) + except Exception as e: + st.error("Error...\n\n",e,icon="⚠️") + + with st.spinner("Generating Article..."): + if __name__=="__main__": + model = train_model() + classify_input(Text_input, model) + + + +#################################################################################################### + + +################################# Text summarizer + + +if Main_menu == "Text Summarizer": + + blank_text_sum1, text_summarizer_col, blank_text_sum2 = st.columns([2,8,2],gap="small") + + ### blank columns + with blank_text_sum1: + pass + with blank_text_sum2: + pass + + ### text summarizer app column + + with text_summarizer_col: + #### app title + st.text("") + text_summarizer_Title = colored_header( + label="Text Summarizer 📄", + color_name="violet-70", + description="enter or paste text hear" + ) + + placeholder_text = """write or paste your text hear +paragraph length should be greater then 30 words +to generate output tap on screen or press ctrl+enter + """ + + ### input box + text_summarizer_input = st.text_area( + label="Enter Text Hear", + placeholder=placeholder_text, + height=340, + key="text summarizer" + ) + Modal_Level(Summarizer_modal) + + if text_summarizer_input.strip() == "": + + try: + #### writing animation + write_hear_animation = insert_lottie_animation("lottie_animations/write-hear.json") + st_lottie( + animation_source=write_hear_animation, + speed=1, + reverse=False,loop=True, + quality="medium", + height=165, + width=240, + key="write hear" + ) + except Exception as err: + st.warning("something went wrong...",err,icon="⚠️") + + ### enter paragraph length greater than 35 words + elif len(text_summarizer_input.split()) < 20: + st.warning("paragraph should be greater than 35 words",icon="✏️") + + else: + + def word_token_maxvalue(text:str)->int: + """ + converting paragraph into + tokens + """ + word_para = [] + words = word_tokenize(text) + for i in words: + word_para.append(i) + + return len(word_para) + + @st.cache_data + def random_value_text(text:str)->int: + random_value = np.random.randint( + 10,word_token_maxvalue(text),6 + ) + + return random.choice(random_value) + + def clean_data_for_summarization(text:str)->str: + clean_text = clean( + text=text,fix_unicode=True, + to_ascii=True,no_emoji=True, + lang="en",no_line_breaks=False, + keep_two_line_breaks=True + ) + return clean_text + + + + text_Max_length = st.slider( + label="Max length", + min_value=10, + max_value=word_token_maxvalue(text_summarizer_input), + key="text summarizer max length", + step=1,value=random_value_text(text_summarizer_input) + ) + + Generate_text_summary = st.button( + label="Generate summary",key="text summary" + ) + + try: + #### writing loading + writing_loading_animation = insert_lottie_animation("lottie_animations/writing-loading.json") + summary_generating_animation = st_lottie_spinner( + animation_source=writing_loading_animation, + speed=2, + reverse=False,loop=True, + quality="medium", + height=165, + width=240, + key="writing generating" + ) + except Exception as err: + st.warning("something went wrong...",err,icon="⚠️") + + + #### initilization of modal + if Generate_text_summary: + + if __name__=="__main__": + + ##### summary generation + with summary_generating_animation: + + ### modal + Text_Summary_output = Hugingface_summarization_modal( + summary_text=clean_data_for_summarization(text_summarizer_input), + modal_name=Summarizer_modal, + maximum_length=text_Max_length + ) + + ##### summary displaying and copy + st.text("") + st.text("") + st.markdown("

Generated Summary

",unsafe_allow_html=True) + st.text("") + st.write(Text_Summary_output) + st.text("") + + copy_text(Text_Summary_output) + st.text("") + st.text("") + + ###### original text desplay and copy + st.markdown("

Original Text

",unsafe_allow_html=True) + st.text("") + original_text = clean_data_for_summarization(text_summarizer_input) + st.write(original_text) + st.text("") + copy_text(original_text) + + st.text("") + st.text("") + st.text("") + + ### summarization details + text_summarization_details = { + "Summarization Details":["Modal Name","Text Length","Summary Length","Max Tokens"], + "Output":[ + f"{Summarizer_modal}", + f"Length - {len(text_summarizer_input.split())}", + f"Length - {len(Text_Summary_output.split())}", + f"Tokens Used - {text_Max_length}" + ] + } + + summarization_details_df = pd.DataFrame( + data=text_summarization_details, + index=["Hugingface Modal","No. words","No. Words","Max Length"] + ) + + st.text("") + st.text("") + st.text("") + st.dataframe(summarization_details_df,use_container_width=True) + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) + + + +############################################################################################################## + +############################## pdf summarizer + + +#### pdf and text summarizer functions + + +#### displaying uploaded pdf file +def display_pdf_file(uploaded_file): + """ + it is used to display the + file on screen + """ + #### saving the uploaded file + def save_uploadfile(save_file): + with open(os.path.join("data",save_file.name),"wb") as f: + f.write(save_file.getbuffer()) + return st.toast("file uploaded: {}".format(save_file.name)) + + try: + ### display pdf on screen + def displayPDF(pdf_file): + with open(pdf_file,"rb") as f: + base64_pdf = base64.b64encode(f.read()).decode("utf-8") + + pdf_display = f""" + + """ + + st.markdown(pdf_display,unsafe_allow_html=True) + + ### save and display file + save_uploadfile(uploaded_file) + pdf_file = "data/"+uploaded_file.name + displayPDF(pdf_file) + except Exception as e: + st.warning("Something Went wrong...\n\n",e,icon="⚠️") + + +#### Function to extract text from a specific page using pdfminer +def extract_text_pdfminer(pdf_file, page_number): + """ + this function extract pdf file + text by user input page number + """ + try: + extracted_text = '' + for i, page_layout in enumerate(extract_pages(pdf_file)): + if i == page_number - 1: + ### Extract text elements and format them as closely as possible to the original layout + for element in page_layout: + if isinstance(element, LTTextContainer): + for text_line in element: + if isinstance(text_line, LTTextLine): + line = ''.join([char.get_text() for char in text_line if isinstance(char, LTChar)]) + extracted_text += line.strip() + '\n' + return extracted_text + return st.warning("Invalid page number.",icon="⚠️") + except Exception as e: + st.warning("Something Went wrong...\n\n",e,icon="⚠️") + + +############################################### + + +##### clean text for summmarization task +def uploaded_Clean_Text_Summarization(clean_text:str)->str: + """ + it gives clean text for + summarization task + """ + try: + pattern = r'[|`~^$<>]' + cleaned_paragraph = re.sub(pattern, '', clean_text) + + ### using clean function + clean_output_para = clean( + text=cleaned_paragraph,fix_unicode=True, + to_ascii=True,no_emoji=True, + lang="en",no_line_breaks=False, + keep_two_line_breaks=True + ) + + except Exception as e: + st.warning("Something Went wrong...\n\n",e,icon="⚠️") + + return clean_output_para + + +### convert paragraph into tokens +def generate_text_para_tokens(text_para:str)->int: + """ + converting paragraph into + tokens + """ + try: + pattern = r'[|`~#^$<>]' + cleaned_paragraph = re.sub(pattern, '', text_para) + + #### using clean function + clean_para = clean( + text=cleaned_paragraph,fix_unicode=True, + to_ascii=True,no_emoji=True, + lang="en",no_line_breaks=False, + keep_two_line_breaks=True + ) + + word_tokens = [] + + for i in word_tokenize(clean_para): + word_tokens.append(i) + return len(np.array(word_tokens)) + + except Exception as e: + st.warning("Something Went wrong...\n\n",e,icon="⚠️") + + + + ### generates random value for slider +@st.cache_data +def random_text_para_value(para:str)->int: + try: + random_value = np.random.randint( + 20, generate_text_para_tokens(para), 6 + ) + return random.choice(random_value) + except Exception as e: + st.warning("Something Went wrong...\n\n",e,icon="⚠️") + + +#### PDF files summarizer +def process_pdf(file): + reader = PdfReader(file) + page_count = len(reader.pages) + + ### pdf display and information column + pdf_display_tab, pdf_summarizer_tab = st.tabs([f"Displaying {file.name}","Pdf Summarizer"]) + + ####### displaying pdf on pdf display tab + with pdf_display_tab: + st.markdown(f"

Pdf - {file.name}

",unsafe_allow_html=True) + + pdf_col, pdf_info_col = st.columns([5,3],gap="medium") + with pdf_col: + with st.spinner("Displaying file..."): + if __name__=="__main__": + display_pdf_file(file) + + with pdf_info_col: + st.write("Your File: {}".format(file.name)) + st.write(f"Number of pages: {str(page_count)}") + st.markdown(insert_html("htmlfiles/pdf-summarizer-info.html"),unsafe_allow_html=True) + + + ### pdf information and intract with pdf + with pdf_summarizer_tab: + + st.text("") + st.markdown("

Extract pdf text

",unsafe_allow_html=True) + + ### toggle button for extracting text + extract_by_page_all = st.toggle( + label="Extract whole Text",key="toggle for extract text", + value=False + ) + + ### extracting all pdf text + if extract_by_page_all: + st.write("Extract whole pdf Text") + + if st.button("Extract Whole Pdf",key="whole pdf text extract"): + + st.text("") + st.text("") + + with st.spinner("Extracting pdf..."): + whole_pdf_text = extract_text(file) + st.markdown("

Whole PDF Text

",unsafe_allow_html=True) + st.text("") + st.write(whole_pdf_text) + else: + reader = PdfReader(file) + total_pages = len(reader.pages) + st.write("Extract by page Number") + + pdf_page_no_col, pdf_page_noinfo_col = st.columns([3,5],gap="small") + + with pdf_page_no_col: + + ### input page number + Pdf_page_number_input = st.number_input( + label="Select the page number", + min_value=1, max_value=total_pages, + value=1,key="pdf page number",step=1 + ) + + with pdf_page_noinfo_col: + st.text("") + st.text("") + st.write(f"Selected page: {str(Pdf_page_number_input)}") + + Extract_page_no_button = st.button( + label="Extract Page text", + key="Extract button for page" + ) + st.text("") + st.text("") + + if Extract_page_no_button: + text_pdfminer = extract_text_pdfminer(file, Pdf_page_number_input) + st.session_state['extracted_text'] = text_pdfminer ### Store the extracted text in session state + + if 'extracted_text' in st.session_state: + Pdf_file_text = st.text_area( + label=f"Text data of {Pdf_page_number_input} page", + value= st.session_state['extracted_text'], + height=400 + ) + st.session_state['extracted_text'] = Pdf_file_text # Update the text in session state based on user's input + + #### pdf summarizer + st.text("") + Max_length_pdf_slider = st.slider( + label="Max Length",key="Pdf summarizer slider", + min_value=10,max_value=generate_text_para_tokens(Pdf_file_text), + value=random_text_para_value(Pdf_file_text) + ) + st.text("") + + upload_Pdf_summary_btn_col, upload_Pdf_print_btn_col, upload_clean_Pdf_print_btn_col, blank_Pdf_col1, blank_Pdf_col2 = st.columns( + [4,4,4,7,3],gap="small" + ) + + with blank_Pdf_col1: + pass + with blank_Pdf_col2: + pass + + with upload_Pdf_summary_btn_col: + Generate_upload_pdf_summary_btn = st.button( + label="Generate Summary", + key="Generate summary of uploaded text pdf" + ) + + with upload_clean_Pdf_print_btn_col: + Upload_clean_pdf_btn = st.button( + label="Print Clean Text", + key="Print clean pdf file" + ) + + + with upload_Pdf_print_btn_col: + upload_pdf_print_button = st.button( + label="Print Uploaded Text", + key="Print uploadded pdf" + ) + + ### clean text + if Upload_clean_pdf_btn: + with st.spinner("Generating Clean Text..."): + st.text("") + st.text("") + st.markdown("

Clean Text

",unsafe_allow_html=True) + st.text("") + st.write(uploaded_Clean_Text_Summarization(Pdf_file_text)) + st.text("") + copy_text(uploaded_Clean_Text_Summarization(Pdf_file_text)) + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) + + ### uploaded text + elif upload_pdf_print_button: + with st.spinner("Generating Uploaded Text..."): + st.text("") + st.text("") + st.markdown("

Uploaded Text

",unsafe_allow_html=True) + st.text("") + st.text(Pdf_file_text) + st.text("") + copy_text(Pdf_file_text) + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) + + ### generating summary + elif Generate_upload_pdf_summary_btn: + st.text("") + with st.spinner("Generating Summary..."): + st.text("") + if __name__=="__main__": + Uploded_Pdf_file_Summary = Hugingface_summarization_modal( + summary_text=uploaded_Clean_Text_Summarization(Pdf_file_text), + maximum_length=Max_length_pdf_slider, + modal_name="facebook-bart" + ) + st.markdown("

Summary

",unsafe_allow_html=True) + st.text("") + + st.write(Uploded_Pdf_file_Summary) + st.text("") + copy_text(Uploded_Pdf_file_Summary) + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) + + + + +################################################# + + +##### text file summarizer +def process_text(file): + text_file = file.read().decode("utf-8") + st.text("") + st.markdown("

Text file

",unsafe_allow_html=True) + + + ### displaying text you can edit also + Uploaded_text = st.text_area( + label=f"{file.name[:-4]} text data", + value=text_file,key="text file data", + height=400 + ) + st.write(f"**{file.name[:-4]}** Edit your file press ctrl+enter") + + ###3 if length is less than 20 + if len(Uploaded_text.split()) < 20: + st.warning("Summarization Task failed\nnot enough amount of text...",icon="⚠️") + + else: + st.text("") + #### max length slider + max_text_para_length = st.slider( + label="Max Length",min_value=10, + max_value=generate_text_para_tokens(Uploaded_text), + step=1,key="paragraph length", + value=random_text_para_value(Uploaded_text) + ) + st.text("") + + upload_text_summary_btn_col, upload_text_print_btn_col, upload_clean_text_print_btn_col, blank_text_col1, blank_text_col2 = st.columns( + [4,4,4,7,3],gap="small" + ) + + with blank_text_col1: + pass + with blank_text_col2: + pass + + with upload_text_summary_btn_col: + Generate_upload_text_summary_btn = st.button( + label="Generate Summary", + key="Generate summary of uploaded text" + ) + + with upload_clean_text_print_btn_col: + Upload_clean_text_btn = st.button( + label="Print Clean Text", + key="Print clean text file" + ) + + + with upload_text_print_btn_col: + upload_text_print_button = st.button( + label="Print Uploaded Text", + key="Print uploadded text" + ) + + ### clean text + if Upload_clean_text_btn: + with st.spinner("Generating Clean Text..."): + st.text("") + st.text("") + st.markdown("

Clean Text

",unsafe_allow_html=True) + st.text("") + st.write(uploaded_Clean_Text_Summarization(Uploaded_text)) + st.text("") + copy_text(uploaded_Clean_Text_Summarization(Uploaded_text)) + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
", + unsafe_allow_html=True) + + ### uploaded text + elif upload_text_print_button: + with st.spinner("Generating Uploaded Text..."): + st.text("") + st.text("") + st.markdown("

Uploaded Text

",unsafe_allow_html=True) + st.text("") + st.text(Uploaded_text) + st.text("") + copy_text(Uploaded_text) + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
", + unsafe_allow_html=True) + + + ### generating summary + elif Generate_upload_text_summary_btn: + st.text("") + with st.spinner("Generating Summary..."): + st.text("") + if __name__=="__main__": + Uploded_Text_file_Summary = Hugingface_summarization_modal( + summary_text=uploaded_Clean_Text_Summarization(Uploaded_text), + maximum_length=max_text_para_length, + modal_name="facebook-bart" + ) + st.markdown("

Summary

",unsafe_allow_html=True) + st.text("") + + st.write(Uploded_Text_file_Summary) + st.text("") + copy_text(Uploded_Text_file_Summary) + st.text("") + st.text("") + st.text("") + st.markdown("
Created by Nishant Maity
",unsafe_allow_html=True) + + + +if Main_menu == "PDF Summarizer": + + ### blank and app columns + Blank_pdf1 ,pdf_summarizer_col, Blank_pdf2 = st.columns([1,8,1],gap="small") + + with Blank_pdf1: + pass + with Blank_pdf2: + pass + + with pdf_summarizer_col: + st.text("") + st.header("PDF Summarizer") ### app heading + + ### File uploader function + app_file_upload = st.file_uploader("Upload a PDF or Text file", type=["pdf", "txt"]) + + if app_file_upload is not None: + + ### if pdf file + if app_file_upload.type == "application/pdf": + if __name__=="__main__": + process_pdf(app_file_upload) + + #### if text file + elif app_file_upload.type == "text/plain": + if __name__=="__main__": + process_text(app_file_upload) + + else: + st.info("Upload your pdf, text file") + + + #### app info +if Main_menu == "App Info": + Blank_app_info1, App_info_col, Blank_app_info2 = st.columns([2,8,2]) + + #### blank columns + with Blank_app_info1: + pass + with Blank_app_info2: + pass + + ### app info column + with App_info_col: + st.text("") + st.header("App Info") + st.text("") + + if __name__=="__main__": + st.markdown(insert_html("htmlfiles/app-info.html"), + unsafe_allow_html=True + ) +