Spaces:

Kawthar12h
/

Text_Summarization

Build error

App Files Files Community

Kawthar12h committed on Sep 14, 2024

Commit

9270041

verified ·

1 Parent(s): 584e102

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -59

app.py CHANGED Viewed

@@ -1,65 +1,27 @@
-!pip install transformers
-!pip install torch
-!pip install gradio
 import gradio as gr
 from transformers import pipeline
 import torch
 from bs4 import BeautifulSoup
 import requests
 def summarize_article(url, min_len, max_len):
-  #Create summarization pipeline
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
     try:
-        # Send an HTTP GET request to the user-supplied URL and retrieve the web page content
         r = requests.get(url)
-        # Create a BeautifulSoup object that parses the HTML so the text content of the web page can be extracted
         soup = BeautifulSoup(r.text, 'html.parser')
-        # Find all the <h1> (header) and <p> (paragraph) elements in the HTML content
-        results = soup.find_all(['h1','p'])
-        # Extract the text content from each element and store it in a list called text
         text = [result.text for result in results]
-        # joins all the extracted text into a single string, representing the entire article
         ARTICLE = ' '.join(text)
-        # Append a special token (<eos>) after sentence-ending punctuation. This helps split the article into smaller chunks for summarization.
         ARTICLE = ARTICLE.replace('\n', '')
         ARTICLE = ARTICLE.replace('.', '.<eos>')
         ARTICLE = ARTICLE.replace('?', '?<eos>')
         ARTICLE = ARTICLE.replace('!', '!<eos>')
-        # Splits the article into sentences based on the <eos> token and stores them in a list called sentences.
         sentences = ARTICLE.split('<eos>')
-        # Sets the maximum length (in words) for each chunk of text during summarization.
-        max_chunk = 500
-        # Initializes a variable to keep track of the current chunk being processed
         current_chunk = 0
-        # Creates an empty list called chunks to store the individual chunks of text
         chunks = []
-        # For loop iterates through each sentence in the sentences list
-        '''If the length of the current chunk (in terms of words) plus the length of the current sentence (split by spaces) is less than or equal to the max_chunk length:
-        The sentence is added to the current chunk.
-        Otherwise:
-        The current_chunk index is incremented to move to the next chunk.
-        A new chunk is created, and the current sentence becomes the first sentence in this new chunk.
-        The current chunk is appended to the chunks list.
-        '''
         for sentence in sentences:
             if len(chunks) == current_chunk + 1:
                 if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
@@ -70,33 +32,25 @@ def summarize_article(url, min_len, max_len):
             else:
                 chunks.append(sentence.split(' '))
-        ''' After processing all sentences, the loop iterates through each chunk,
-        to ensures that each chunk is represented as a single string (rather than a list of words).
-        '''
         for chunk_id in range(len(chunks)):
             chunks[chunk_id] = ' '.join(chunks[chunk_id])
-        # Apply summarization to each chunk, with the summary length bounded by min_len and max_len
-        res = summarizer(chunks, max_length = max_len, min_length = min_len, do_sample=False)
-        # Extracting the 'summary_text' value from each summary in the res list
         summary = ' '.join([summ['summary_text'] for summ in res])
         return summary
-    # Handle potential errors during web request or parsing
-    except Exception as e:
         return f"Error: {str(e)}"
-# Create Gradio Interface
-interface = gr.Interface(
-    fn=summarize_article,
-    inputs=[
-        gr.Textbox(label="Enter the article URL"),
-        gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length"),
-        gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
-    ],
-    outputs=gr.Textbox(label="Summary")
-)
-interface.launch()

 import gradio as gr
 from transformers import pipeline
 import torch
 from bs4 import BeautifulSoup
 import requests
 def summarize_article(url, min_len, max_len):
     summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
     try:
         r = requests.get(url)
         soup = BeautifulSoup(r.text, 'html.parser')
+        results = soup.find_all(['h1', 'p'])
         text = [result.text for result in results]
         ARTICLE = ' '.join(text)
         ARTICLE = ARTICLE.replace('\n', '')
         ARTICLE = ARTICLE.replace('.', '.<eos>')
         ARTICLE = ARTICLE.replace('?', '?<eos>')
         ARTICLE = ARTICLE.replace('!', '!<eos>')
         sentences = ARTICLE.split('<eos>')
         current_chunk = 0
         chunks = []
         for sentence in sentences:
             if len(chunks) == current_chunk + 1:
                 if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
             else:
                 chunks.append(sentence.split(' '))
         for chunk_id in range(len(chunks)):
             chunks[chunk_id] = ' '.join(chunks[chunk_id])
+        res = summarizer(chunks, max_length=max_len, min_length=min_len, do_sample=False)
         summary = ' '.join([summ['summary_text'] for summ in res])
         return summary
+    except Exception as e:  # Handle potential errors during web request or parsing
         return f"Error: {str(e)}"
+with gr.Blocks() as iface:
+    url_input = gr.Textbox(label="Enter the article URL")
+    min_len_slider = gr.Slider(minimum=10, maximum=100, step=1, label="Adjust Minimum Length")
+    max_len_slider = gr.Slider(minimum=50, maximum=1000, step=1, label="Adjust Maximum Length")
+    summary_output = gr.Textbox(label="Summary")
+    btn = gr.Button("Summarize")
+    btn.click(fn=summarize_article, inputs=[url_input, min_len_slider, max_len_slider], outputs=summary_output)
+iface.launch()