"""
@file app.py
@description Main application entry point for the Text Summarizer. Handles Flask routes,
integrates summarization algorithms (SpaCy, NLTK, Gensim, Sumy), and manages data processing
for text and URL inputs.
@author Amey Thakur <https://github.com/Amey-Thakur>
@author Mega Satish <https://github.com/msatmod>
@created 2022-08-09
@repository https://github.com/Amey-Thakur/TEXT-SUMMARIZER
@license MIT
"""
from __future__ import unicode_literals
from flask import Flask,render_template,url_for,request
# Import proprietary and third-party summarization modules
from spacy_summarization import text_summarizer # SpaCy-based summarization logic
from summa.summarizer import summarize # Summa (Gensim TextRank fork) implementation
from nltk_summarization import nltk_summarizer # NLTK frequency-based summarization
import time
import spacy
# Initialize SpaCy's English model for Natural Language Processing task
nlp = spacy.load("en_core_web_sm")
app = Flask(__name__)
# Web Scraping Pkg
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Sumy Package Imports for LexRank Algorithm
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
def sumy_summary(docx, sentence_count=3):
    """
    Generates an extractive summary using the LexRank algorithm provided by Sumy.

    @param docx (str): The input text document to be summarized.
    @param sentence_count (int): Number of top-ranked sentences to keep
        (default 3, preserving the original behavior).
    @return result (str): The concatenated summary string containing the
        top-ranked sentences, joined by single spaces.
    """
    parser = PlaintextParser.from_string(docx, Tokenizer("english"))
    lex_summarizer = LexRankSummarizer()
    summary = lex_summarizer(parser.document, sentence_count)
    # Sumy yields Sentence objects; stringify before joining
    result = ' '.join(str(sentence) for sentence in summary)
    return result
def readingTime(mytext, wpm=200.0):
    """
    Estimates the reading time for a given text based on average reading speed.

    @param mytext (str): The input text to analyze.
    @param wpm (float): Assumed reading speed in words per minute
        (default 200.0, preserving the original behavior).
    @return estimatedTime (float): The estimated reading time in minutes.
    """
    # NOTE: SpaCy tokenization counts punctuation tokens as "words" too,
    # matching the original implementation's word count.
    total_words = len([token.text for token in nlp(mytext)])
    estimatedTime = total_words / wpm
    return estimatedTime
# Fetch Text From Url
def get_text(url):
    """
    Scrapes and processes textual content from a valid URL.

    @param url (str): The HTTP(S) URL of the target webpage.
    @return fetched_text (str): The text content of all <p> tags,
        joined with single spaces.
    """
    # Context manager ensures the HTTP response is closed even on error
    # (the original implementation leaked the connection).
    with urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")  # Parse HTML content
    fetched_text = ' '.join(p.text for p in soup.find_all('p'))
    return fetched_text
@app.route('/')
def index():
    """Render the landing page containing the text-input form."""
    template_name = 'index.html'
    return render_template(template_name)
@app.route('/analyze', methods=['GET', 'POST'])
def analyze():
    """
    Route to handle direct text input for summarization.

    On POST: reads 'rawtext' from the form, computes reading-time metrics for
    the original text and its SpaCy-generated summary, and renders the results.
    On GET: renders the blank input form. (The original fell through and
    returned None on GET, which made Flask raise a 500 error.)
    """
    start = time.time()
    if request.method == 'POST':
        rawtext = request.form['rawtext']
        # Calculate metrics for original text
        final_reading_time = readingTime(rawtext)
        # Generate summary using the SpaCy-based custom algorithm
        final_summary = text_summarizer(rawtext)
        summary_reading_time = readingTime(final_summary)
        end = time.time()
        final_time = end - start
        return render_template('index.html', ctext=rawtext,
                               final_summary=final_summary,
                               final_time=final_time,
                               final_reading_time=final_reading_time,
                               summary_reading_time=summary_reading_time)
    # GET: show the empty form instead of returning None
    return render_template('index.html')
@app.route('/analyze_url', methods=['GET', 'POST'])
def analyze_url():
    """
    Route to handle URL-based input for summarization.

    On POST: fetches the page at 'raw_url', extracts its paragraph text,
    summarizes it, and renders the results with reading-time metrics.
    On GET: renders the blank input form. (The original fell through and
    returned None on GET, which made Flask raise a 500 error.)
    """
    start = time.time()
    if request.method == 'POST':
        raw_url = request.form['raw_url']
        # Scrape paragraph text from the target page
        rawtext = get_text(raw_url)
        final_reading_time = readingTime(rawtext)
        final_summary = text_summarizer(rawtext)
        summary_reading_time = readingTime(final_summary)
        end = time.time()
        final_time = end - start
        return render_template('index.html', ctext=rawtext,
                               final_summary=final_summary,
                               final_time=final_time,
                               final_reading_time=final_reading_time,
                               summary_reading_time=summary_reading_time)
    # GET: show the empty form instead of returning None
    return render_template('index.html')
@app.route('/compare_summary')
def compare_summary():
    """Render the page for side-by-side comparison of the summarizers."""
    template_name = 'compare_summary.html'
    return render_template(template_name)
@app.route('/comparer', methods=['GET', 'POST'])
def comparer():
    """
    Comparative analysis route.

    On POST: runs all four summarization algorithms (SpaCy, Gensim/Summa,
    NLTK, Sumy) on the same input text so their outputs and reading times
    can be compared side by side. Each algorithm is wrapped in its own
    try/except so one failure does not block the others.
    On GET: renders the blank comparison form. (The original fell through
    and returned None on GET, which made Flask raise a 500 error.)
    """
    # Single shared fallback message (identical to the original string)
    error_msg = "Error: Text too short or processing failed."
    start = time.time()
    if request.method == 'POST':
        rawtext = request.form['rawtext']
        final_reading_time = readingTime(rawtext)
        # 1. SpaCy Summarizer
        try:
            final_summary_spacy = text_summarizer(rawtext)
            summary_reading_time = readingTime(final_summary_spacy)
        except Exception:
            final_summary_spacy = error_msg
            summary_reading_time = 0
        # 2. Gensim Summarizer (Summa)
        try:
            final_summary_gensim = summarize(rawtext)
            summary_reading_time_gensim = readingTime(final_summary_gensim)
        except Exception:
            final_summary_gensim = error_msg
            summary_reading_time_gensim = 0
        # 3. NLTK Summarizer (Frequency Dist)
        try:
            final_summary_nltk = nltk_summarizer(rawtext)
            summary_reading_time_nltk = readingTime(final_summary_nltk)
        except Exception:
            final_summary_nltk = error_msg
            summary_reading_time_nltk = 0
        # 4. Sumy Summarizer (LexRank)
        try:
            final_summary_sumy = sumy_summary(rawtext)
            summary_reading_time_sumy = readingTime(final_summary_sumy)
        except Exception:
            final_summary_sumy = error_msg
            summary_reading_time_sumy = 0
        end = time.time()
        final_time = end - start
        return render_template('compare_summary.html', ctext=rawtext,
                               final_summary_spacy=final_summary_spacy,
                               final_summary_gensim=final_summary_gensim,
                               final_summary_nltk=final_summary_nltk,
                               final_time=final_time,
                               final_reading_time=final_reading_time,
                               summary_reading_time=summary_reading_time,
                               summary_reading_time_gensim=summary_reading_time_gensim,
                               final_summary_sumy=final_summary_sumy,
                               summary_reading_time_sumy=summary_reading_time_sumy,
                               summary_reading_time_nltk=summary_reading_time_nltk)
    # GET: show the empty comparison form instead of returning None
    return render_template('compare_summary.html')
@app.route('/about')
def about():
    """About route; currently reuses the main index template.

    NOTE(review): there is no dedicated about template visible in this
    file — confirm whether 'index.html' is intentional here.
    """
    template_name = 'index.html'
    return render_template(template_name)
@app.errorhandler(404)
def page_not_found(e):
    """Render the custom 404 page along with the proper status code."""
    body = render_template('404.html')
    return body, 404
if __name__ == '__main__':
    # Debug mode is for local development only; disable in production.
    # (Removed a stray trailing "|" artifact that broke the file's syntax.)
    app.run(debug=True)