"""Flask app that fetches news articles from user-selected URLs, compares
them pairwise sentence-by-sentence, clusters them by topic with TF-IDF +
KMeans, and returns short extractive summaries as JSON."""

from flask import Flask, request, render_template, jsonify
from nltk.tokenize import word_tokenize, sent_tokenize
from urllib.request import urlopen
from bs4 import BeautifulSoup
import nltk
import socket
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Sentence-tokenizer model required by sent_tokenize; fetched once at startup.
nltk.download('punkt')

app = Flask(__name__)


def simple_summary(text, max_sentences=3):
    """Return the first *max_sentences* sentences of *text*.

    Falls back to a 200-character prefix when sentence tokenization yields
    nothing (e.g. text without sentence punctuation).
    """
    sents = sent_tokenize(text)
    return " ".join(sents[:max_sentences]) if sents else text[:200] + "..."


def _fetch_articles(urls):
    """Fetch each URL and return {url: plain_text} (or an error message).

    NOTE(review): the URLs come straight from the client form, so this is
    an SSRF vector — the server will fetch any address, including internal
    ones. Restrict to an allow-list before deploying publicly.
    """
    articles = {}
    for url in urls:
        try:
            page = urlopen(url)
            soup = BeautifulSoup(page, "html.parser")
            articles[url] = soup.get_text(separator=" ")
        except Exception as e:
            # Best-effort: record the failure instead of aborting the batch.
            articles[url] = f"Error fetching: {str(e)}"
    return articles


def _compare_articles(articles):
    """Compare every pair of articles side-by-side.

    For each pair, collect the sentences of site1 that have NO close match
    (SequenceMatcher ratio >= 0.8) anywhere in site2.
    """
    comparison_results = []
    urls = list(articles.keys())
    for i in range(len(urls)):
        for j in range(i + 1, len(urls)):
            sents_a = sent_tokenize(articles[urls[i]])
            sents_b = sent_tokenize(articles[urls[j]])
            # BUGFIX: the original used any(ratio < 0.8), which flagged a
            # sentence whenever *some* sentence in b was dissimilar — true
            # for virtually every sentence. "Unique to site1" means no
            # sentence in b is similar, hence all(ratio < 0.8).
            diff_a = [
                sent for sent in sents_a
                if all(SequenceMatcher(None, sent, s).ratio() < 0.8
                       for s in sents_b)
            ]
            comparison_results.append({
                "site1": urls[i],
                "site2": urls[j],
                "diff_sentences_site1": diff_a,
            })
    return comparison_results


def _cluster_articles(articles):
    """Group article URLs into up to 3 topic clusters via TF-IDF + KMeans.

    Returns {cluster_label: [urls]}; empty dict when no articles were given.
    """
    clusters = {}
    if articles:
        urls = list(articles.keys())
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(list(articles.values()))
        # Never ask KMeans for more clusters than we have documents.
        n_clusters = min(3, len(articles))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
        for idx, label in enumerate(kmeans.labels_):
            clusters.setdefault(int(label), []).append(urls[idx])
    return clusters


@app.route("/")
def home():
    """Serve the URL-selection form."""
    return render_template("index.html")


@app.route("/process_urls", methods=["POST"])
def process_urls():
    """Fetch the user-selected URLs and return a JSON report.

    The response contains the raw article texts, pairwise sentence-level
    diffs, topic clusters, and a short summary per article.
    """
    selected_sites = request.form.getlist("sites")
    articles = _fetch_articles(selected_sites)
    comparison_results = _compare_articles(articles)
    clusters = _cluster_articles(articles)
    summaries = {url: simple_summary(text, max_sentences=3)
                 for url, text in articles.items()}
    return jsonify({
        "articles": articles,
        "comparisons": comparison_results,
        "clusters": clusters,
        "summaries": summaries,
    })


if __name__ == "__main__":
    # Global socket timeout so a hung urlopen() can't stall a request forever.
    socket.setdefaulttimeout(20)
    app.run(host="0.0.0.0", port=7860, debug=False)