from flask import Flask, request, render_template, jsonify
from nltk.tokenize import sent_tokenize
from urllib.request import urlopen
from bs4 import BeautifulSoup
import nltk
import socket
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Fetch the Punkt sentence tokenizer data that sent_tokenize relies on.
nltk.download('punkt')

app = Flask(__name__)

def simple_summary(text, max_sentences=3):
    sents = sent_tokenize(text)
    return " ".join(sents[:max_sentences]) if sents else text[:200] + "..."
@app.route("/")
def home():
return render_template("index.html")
@app.route("/process_urls", methods=["POST"])
def process_urls():
data = request.form
selected_sites = request.form.getlist("sites")
mode = data.get("mode", "tokenize")
articles = {}
for url in selected_sites:
try:
page = urlopen(url)
soup = BeautifulSoup(page, "html.parser")
text = soup.get_text(separator=" ")
articles[url] = text
except Exception as e:
articles[url] = f"Error fetching: {str(e)}"

    # -----------------------------
    # Compare articles side-by-side
    # -----------------------------
    comparison_results = []
    urls = list(articles.keys())
    for i in range(len(urls)):
        for j in range(i + 1, len(urls)):
            a, b = articles[urls[i]], articles[urls[j]]
            sents_a, sents_b = sent_tokenize(a), sent_tokenize(b)
            diff_a = []
            for sent in sents_a:
                # A sentence counts as unique to site1 only when no sentence
                # on site2 is similar to it, i.e. every similarity ratio
                # falls below 0.8.
                if all(SequenceMatcher(None, sent, s).ratio() < 0.8 for s in sents_b):
                    diff_a.append(sent)
            comparison_results.append({
                "site1": urls[i],
                "site2": urls[j],
                "diff_sentences_site1": diff_a
            })
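    # Note: each URL pair costs len(sents_a) * len(sents_b) SequenceMatcher
    # comparisons, so this step can dominate runtime on long pages; the 0.8
    # similarity threshold is a heuristic.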

    # -----------------------------
    # Cluster articles by topic
    # -----------------------------
    clusters = {}
    if len(articles) > 0:
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(list(articles.values()))
        n_clusters = min(3, len(articles))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
        for idx, label in enumerate(kmeans.labels_):
            clusters.setdefault(int(label), []).append(urls[idx])
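    # clusters maps each KMeans label to the list of URLs whose TF-IDF
    # vectors were assigned to that cluster; the cluster count is capped at 3
    # (or the number of articles, if fewer).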

    # -----------------------------
    # Summarize each article using NLTK simple summary
    # -----------------------------
    summaries = {}
    for url, text in articles.items():
        summaries[url] = simple_summary(text, max_sentences=3)

    return jsonify({
        "articles": articles,
        "comparisons": comparison_results,
        "clusters": clusters,
        "summaries": summaries
    })
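
# Request sketch (hypothetical form fields; the frontend posts "sites" as
# repeated values, which is what getlist("sites") expects):
#   curl -X POST http://localhost:7860/process_urls \
#        -d "sites=https://example.com/a" -d "sites=https://example.com/b"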
if __name__ == "__main__":
socket.setdefaulttimeout(20)
app.run(host="0.0.0.0", port=7860, debug=False)