from flask import Flask, request, render_template, jsonify
from nltk.tokenize import word_tokenize, sent_tokenize
from urllib.request import urlopen
from bs4 import BeautifulSoup
import nltk
import socket
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

nltk.download('punkt')

app = Flask(__name__)
def simple_summary(text, max_sentences=3):
    """Naive extractive summary: return the first few sentences of the text."""
    sents = sent_tokenize(text)
    return " ".join(sents[:max_sentences]) if sents else text[:200] + "..."
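# A minimal usage sketch (illustrative input, not from the app's data):
#   simple_summary("One. Two. Three. Four.", max_sentences=3)
#   -> "One. Two. Three."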
@app.route("/")  # serves the form page; route path assumed
def home():
    return render_template("index.html")
@app.route("/process", methods=["POST"])  # endpoint path assumed
def process_urls():
    data = request.form
    selected_sites = request.form.getlist("sites")
    mode = data.get("mode", "tokenize")  # reserved; not used below

    # Fetch each selected URL and extract its visible text
    articles = {}
    for url in selected_sites:
        try:
            page = urlopen(url)
            soup = BeautifulSoup(page, "html.parser")
            text = soup.get_text(separator=" ")
            articles[url] = text
        except Exception as e:
            articles[url] = f"Error fetching: {str(e)}"
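    # NOTE: urlopen() sets no per-call timeout; it inherits the 20 s default
    # installed via socket.setdefaulttimeout() in the __main__ block below.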
    # -----------------------------
    # Compare articles side-by-side
    # -----------------------------
    comparison_results = []
    urls = list(articles.keys())
    for i in range(len(urls)):
        for j in range(i + 1, len(urls)):
            a, b = articles[urls[i]], articles[urls[j]]
            sents_a, sents_b = sent_tokenize(a), sent_tokenize(b)
            # Keep sentences from article A that have no close match
            # (ratio >= 0.8) anywhere in article B.
            diff_a = []
            for sent in sents_a:
                if all(SequenceMatcher(None, sent, s).ratio() < 0.8 for s in sents_b):
                    diff_a.append(sent)
            comparison_results.append({
                "site1": urls[i],
                "site2": urls[j],
                "diff_sentences_site1": diff_a
            })
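    # SequenceMatcher.ratio() is 2*M/T, where M counts matched characters and
    # T is the combined length of both strings; e.g. (illustrative strings)
    #   SequenceMatcher(None, "abcd", "abce").ratio() == 0.75
    # so the 0.8 threshold above only treats near-identical sentences as matches.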
    # -----------------------------
    # Cluster articles by topic
    # -----------------------------
    clusters = {}
    if len(articles) > 0:
        vectorizer = TfidfVectorizer(stop_words='english')
        X = vectorizer.fit_transform(list(articles.values()))
        n_clusters = min(3, len(articles))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
        for idx, label in enumerate(kmeans.labels_):
            clusters.setdefault(int(label), []).append(urls[idx])
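    # Resulting shape (illustrative values):
    #   {0: ["https://site-a", "https://site-b"], 1: ["https://site-c"]}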
    # -----------------------------
    # Summarize each article with simple_summary()
    # -----------------------------
    summaries = {}
    for url, text in articles.items():
        summaries[url] = simple_summary(text, max_sentences=3)

    return jsonify({
        "articles": articles,
        "comparisons": comparison_results,
        "clusters": clusters,
        "summaries": summaries
    })
if __name__ == "__main__":
    # Apply a 20-second default timeout to all sockets, including urlopen()
    socket.setdefaulttimeout(20)
    app.run(host="0.0.0.0", port=7860, debug=False)
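# A minimal sketch of exercising the endpoint from Python, assuming the
# "/process" route and form field names used above (illustrative only):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:7860/process",
#       data={"sites": ["https://example.com/a", "https://example.com/b"],
#             "mode": "tokenize"},
#   )
#   print(resp.json()["summaries"])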