Spaces:

Gagan0141
/

Webscraping

Sleeping

App Files Files Community

Gagan0141 committed on Nov 6, 2025

Commit

f21fee7

verified ·

1 Parent(s): 47cae26

Update app.py

Browse files

Files changed (1) hide show

app.py +87 -85

app.py CHANGED Viewed

@@ -1,85 +1,87 @@
-from flask import Flask, request, render_template, jsonify
-from nltk.tokenize import word_tokenize, sent_tokenize
-from urllib.request import urlopen
-from bs4 import BeautifulSoup
-import nltk
-from difflib import SequenceMatcher
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-nltk.download('punkt')
-app = Flask(__name__)
-def simple_summary(text, max_sentences=3):
-    sents = sent_tokenize(text)
-    return " ".join(sents[:max_sentences]) if sents else text[:200] + "..."
-@app.route("/")
-def home():
-    return render_template("index.html")
-@app.route("/process_urls", methods=["POST"])
-def process_urls():
-    data = request.form
-    selected_sites = request.form.getlist("sites")
-    mode = data.get("mode", "tokenize")
-    articles = {}
-    for url in selected_sites:
-        try:
-            page = urlopen(url)
-            soup = BeautifulSoup(page, "html.parser")
-            text = soup.get_text(separator=" ")
-            articles[url] = text
-        except Exception as e:
-            articles[url] = f"Error fetching: {str(e)}"
-    # -----------------------------
-    # Compare articles side-by-side
-    # -----------------------------
-    comparison_results = []
-    urls = list(articles.keys())
-    for i in range(len(urls)):
-        for j in range(i+1, len(urls)):
-            a, b = articles[urls[i]], articles[urls[j]]
-            sents_a, sents_b = sent_tokenize(a), sent_tokenize(b)
-            diff_a = []
-            for sent in sents_a:
-                if any(SequenceMatcher(None, sent, s).ratio() < 0.8 for s in sents_b):
-                    diff_a.append(sent)
-            comparison_results.append({
-                "site1": urls[i],
-                "site2": urls[j],
-                "diff_sentences_site1": diff_a
-            })
-    # -----------------------------
-    # Cluster articles by topic
-    # -----------------------------
-    clusters = {}
-    if len(articles) > 0:
-        vectorizer = TfidfVectorizer(stop_words='english')
-        X = vectorizer.fit_transform(list(articles.values()))
-        n_clusters = min(3, len(articles))
-        kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
-        for idx, label in enumerate(kmeans.labels_):
-            clusters.setdefault(int(label), []).append(urls[idx])
-    # -----------------------------
-    # Summarize each article using NLTK simple summary
-    # -----------------------------
-    summaries = {}
-    for url, text in articles.items():
-        summaries[url] = simple_summary(text, max_sentences=3)
-    return jsonify({
-        "articles": articles,
-        "comparisons": comparison_results,
-        "clusters": clusters,
-        "summaries": summaries
-    })
-if __name__ == "__main__":
-    app.run(debug=True)

+from flask import Flask, request, render_template, jsonify
+from nltk.tokenize import word_tokenize, sent_tokenize
+from urllib.request import urlopen
+from bs4 import BeautifulSoup
+import nltk
+from difflib import SequenceMatcher
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+# Fetch the Punkt sentence-tokenizer data used by sent_tokenize().
+# NLTK 3.9+ loads the "punkt_tab" package instead of the legacy pickled
+# "punkt" model, so both are requested to work across NLTK versions.
+for _punkt_resource in ("punkt", "punkt_tab"):
+    nltk.download(_punkt_resource)
+# The Flask application object that the route decorators below register on.
+app = Flask(__name__)
+def simple_summary(text, max_sentences=3):
+    """Return the leading sentences of ``text`` as a short summary.
+
+    The text is split with ``sent_tokenize`` and the first ``max_sentences``
+    sentences are joined with single spaces. When the tokenizer finds no
+    sentences at all, the first 200 characters of ``text`` followed by
+    "..." are returned instead.
+    """
+    sentences = sent_tokenize(text)
+    if not sentences:
+        return text[:200] + "..."
+    leading = sentences[:max_sentences]
+    return " ".join(leading)
+@app.route("/")
+def home():
+    """Serve the landing page rendered from the ``index.html`` template."""
+    landing_page = render_template("index.html")
+    return landing_page
+def _has_close_match(sentence, candidates, threshold=0.8):
+    """Return True when some candidate is at least ``threshold`` similar to ``sentence``."""
+    for candidate in candidates:
+        matcher = SequenceMatcher(None, sentence, candidate)
+        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
+        # ratio(), so the expensive ratio() only runs for pairs that could
+        # still reach the threshold.
+        if (
+            matcher.real_quick_ratio() >= threshold
+            and matcher.quick_ratio() >= threshold
+            and matcher.ratio() >= threshold
+        ):
+            return True
+    return False
+@app.route("/process_urls", methods=["POST"])
+def process_urls():
+    """Fetch the submitted URLs and return comparisons, clusters and summaries.
+
+    Reads every ``sites`` value from the posted form, downloads each page and
+    extracts its text, then responds with a JSON object containing:
+
+    * ``articles``    -- URL -> extracted page text, or an
+      ``"Error fetching: ..."`` message when the page could not be retrieved.
+    * ``comparisons`` -- one entry per URL pair listing the sentences of the
+      first site that have no counterpart at least 80% similar in the second.
+    * ``clusters``    -- KMeans cluster label -> list of URLs (at most three
+      clusters, built from TF-IDF vectors of the article texts).
+    * ``summaries``   -- URL -> the first three sentences of its text.
+
+    The ``mode`` form field may be posted but has no effect on the result.
+    """
+    selected_sites = request.form.getlist("sites")
+    articles = {}
+    for url in selected_sites:
+        # urlopen() also understands file:// and other schemes; only fetch web
+        # URLs so a request cannot read files from the server's own disk.
+        # NOTE(review): internal http(s) hosts are still reachable (SSRF);
+        # consider an allow-list of hosts if this is exposed publicly.
+        if not url.strip().lower().startswith(("http://", "https://")):
+            articles[url] = "Error fetching: unsupported URL scheme"
+            continue
+        try:
+            # The context manager closes the connection even when parsing
+            # fails; the timeout keeps one unresponsive site from hanging
+            # the whole request.
+            with urlopen(url, timeout=20) as page:
+                soup = BeautifulSoup(page, "html.parser")
+            articles[url] = soup.get_text(separator=" ")
+        except Exception as e:
+            # Best effort: report the failure in place of the article text.
+            articles[url] = f"Error fetching: {str(e)}"
+    # -----------------------------
+    # Compare articles side-by-side
+    # -----------------------------
+    comparison_results = []
+    urls = list(articles.keys())
+    # Tokenize each article once instead of once per pair.
+    sentences_by_url = {url: sent_tokenize(text) for url, text in articles.items()}
+    for i, first_url in enumerate(urls):
+        sents_a = sentences_by_url[first_url]
+        for second_url in urls[i + 1:]:
+            sents_b = sentences_by_url[second_url]
+            # A sentence counts as different only when NO sentence of the
+            # other site is similar to it (the previous any(ratio < 0.8)
+            # test flagged a sentence as soon as a single one differed).
+            diff_a = [s for s in sents_a if not _has_close_match(s, sents_b)]
+            comparison_results.append({
+                "site1": first_url,
+                "site2": second_url,
+                "diff_sentences_site1": diff_a
+            })
+    # -----------------------------
+    # Cluster articles by topic
+    # -----------------------------
+    clusters = {}
+    if articles:
+        try:
+            vectorizer = TfidfVectorizer(stop_words='english')
+            X = vectorizer.fit_transform(list(articles.values()))
+            n_clusters = min(3, len(articles))
+            labels = KMeans(n_clusters=n_clusters, random_state=42).fit(X).labels_
+        except ValueError:
+            # Raised e.g. when the texts contain no usable vocabulary (empty
+            # pages or stop words only); return no clusters instead of a 500.
+            labels = []
+        for idx, label in enumerate(labels):
+            clusters.setdefault(int(label), []).append(urls[idx])
+    # -----------------------------
+    # Summarize each article using NLTK simple summary
+    # -----------------------------
+    summaries = {
+        url: simple_summary(text, max_sentences=3)
+        for url, text in articles.items()
+    }
+    return jsonify({
+        "articles": articles,
+        "comparisons": comparison_results,
+        "clusters": clusters,
+        "summaries": summaries
+    })
+if __name__ == "__main__":
+    # Imported here because the module's import block does not bring in
+    # ``socket``; without it the call below raises NameError at startup.
+    import socket
+
+    # Default timeout (seconds) for sockets created from now on; urlopen()
+    # calls that pass no explicit timeout fall back to this value.
+    socket.setdefaulttimeout(20)
+    # Listen on all interfaces on port 7860 with the Flask debugger disabled.
+    app.run(host="0.0.0.0", port=7860, debug=False)