Delete app.py
app.py DELETED
@@ -1,278 +0,0 @@
from flask import Flask, request, jsonify, render_template
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import nltk
import re
import socket
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import numpy as np

# Ensure NLTK data exists
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
from nltk.tokenize import word_tokenize, sent_tokenize

app = Flask(__name__)

# -------------------------
# Helper: fetch page safely
# -------------------------
def fetch_page(url, timeout=15):
    """
    Fetch URL content using urllib with a browser-like User-Agent.
    Returns cleaned text; any network or parse error propagates to the caller.
    """
    req = Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        },
    )
    resp = urlopen(req, timeout=timeout)
    raw = resp.read()
    soup = BeautifulSoup(raw, "html.parser")

    # remove scripts/styles and other non-content tags
    for tag in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
        tag.extract()

    text = soup.get_text(separator=" ")
    text = re.sub(r"\s+", " ", text).strip()
    return text

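# Illustrative call (the URL is just an example, not one the app ships with):
#
#   text = fetch_page("https://example.com/", timeout=10)
#   # -> "Example Domain This domain is for use in illustrative examples ..."
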
# -------------------------
# Helper: extract heading tag text
# -------------------------
def extract_heading_text(soup, tag):
    elements = soup.find_all(tag)
    return " ".join([el.get_text(" ", strip=True) for el in elements]).strip()

# -------------------------
# Clean / normalize text
# -------------------------
def clean_text(t):
    return re.sub(r"\s+", " ", t or "").strip()

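# How the two helpers above compose (markup is a made-up snippet):
#
#   soup = BeautifulSoup("<h1>Top story</h1><h1>Weather</h1>", "html.parser")
#   extract_heading_text(soup, "h1")   # -> "Top story Weather"
#   clean_text("  spaced \n out  ")    # -> "spaced out"
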
# -------------------------
# Summarize (extractive)
# -------------------------
def summarize(text, num_sentences=3):
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return " ".join(sentences)
    try:
        vec = TfidfVectorizer(stop_words="english")
        X = vec.fit_transform(sentences)
        scores = np.array(X.sum(axis=1)).ravel()
        top_idx = scores.argsort()[-num_sentences:][::-1]
        top_sentences = [sentences[i] for i in sorted(top_idx)]
        return " ".join(top_sentences)
    except Exception:
        return " ".join(sentences[:num_sentences])

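# Scoring sketch: each sentence is scored by the sum of its TF-IDF weights,
# the top num_sentences are kept, then re-sorted into document order, e.g.:
#
#   summarize("Rates rose. Markets fell sharply. It rained. Banks cut rates.",
#             num_sentences=2)
#   # -> two of the four sentences, joined in their original order
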
# -------------------------
# Topic clustering
# -------------------------
def cluster_texts(texts, n_clusters=3):
    if len(texts) == 0:
        return []
    if len(texts) <= 1:
        return [0] * len(texts)
    k = min(n_clusters, len(texts))
    vec = TfidfVectorizer(stop_words="english")
    X = vec.fit_transform(texts)
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
    labels = kmeans.fit_predict(X)
    return labels.tolist()

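# Example (cluster ids are arbitrary labels; the exact assignment depends on
# the TF-IDF vocabulary and the KMeans seed):
#
#   cluster_texts(["cricket score update", "election results live",
#                  "match highlights today"], n_clusters=2)
#   # -> e.g. [0, 1, 0]
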
# -------------------------
# Duplicate detection (cosine)
# -------------------------
def detect_duplicates(texts, threshold=0.55):
    n = len(texts)
    if n <= 1:
        return []
    vec = TfidfVectorizer(stop_words="english")
    X = vec.fit_transform(texts)
    sim = cosine_similarity(X)
    groups = []
    used = set()
    for i in range(n):
        if i in used:
            continue
        group = [i]
        used.add(i)
        for j in range(i + 1, n):
            # skip indexes already assigned so groups never overlap
            if j in used:
                continue
            if sim[i, j] >= threshold:
                group.append(j)
                used.add(j)
        if len(group) > 1:
            groups.append(group)
    return groups

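# Example: index groups of near-duplicates (0.55 is the cosine cutoff):
#
#   detect_duplicates(["pm speech in delhi today",
#                      "pm speech in delhi",
#                      "stock market update"])
#   # -> typically [[0, 1]]; the first two texts clear the threshold
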
# -------------------------
# Sentence-level change detection (exact-match)
# -------------------------
def changed_sentences(textA, textB):
    sA = [s.strip() for s in sent_tokenize(textA) if s.strip()]
    sB = [s.strip() for s in sent_tokenize(textB) if s.strip()]
    setA = set(sA)
    setB = set(sB)
    changedA = [s for s in sA if s not in setB]
    changedB = [s for s in sB if s not in setA]
    return changedA, changedB

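# Exact-match example: a sentence counts as changed only if its stripped text
# does not appear verbatim in the other document.
#
#   changed_sentences("A. B.", "B. C.")
#   # -> (["A."], ["C."])
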
# -------------------------
# Return hostname helper
# -------------------------
def hostname(url):
    try:
        p = urlparse(url)
        return p.netloc or url
    except Exception:
        return url

# -------------------------
# Routes
# -------------------------
@app.route("/")
def home():
    # list of preselected sites (you can add/remove)
    sites = {
        "Indian Express": "https://indianexpress.com/",
        "Times of India": "https://timesofindia.indiatimes.com/",
        "NDTV": "https://www.ndtv.com/",
        "BBC News": "https://www.bbc.com/news",
        "CNN": "https://www.cnn.com/",
        "The Hindu": "https://www.thehindu.com/",
    }
    return render_template("index.html", sites=sites)

@app.route("/process_urls", methods=["POST"])
def process_urls():
    payload = request.get_json(force=True)
    urls = payload.get("urls", []) or []
    mode = payload.get("mode", "tokenize")

    results = []
    texts_for_clustering = []

    for raw_url in urls:
        url = raw_url.strip()
        if not url:
            continue
        try:
            # fetch page raw
            req = Request(
                url,
                headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/120.0 Safari/537.36"}
            )
            resp = urlopen(req, timeout=15)
            soup = BeautifulSoup(resp.read(), "html.parser")

            # choose extraction according to mode (H1..H6 or full)
            if mode in ["H1", "H2", "H3", "H4", "H5", "H6"]:
                tag = mode.lower()
                extracted = extract_heading_text(soup, tag)
            else:
                # full text
                for tag_rm in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
                    tag_rm.extract()
                extracted = soup.get_text(separator=" ")
            extracted = clean_text(extracted)

            words = []
            sentences = []
            if extracted:
                # tokenization may throw on odd content, so guard it
                try:
                    words = word_tokenize(extracted)
                except Exception:
                    words = extracted.split()
                try:
                    sentences = sent_tokenize(extracted)
                except Exception:
                    sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', extracted) if s.strip()]

            summary = summarize(extracted) if extracted else ""
            texts_for_clustering.append(extracted)

            results.append({
                "url": url,
                "host": hostname(url),
                "text": extracted,
                "words": words,
                "sentences": sentences,
                "summary": summary,
            })
        except Exception as e:
            results.append({
                "url": url,
                "host": hostname(url),
                "text": "",
                "words": [],
                "sentences": [],
                "summary": "",
                "error": str(e)
            })

    # clustering
    texts_only = [r.get("text", "") for r in results]
    clusters = cluster_texts(texts_only, n_clusters=3) if len(texts_only) > 0 else []
    # attach clusters (pad with default 0 if sizes mismatch)
    if len(clusters) != len(results):
        clusters = [int(clusters[i]) if i < len(clusters) else 0 for i in range(len(results))]
    for i, r in enumerate(results):
        r["cluster"] = int(clusters[i]) if i < len(clusters) else 0

    # duplicate groups (convert index groups to url groups)
    dup_idx_groups = detect_duplicates(texts_only, threshold=0.55)
    dup_url_groups = [[results[i]["url"] for i in grp] for grp in dup_idx_groups]

    return jsonify({
        "articles": results,
        "duplicate_groups": dup_url_groups
    })

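# Example request/response shape for this endpoint (values illustrative):
#
#   POST /process_urls
#   {"urls": ["https://www.bbc.com/news"], "mode": "H2"}
#
#   => {"articles": [{"url": "...", "host": "www.bbc.com", "text": "...",
#                     "words": [...], "sentences": [...], "summary": "...",
#                     "cluster": 0}],
#       "duplicate_groups": []}
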
@app.route("/compare_texts", methods=["POST"])
def compare_texts_route():
    data = request.get_json(force=True)
    text1 = data.get("text1", "") or ""
    text2 = data.get("text2", "") or ""

    # compute changed sentences (exact-match)
    changedA, changedB = changed_sentences(text1, text2)

    # build html: show only changed sentences highlighted, keeping the order
    # from the original text
    def highlight_html(original_text, changed_set):
        sents = [s.strip() for s in sent_tokenize(original_text) if s.strip()]
        pieces = []
        for s in sents:
            if s in changed_set:
                pieces.append(f"<p class='changed'>{escape_html(s)}</p>")
        return "".join(pieces)

    left_html = highlight_html(text1, set(changedA))
    right_html = highlight_html(text2, set(changedB))

    return jsonify({"left": left_html, "right": right_html, "changedA_count": len(changedA), "changedB_count": len(changedB)})

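# Example request/response shape (values illustrative):
#
#   POST /compare_texts
#   {"text1": "A. B.", "text2": "B. C."}
#
#   => {"left": "<p class='changed'>A.</p>",
#       "right": "<p class='changed'>C.</p>",
#       "changedA_count": 1, "changedB_count": 1}
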
# small helper used in templates/JS if needed
def escape_html(s):
    return (s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
            .replace('"', "&quot;").replace("'", "&#39;"))

if __name__ == "__main__":
    # increase the default socket timeout a bit
    socket.setdefaulttimeout(20)
    app.run(debug=True)