Gagan0141 committed on
Commit 75d4851 · verified · 1 Parent(s): 505a450

Delete app.py

Files changed (1)
  1. app.py +0 -278
app.py DELETED
@@ -1,278 +0,0 @@
- from flask import Flask, request, jsonify, render_template
- from urllib.request import Request, urlopen
- from bs4 import BeautifulSoup
- import nltk
- import re
- import socket
- from urllib.parse import urlparse
- from sklearn.feature_extraction.text import TfidfVectorizer
- from sklearn.metrics.pairwise import cosine_similarity
- from sklearn.cluster import KMeans
- import numpy as np
-
- # Ensure NLTK data exists
- nltk.download("punkt", quiet=True)
- nltk.download("punkt_tab", quiet=True)
- from nltk.tokenize import word_tokenize, sent_tokenize
-
- app = Flask(__name__)
-
- # -------------------------
- # Helper: fetch page safely
- # -------------------------
- def fetch_page(url, timeout=15):
-     """
-     Fetch URL content using urllib with a browser-like User-Agent.
-     Returns cleaned text; any fetch or parse error propagates to the caller.
-     """
-     req = Request(
-         url,
-         headers={
-             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                           "AppleWebKit/537.36 (KHTML, like Gecko) "
-                           "Chrome/120.0 Safari/537.36"
-         },
-     )
-     resp = urlopen(req, timeout=timeout)
-     raw = resp.read()
-     soup = BeautifulSoup(raw, "html.parser")
-
-     # remove scripts, styles, and other non-content tags
-     for tag in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
-         tag.extract()
-
-     text = soup.get_text(separator=" ")
-     text = re.sub(r"\s+", " ", text).strip()
-     return text
-
- # -------------------------
- # Helper: extract heading tag text
- # -------------------------
- def extract_heading_text(soup, tag):
-     elements = soup.find_all(tag)
-     return " ".join([el.get_text(" ", strip=True) for el in elements]).strip()
-
- # -------------------------
- # Clean / normalize text
- # -------------------------
- def clean_text(t):
-     return re.sub(r"\s+", " ", t or "").strip()
-
- # -------------------------
- # Summarize (extractive)
- # -------------------------
- def summarize(text, num_sentences=3):
-     sentences = sent_tokenize(text)
-     if len(sentences) <= num_sentences:
-         return " ".join(sentences)
-     try:
-         vec = TfidfVectorizer(stop_words="english")
-         X = vec.fit_transform(sentences)
-         scores = np.array(X.sum(axis=1)).ravel()
-         top_idx = scores.argsort()[-num_sentences:][::-1]
-         top_sentences = [sentences[i] for i in sorted(top_idx)]
-         return " ".join(top_sentences)
-     except Exception:
-         return " ".join(sentences[:num_sentences])
-
- # -------------------------
- # Topic clustering
- # -------------------------
- def cluster_texts(texts, n_clusters=3):
-     if len(texts) == 0:
-         return []
-     if len(texts) <= 1:
-         return [0] * len(texts)
-     k = min(n_clusters, len(texts))
-     vec = TfidfVectorizer(stop_words="english")
-     try:
-         X = vec.fit_transform(texts)
-     except ValueError:
-         # e.g. empty vocabulary when every text is blank
-         return [0] * len(texts)
-     kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
-     labels = kmeans.fit_predict(X)
-     return labels.tolist()
-
- # -------------------------
- # Duplicate detection (cosine)
- # -------------------------
- def detect_duplicates(texts, threshold=0.55):
-     n = len(texts)
-     if n <= 1:
-         return []
-     vec = TfidfVectorizer(stop_words="english")
-     try:
-         X = vec.fit_transform(texts)
-     except ValueError:
-         # e.g. empty vocabulary when every text is blank
-         return []
-     sim = cosine_similarity(X)
-     groups = []
-     used = set()
-     for i in range(n):
-         if i in used:
-             continue
-         group = [i]
-         used.add(i)
-         for j in range(i + 1, n):
-             if sim[i, j] >= threshold:
-                 group.append(j)
-                 used.add(j)
-         if len(group) > 1:
-             groups.append(group)
-     return groups
-
- # -------------------------
- # Sentence-level change detection (exact match)
- # -------------------------
- def changed_sentences(textA, textB):
-     sA = [s.strip() for s in sent_tokenize(textA) if s.strip()]
-     sB = [s.strip() for s in sent_tokenize(textB) if s.strip()]
-     setA = set(sA)
-     setB = set(sB)
-     changedA = [s for s in sA if s not in setB]
-     changedB = [s for s in sB if s not in setA]
-     return changedA, changedB
-
- # -------------------------
- # Return hostname helper
- # -------------------------
- def hostname(url):
-     try:
-         p = urlparse(url)
-         return p.netloc or url
-     except Exception:
-         return url
-
- # -------------------------
- # Routes
- # -------------------------
- @app.route("/")
- def home():
-     # list of preselected sites (you can add/remove)
-     sites = {
-         "Indian Express": "https://indianexpress.com/",
-         "Times of India": "https://timesofindia.indiatimes.com/",
-         "NDTV": "https://www.ndtv.com/",
-         "BBC News": "https://www.bbc.com/news",
-         "CNN": "https://www.cnn.com/",
-         "The Hindu": "https://www.thehindu.com/",
-     }
-     return render_template("index.html", sites=sites)
-
- @app.route("/process_urls", methods=["POST"])
- def process_urls():
-     payload = request.get_json(force=True)
-     urls = payload.get("urls", []) or []
-     mode = payload.get("mode", "tokenize")
-
-     results = []
-     texts_for_clustering = []
-
-     for raw_url in urls:
-         url = raw_url.strip()
-         if not url:
-             continue
-         try:
-             # fetch the raw page
-             req = Request(
-                 url,
-                 headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-                                        "AppleWebKit/537.36 (KHTML, like Gecko) "
-                                        "Chrome/120.0 Safari/537.36"}
-             )
-             resp = urlopen(req, timeout=15)
-             soup = BeautifulSoup(resp.read(), "html.parser")
-
-             # choose extraction according to mode (H1..H6 or full text)
-             if mode in ["H1", "H2", "H3", "H4", "H5", "H6"]:
-                 tag = mode.lower()
-                 extracted = extract_heading_text(soup, tag)
-             else:
-                 # full text
-                 for tag_rm in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
-                     tag_rm.extract()
-                 extracted = soup.get_text(separator=" ")
-             extracted = clean_text(extracted)
-
-             words = []
-             sentences = []
-             if extracted:
-                 # tokenization may fail on unusual content, so guard it
-                 try:
-                     words = word_tokenize(extracted)
-                 except Exception:
-                     words = extracted.split()
-                 try:
-                     sentences = sent_tokenize(extracted)
-                 except Exception:
-                     sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', extracted) if s.strip()]
-
-             summary = summarize(extracted) if extracted else ""
-             texts_for_clustering.append(extracted)
-
-             results.append({
-                 "url": url,
-                 "host": hostname(url),
-                 "text": extracted,
-                 "words": words,
-                 "sentences": sentences,
-                 "summary": summary,
-             })
-         except Exception as e:
-             results.append({
-                 "url": url,
-                 "host": hostname(url),
-                 "text": "",
-                 "words": [],
-                 "sentences": [],
-                 "summary": "",
-                 "error": str(e)
-             })
-
-     # clustering
-     texts_only = [r.get("text", "") for r in results]
-     clusters = cluster_texts(texts_only, n_clusters=3) if len(texts_only) > 0 else []
-     # attach clusters (pad with a default of 0 if sizes mismatch)
-     if len(clusters) != len(results):
-         clusters = [int(clusters[i]) if i < len(clusters) else 0 for i in range(len(results))]
-     for i, r in enumerate(results):
-         r["cluster"] = int(clusters[i]) if i < len(clusters) else 0
-
-     # duplicate groups (convert index groups to URL groups)
-     dup_idx_groups = detect_duplicates(texts_only, threshold=0.55)
-     dup_url_groups = [[results[i]["url"] for i in grp] for grp in dup_idx_groups]
-
-     return jsonify({
-         "articles": results,
-         "duplicate_groups": dup_url_groups
-     })
-
- @app.route("/compare_texts", methods=["POST"])
- def compare_texts_route():
-     data = request.get_json(force=True)
-     text1 = data.get("text1", "") or ""
-     text2 = data.get("text2", "") or ""
-
-     # compute changed sentences (exact match)
-     changedA, changedB = changed_sentences(text1, text2)
-
-     # build HTML: show only the changed sentences, highlighted, in their original order
-     def highlight_html(original_text, changed_set):
-         sents = [s.strip() for s in sent_tokenize(original_text) if s.strip()]
-         pieces = []
-         for s in sents:
-             if s in changed_set:
-                 pieces.append(f"<p class='changed'>{escape_html(s)}</p>")
-         return "".join(pieces)
-
-     left_html = highlight_html(text1, set(changedA))
-     right_html = highlight_html(text2, set(changedB))
-
-     return jsonify({
-         "left": left_html,
-         "right": right_html,
-         "changedA_count": len(changedA),
-         "changedB_count": len(changedB),
-     })
-
- # small helper used in templates/JS if needed
- def escape_html(s):
-     return (s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
-             .replace('"', "&quot;").replace("'", "&#39;"))
-
- if __name__ == "__main__":
-     # increase the default socket timeout a bit
-     socket.setdefaulttimeout(20)
-     app.run(debug=True)
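
For reference, the two JSON endpoints in the deleted file could be exercised with a short client script. The sketch below is illustrative only: it assumes app.py were still present and running locally on Flask's default http://127.0.0.1:5000, and the example URL and texts are placeholders. The payload keys ("urls", "mode", "text1", "text2") and response keys ("articles", "duplicate_groups", "left", "right", "changedA_count", "changedB_count") come from the handlers shown in the diff above.

    # Hypothetical client for the deleted endpoints; assumes a local run of
    # app.py on Flask's default http://127.0.0.1:5000.
    import requests

    BASE = "http://127.0.0.1:5000"  # assumption: default Flask host/port

    # /process_urls takes {"urls": [...], "mode": "tokenize" | "H1".."H6"}.
    resp = requests.post(
        f"{BASE}/process_urls",
        json={"urls": ["https://www.bbc.com/news"], "mode": "H2"},
        timeout=60,
    )
    data = resp.json()
    for art in data["articles"]:
        print(art["host"], "cluster:", art["cluster"], "summary:", art["summary"][:80])
    print("duplicate groups:", data["duplicate_groups"])

    # /compare_texts takes {"text1": ..., "text2": ...} and returns HTML
    # fragments containing only the sentences that differ.
    resp = requests.post(
        f"{BASE}/compare_texts",
        json={"text1": "Old headline. Shared line.", "text2": "New headline. Shared line."},
        timeout=30,
    )
    diff = resp.json()
    print(diff["changedA_count"], "changed on the left,", diff["changedB_count"], "on the right")

Note that duplicate detection and clustering operate across the whole submitted batch, so posting several URLs in one request is what makes "duplicate_groups" and the per-article "cluster" labels meaningful.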