Gagan0141 committed on
Commit
1049881
·
verified ·
1 Parent(s): 90c82f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +278 -278
app.py CHANGED
@@ -1,278 +1,278 @@
1
- from flask import Flask, request, jsonify, render_template
2
- from urllib.request import Request, urlopen
3
- from bs4 import BeautifulSoup
4
- import nltk
5
- import re
6
- import socket
7
- from urllib.parse import urlparse
8
- from sklearn.feature_extraction.text import TfidfVectorizer
9
- from sklearn.metrics.pairwise import cosine_similarity
10
- from sklearn.cluster import KMeans
11
- import numpy as np
12
-
13
- # Ensure NLTK data exists
14
- nltk.download("punkt", quiet=True)
15
- nltk.download("punkt_tab", quiet=True)
16
- from nltk.tokenize import word_tokenize, sent_tokenize
17
-
18
- app = Flask(__name__)
19
-
20
- # -------------------------
21
- # Helper: fetch page safely
22
- # -------------------------
23
- def fetch_page(url, timeout=15):
24
- """
25
- Fetch URL content using urllib with a browser-like User-Agent.
26
- Returns cleaned text or raises Exception.
27
- """
28
- try:
29
- req = Request(
30
- url,
31
- headers={
32
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
33
- "AppleWebKit/537.36 (KHTML, like Gecko) "
34
- "Chrome/120.0 Safari/537.36"
35
- },
36
- )
37
- resp = urlopen(req, timeout=timeout)
38
- raw = resp.read()
39
- soup = BeautifulSoup(raw, "html.parser")
40
-
41
- # remove scripts/styles etc
42
- for tag in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
43
- tag.extract()
44
-
45
- text = soup.get_text(separator=" ")
46
- text = re.sub(r"\s+", " ", text).strip()
47
- return text
48
- except Exception as e:
49
- raise
50
-
51
- # -------------------------
52
- # Helper: extract heading tag text
53
- # -------------------------
54
- def extract_heading_text(soup, tag):
55
- elements = soup.find_all(tag)
56
- return " ".join([el.get_text(" ", strip=True) for el in elements]).strip()
57
-
58
- # -------------------------
59
- # Clean / normalize text
60
- # -------------------------
61
- def clean_text(t):
62
- return re.sub(r"\s+", " ", t or "").strip()
63
-
64
- # -------------------------
65
- # Summarize (extractive)
66
- # -------------------------
67
- def summarize(text, num_sentences=3):
68
- sentences = sent_tokenize(text)
69
- if len(sentences) <= num_sentences:
70
- return " ".join(sentences)
71
- try:
72
- vec = TfidfVectorizer(stop_words="english")
73
- X = vec.fit_transform(sentences)
74
- scores = np.array(X.sum(axis=1)).ravel()
75
- top_idx = scores.argsort()[-num_sentences:][::-1]
76
- top_sentences = [sentences[i] for i in sorted(top_idx)]
77
- return " ".join(top_sentences)
78
- except Exception:
79
- return " ".join(sentences[:num_sentences])
80
-
81
- # -------------------------
82
- # Topic clustering
83
- # -------------------------
84
- def cluster_texts(texts, n_clusters=3):
85
- if len(texts) == 0:
86
- return []
87
- if len(texts) <= 1:
88
- return [0] * len(texts)
89
- k = min(n_clusters, len(texts))
90
- vec = TfidfVectorizer(stop_words="english")
91
- X = vec.fit_transform(texts)
92
- kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
93
- labels = kmeans.fit_predict(X)
94
- return labels.tolist()
95
-
96
- # -------------------------
97
- # Duplicate detection (cosine)
98
- # -------------------------
99
- def detect_duplicates(texts, threshold=0.55):
100
- n = len(texts)
101
- if n <= 1:
102
- return []
103
- vec = TfidfVectorizer(stop_words="english")
104
- X = vec.fit_transform(texts)
105
- sim = cosine_similarity(X)
106
- groups = []
107
- used = set()
108
- for i in range(n):
109
- if i in used:
110
- continue
111
- group = [i]
112
- used.add(i)
113
- for j in range(i + 1, n):
114
- if sim[i, j] >= threshold:
115
- group.append(j)
116
- used.add(j)
117
- if len(group) > 1:
118
- groups.append(group)
119
- return groups
120
-
121
- # -------------------------
122
- # Sentence-level change detection (exact-match)
123
- # -------------------------
124
- def changed_sentences(textA, textB):
125
- sA = [s.strip() for s in sent_tokenize(textA) if s.strip()]
126
- sB = [s.strip() for s in sent_tokenize(textB) if s.strip()]
127
- setA = set(sA)
128
- setB = set(sB)
129
- changedA = [s for s in sA if s not in setB]
130
- changedB = [s for s in sB if s not in setA]
131
- return changedA, changedB
132
-
133
- # -------------------------
134
- # Return hostname helper
135
- # -------------------------
136
- def hostname(url):
137
- try:
138
- p = urlparse(url)
139
- return p.netloc or url
140
- except Exception:
141
- return url
142
-
143
- # -------------------------
144
- # Routes
145
- # -------------------------
146
- @app.route("/")
147
- def home():
148
- # list of preselected sites (you can add/remove)
149
- sites = {
150
- "Indian Express": "https://indianexpress.com/",
151
- "Times of India": "https://timesofindia.indiatimes.com/",
152
- "NDTV": "https://www.ndtv.com/",
153
- "BBC News": "https://www.bbc.com/news",
154
- "CNN": "https://www.cnn.com/",
155
- "The Hindu": "https://www.thehindu.com/",
156
- }
157
- return render_template("index.html", sites=sites)
158
-
159
- @app.route("/process_urls", methods=["POST"])
160
- def process_urls():
161
- payload = request.get_json(force=True)
162
- urls = payload.get("urls", []) or []
163
- mode = payload.get("mode", "tokenize")
164
-
165
- results = []
166
- texts_for_clustering = []
167
-
168
- for raw_url in urls:
169
- url = raw_url.strip()
170
- if not url:
171
- continue
172
- try:
173
- # fetch page raw
174
- req = Request(
175
- url,
176
- headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
177
- "AppleWebKit/537.36 (KHTML, like Gecko) "
178
- "Chrome/120.0 Safari/537.36"}
179
- )
180
- resp = urlopen(req, timeout=15)
181
- soup = BeautifulSoup(resp.read(), "html.parser")
182
-
183
- # choose extraction according to mode (H1..H6 or full)
184
- if mode in ["H1", "H2", "H3", "H4", "H5", "H6"]:
185
- tag = mode.lower()
186
- extracted = extract_heading_text(soup, tag)
187
- else:
188
- # full text
189
- for tag_rm in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
190
- tag_rm.extract()
191
- extracted = soup.get_text(separator=" ")
192
- extracted = clean_text(extracted)
193
-
194
- words = []
195
- sentences = []
196
- if extracted:
197
- # tokenization may throw in weird content, guard it
198
- try:
199
- words = word_tokenize(extracted)
200
- except Exception:
201
- words = extracted.split()
202
- try:
203
- sentences = sent_tokenize(extracted)
204
- except Exception:
205
- sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', extracted) if s.strip()]
206
-
207
- summary = summarize(extracted) if extracted else ""
208
- texts_for_clustering.append(extracted)
209
-
210
- results.append({
211
- "url": url,
212
- "host": hostname(url),
213
- "text": extracted,
214
- "words": words,
215
- "sentences": sentences,
216
- "summary": summary,
217
- })
218
- except Exception as e:
219
- results.append({
220
- "url": url,
221
- "host": hostname(url),
222
- "text": "",
223
- "words": [],
224
- "sentences": [],
225
- "summary": "",
226
- "error": str(e)
227
- })
228
-
229
- # clustering
230
- texts_only = [r.get("text", "") for r in results]
231
- clusters = cluster_texts(texts_only, n_clusters=3) if len(texts_only) > 0 else []
232
- # attach clusters (fill default 0 if sizes mismatch)
233
- if len(clusters) != len(results):
234
- clusters = [int(c) if i < len(clusters) else 0 for i, c in enumerate(range(len(results)))]
235
- for i, r in enumerate(results):
236
- r["cluster"] = int(clusters[i]) if i < len(clusters) else 0
237
-
238
- # duplicate groups (convert index groups to url groups)
239
- dup_idx_groups = detect_duplicates(texts_only, threshold=0.55)
240
- dup_url_groups = [[results[i]["url"] for i in grp] for grp in dup_idx_groups]
241
-
242
- return jsonify({
243
- "articles": results,
244
- "duplicate_groups": dup_url_groups
245
- })
246
-
247
- @app.route("/compare_texts", methods=["POST"])
248
- def compare_texts_route():
249
- data = request.get_json(force=True)
250
- text1 = data.get("text1", "") or ""
251
- text2 = data.get("text2", "") or ""
252
-
253
- # compute changed sentences (exact-match)
254
- changedA, changedB = changed_sentences(text1, text2)
255
-
256
- # build html: show only changed sentences highlighted, and keep order from original
257
- def highlight_html(original_text, changed_set):
258
- sents = [s.strip() for s in sent_tokenize(original_text) if s.strip()]
259
- pieces = []
260
- for s in sents:
261
- if s in changed_set:
262
- pieces.append(f"<p class='changed'>{escape_html(s)}</p>")
263
- return "".join(pieces)
264
-
265
- left_html = highlight_html(text1, set(changedA))
266
- right_html = highlight_html(text2, set(changedB))
267
-
268
- return jsonify({"left": left_html, "right": right_html, "changedA_count": len(changedA), "changedB_count": len(changedB)})
269
-
270
- # small helper used in templates/JS if needed
271
- def escape_html(s):
272
- return (s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
273
- .replace('"', "&quot;").replace("'", "&#39;"))
274
-
275
- if __name__ == "__main__":
276
- # increase default socket timeout a bit
277
- socket.setdefaulttimeout(20)
278
- app.run(debug=True)
 
1
+ from flask import Flask, request, jsonify, render_template
2
+ from urllib.request import Request, urlopen
3
+ from bs4 import BeautifulSoup
4
+ import nltk
5
+ import re
6
+ import socket
7
+ from urllib.parse import urlparse
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from sklearn.cluster import KMeans
11
+ import numpy as np
12
+
13
+ # Ensure NLTK data exists
14
+ nltk.download("punkt", quiet=True)
15
+ nltk.download("punkt_tab", quiet=True)
16
+ from nltk.tokenize import word_tokenize, sent_tokenize
17
+
18
+ app = Flask(__name__)
19
+
20
+ # -------------------------
21
+ # Helper: fetch page safely
22
+ # -------------------------
23
def fetch_page(url, timeout=15):
    """
    Fetch a URL and return its visible text.

    The request is sent through urllib with a browser-like User-Agent. The
    response body is parsed with BeautifulSoup's "html.parser"; script, style,
    noscript, iframe, header and footer elements are removed before the text
    is extracted, and every run of whitespace is collapsed to one space.

    Args:
        url: Address to fetch; handed unchanged to urllib's Request.
        timeout: Timeout in seconds passed to urlopen (default 15).

    Returns:
        The whitespace-normalized text of the page.

    Raises:
        Exception: Any error raised while opening, reading or parsing the
            page propagates to the caller unchanged.
    """
    req = Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/120.0 Safari/537.36"
        },
    )
    # Release the connection as soon as the body has been read instead of
    # leaving the response open until it is garbage collected.
    with urlopen(req, timeout=timeout) as resp:
        raw = resp.read()

    soup = BeautifulSoup(raw, "html.parser")

    # Drop elements whose content is not part of the readable page text.
    for tag in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
        tag.extract()

    text = soup.get_text(separator=" ")
    return re.sub(r"\s+", " ", text).strip()
50
+
51
+ # -------------------------
52
+ # Helper: extract heading tag text
53
+ # -------------------------
54
def extract_heading_text(soup, tag):
    """
    Collect the text of every `tag` element found in `soup`.

    Each element's text is read with get_text(" ", strip=True); the pieces
    are joined with single spaces and the joined string is stripped.
    Returns an empty string when no element matches.
    """
    fragments = []
    for node in soup.find_all(tag):
        fragments.append(node.get_text(" ", strip=True))
    return " ".join(fragments).strip()
57
+
58
+ # -------------------------
59
+ # Clean / normalize text
60
+ # -------------------------
61
def clean_text(t):
    """
    Normalize whitespace in `t`.

    Every run of whitespace becomes a single space and the result is
    stripped at both ends. A falsy value (None, "") yields "".
    """
    raw = t if t else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()
63
+
64
+ # -------------------------
65
+ # Summarize (extractive)
66
+ # -------------------------
67
def summarize(text, num_sentences=3):
    """
    Build an extractive summary of `text`.

    The text is split into sentences with sent_tokenize. When there are no
    more than `num_sentences` sentences they are all returned. Otherwise
    each sentence is scored by the sum of its TF-IDF weights (English stop
    words removed) and the highest-scoring sentences are returned in their
    original order, joined by single spaces. If scoring raises, the leading
    `num_sentences` sentences are used instead.
    """
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return " ".join(sentences)

    try:
        matrix = TfidfVectorizer(stop_words="english").fit_transform(sentences)
        weights = np.array(matrix.sum(axis=1)).ravel()
        # argsort is ascending, so the tail holds the highest-scoring rows;
        # sorting the indices restores document order.
        best = weights.argsort()[-num_sentences:]
        picked = [sentences[i] for i in sorted(best)]
    except Exception:
        # Best-effort fallback: take the opening sentences.
        picked = sentences[:num_sentences]
    return " ".join(picked)
80
+
81
+ # -------------------------
82
+ # Topic clustering
83
+ # -------------------------
84
def cluster_texts(texts, n_clusters=3):
    """
    Assign each text to a topic cluster using TF-IDF features and KMeans.

    Args:
        texts: Sequence of strings to cluster.
        n_clusters: Upper bound on the number of clusters; the actual number
            is min(n_clusters, len(texts)).

    Returns:
        A list with one integer cluster label per input text. Zero or one
        text needs no clustering and gets label 0. When the texts contain no
        usable terms (all empty or only stop words) every text is also given
        label 0 instead of raising.
    """
    count = len(texts)
    if count <= 1:
        return [0] * count

    try:
        X = TfidfVectorizer(stop_words="english").fit_transform(texts)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no text
        # contributes a term, e.g. when every page fetch failed upstream.
        return [0] * count

    k = min(n_clusters, count)
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
    return kmeans.fit_predict(X).tolist()
95
+
96
+ # -------------------------
97
+ # Duplicate detection (cosine)
98
+ # -------------------------
99
def detect_duplicates(texts, threshold=0.55):
    """
    Group texts whose TF-IDF cosine similarity reaches `threshold`.

    Args:
        texts: Sequence of strings to compare.
        threshold: Minimum cosine similarity for two texts to be grouped.

    Returns:
        A list of groups, each a list of indices into `texts` with at least
        two members. A group is seeded by the first index not yet placed in
        any group and collects every later index similar enough to that
        seed; a later index may therefore appear in more than one group.
        Returns [] for fewer than two texts or when the texts contain no
        usable terms (all empty or only stop words).
    """
    n = len(texts)
    if n <= 1:
        return []

    try:
        X = TfidfVectorizer(stop_words="english").fit_transform(texts)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no text
        # contributes a term; nothing can be a duplicate in that case.
        return []

    sim = cosine_similarity(X)
    groups = []
    used = set()
    for i in range(n):
        if i in used:
            continue
        group = [i]
        used.add(i)
        for j in range(i + 1, n):
            if sim[i, j] >= threshold:
                group.append(j)
                used.add(j)
        if len(group) > 1:
            groups.append(group)
    return groups
120
+
121
+ # -------------------------
122
+ # Sentence-level change detection (exact-match)
123
+ # -------------------------
124
def changed_sentences(textA, textB):
    """
    Find sentences that appear in only one of the two texts (exact match).

    Both texts are split with sent_tokenize; sentences are stripped and
    empty ones discarded. Returns a pair (changedA, changedB): the sentences
    of textA absent from textB and the sentences of textB absent from textA,
    each in its original order.
    """
    def _split(text):
        # Strip every sentence and keep only the non-empty ones.
        return [piece for piece in (s.strip() for s in sent_tokenize(text)) if piece]

    sents_a = _split(textA)
    sents_b = _split(textB)
    seen_a, seen_b = set(sents_a), set(sents_b)

    only_in_a = [s for s in sents_a if s not in seen_b]
    only_in_b = [s for s in sents_b if s not in seen_a]
    return only_in_a, only_in_b
132
+
133
+ # -------------------------
134
+ # Return hostname helper
135
+ # -------------------------
136
def hostname(url):
    """
    Return the network-location part of `url`.

    Falls back to returning `url` itself when it has no network location
    or when parsing raises.
    """
    try:
        netloc = urlparse(url).netloc
    except Exception:
        return url
    return netloc if netloc else url
142
+
143
+ # -------------------------
144
+ # Routes
145
+ # -------------------------
146
@app.route("/")
def home():
    """Render index.html with the preselected news sites (name -> URL)."""
    # Preselected sites offered on the landing page; add or remove entries here.
    preset_sites = dict([
        ("Indian Express", "https://indianexpress.com/"),
        ("Times of India", "https://timesofindia.indiatimes.com/"),
        ("NDTV", "https://www.ndtv.com/"),
        ("BBC News", "https://www.bbc.com/news"),
        ("CNN", "https://www.cnn.com/"),
        ("The Hindu", "https://www.thehindu.com/"),
    ])
    return render_template("index.html", sites=preset_sites)
158
+
159
def _fetch_soup(url, timeout=15):
    """Download `url` over http(s) and return it parsed as a BeautifulSoup tree."""
    # urllib can also open file:// and ftp:// URLs. The URL comes from the
    # client, so anything but http/https is refused to keep a request from
    # reading files on the server.
    if urlparse(url).scheme not in ("http", "https"):
        raise ValueError("Only http and https URLs are supported")
    req = Request(
        url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                               "AppleWebKit/537.36 (KHTML, like Gecko) "
                               "Chrome/120.0 Safari/537.36"}
    )
    # Close the response once the body has been read.
    with urlopen(req, timeout=timeout) as resp:
        return BeautifulSoup(resp.read(), "html.parser")


def _analyze_url(url, mode):
    """Fetch one URL and build its result entry; any failure propagates."""
    soup = _fetch_soup(url)

    # Choose extraction according to mode: one heading level (H1..H6) or
    # the full page text.
    if mode in ["H1", "H2", "H3", "H4", "H5", "H6"]:
        extracted = extract_heading_text(soup, mode.lower())
    else:
        for tag_rm in soup(["script", "style", "noscript", "iframe", "header", "footer"]):
            tag_rm.extract()
        extracted = soup.get_text(separator=" ")
    extracted = clean_text(extracted)

    words = []
    sentences = []
    if extracted:
        # Tokenization may fail on unusual content; fall back to simple
        # splitting rather than losing the page.
        try:
            words = word_tokenize(extracted)
        except Exception:
            words = extracted.split()
        try:
            sentences = sent_tokenize(extracted)
        except Exception:
            sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', extracted) if s.strip()]

    return {
        "url": url,
        "host": hostname(url),
        "text": extracted,
        "words": words,
        "sentences": sentences,
        "summary": summarize(extracted) if extracted else "",
    }


@app.route("/process_urls", methods=["POST"])
def process_urls():
    """
    Fetch and analyze the URLs posted as JSON.

    Request body: a JSON object with "urls" (list of URL strings) and an
    optional "mode" ("H1".."H6" to use only that heading level; any other
    value uses the full page text).

    Response: JSON with
        "articles": one entry per non-empty URL holding "url", "host",
            "text", "words", "sentences", "summary" and "cluster"; entries
            that could not be processed have empty content plus "error".
        "duplicate_groups": lists of URLs whose texts were judged similar.
    """
    payload = request.get_json(force=True)
    if not isinstance(payload, dict):
        # Valid JSON that is not an object carries no usable fields.
        payload = {}
    urls = payload.get("urls", []) or []
    if isinstance(urls, str):
        urls = [urls]
    elif not isinstance(urls, (list, tuple)):
        urls = []
    mode = payload.get("mode", "tokenize")

    results = []
    for raw_url in urls:
        # Non-string entries are skipped like blank ones.
        url = raw_url.strip() if isinstance(raw_url, str) else ""
        if not url:
            continue
        try:
            results.append(_analyze_url(url, mode))
        except Exception as e:
            results.append({
                "url": url,
                "host": hostname(url),
                "text": "",
                "words": [],
                "sentences": [],
                "summary": "",
                "error": str(e)
            })

    texts_only = [r.get("text", "") for r in results]

    # Clustering: the vectorizer raises ValueError when no text has any
    # usable term (e.g. every fetch failed); default every label to 0 then.
    try:
        clusters = cluster_texts(texts_only, n_clusters=3)
    except ValueError:
        clusters = []
    for i, r in enumerate(results):
        r["cluster"] = int(clusters[i]) if i < len(clusters) else 0

    # Duplicate groups (convert index groups to URL groups).
    try:
        dup_idx_groups = detect_duplicates(texts_only, threshold=0.55)
    except ValueError:
        dup_idx_groups = []
    dup_url_groups = [[results[i]["url"] for i in grp] for grp in dup_idx_groups]

    return jsonify({
        "articles": results,
        "duplicate_groups": dup_url_groups
    })
246
+
247
@app.route("/compare_texts", methods=["POST"])
def compare_texts_route():
    """
    Compare two posted texts sentence by sentence (exact match).

    Request body: a JSON object with "text1" and "text2".

    Response: JSON with "left" and "right" (HTML holding one
    <p class='changed'> element per sentence that appears only in text1 /
    only in text2, in original order, with the sentence HTML-escaped) and
    "changedA_count" / "changedB_count" (the number of such sentences).
    """
    data = request.get_json(force=True)
    if not isinstance(data, dict):
        # Valid JSON that is not an object carries no usable fields.
        data = {}
    # Coerce to str so a non-string JSON value cannot crash the tokenizer.
    text1 = str(data.get("text1", "") or "")
    text2 = str(data.get("text2", "") or "")

    # changed_sentences already returns the changed sentences in document
    # order, so the HTML is built from its result without tokenizing again.
    changedA, changedB = changed_sentences(text1, text2)

    def highlight_html(changed):
        return "".join(f"<p class='changed'>{escape_html(s)}</p>" for s in changed)

    return jsonify({
        "left": highlight_html(changedA),
        "right": highlight_html(changedB),
        "changedA_count": len(changedA),
        "changedB_count": len(changedB),
    })
269
+
270
+ # small helper used in templates/JS if needed
271
def escape_html(s):
    """
    Escape the five HTML-special characters in `s`.

    & < > " ' become &amp; &lt; &gt; &quot; &#39; respectively.
    """
    entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&#39;",
    }
    # A single translate pass replaces each character exactly once, so the
    # ampersands introduced by the entities are never escaped again.
    return s.translate(str.maketrans(entities))
274
+
275
if __name__ == "__main__":
    # Script entry point: raise the process-wide default socket timeout to
    # 20 seconds, then start the Flask server on all interfaces, port 7860,
    # with debug mode off.
    socket.setdefaulttimeout(20)
    server_options = {"host": "0.0.0.0", "port": 7860, "debug": False}
    app.run(**server_options)