Gagan0141 committed on
Commit
f21fee7
·
verified ·
1 Parent(s): 47cae26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -85
app.py CHANGED
@@ -1,85 +1,87 @@
1
- from flask import Flask, request, render_template, jsonify
2
- from nltk.tokenize import word_tokenize, sent_tokenize
3
- from urllib.request import urlopen
4
- from bs4 import BeautifulSoup
5
- import nltk
6
- from difflib import SequenceMatcher
7
- from sklearn.feature_extraction.text import TfidfVectorizer
8
- from sklearn.cluster import KMeans
9
-
10
- nltk.download('punkt')
11
-
12
- app = Flask(__name__)
13
-
14
- def simple_summary(text, max_sentences=3):
15
- sents = sent_tokenize(text)
16
- return " ".join(sents[:max_sentences]) if sents else text[:200] + "..."
17
-
18
- @app.route("/")
19
- def home():
20
- return render_template("index.html")
21
-
22
- @app.route("/process_urls", methods=["POST"])
23
- def process_urls():
24
- data = request.form
25
- selected_sites = request.form.getlist("sites")
26
- mode = data.get("mode", "tokenize")
27
-
28
- articles = {}
29
- for url in selected_sites:
30
- try:
31
- page = urlopen(url)
32
- soup = BeautifulSoup(page, "html.parser")
33
- text = soup.get_text(separator=" ")
34
- articles[url] = text
35
- except Exception as e:
36
- articles[url] = f"Error fetching: {str(e)}"
37
-
38
- # -----------------------------
39
- # Compare articles side-by-side
40
- # -----------------------------
41
- comparison_results = []
42
- urls = list(articles.keys())
43
- for i in range(len(urls)):
44
- for j in range(i+1, len(urls)):
45
- a, b = articles[urls[i]], articles[urls[j]]
46
- sents_a, sents_b = sent_tokenize(a), sent_tokenize(b)
47
- diff_a = []
48
- for sent in sents_a:
49
- if any(SequenceMatcher(None, sent, s).ratio() < 0.8 for s in sents_b):
50
- diff_a.append(sent)
51
- comparison_results.append({
52
- "site1": urls[i],
53
- "site2": urls[j],
54
- "diff_sentences_site1": diff_a
55
- })
56
-
57
- # -----------------------------
58
- # Cluster articles by topic
59
- # -----------------------------
60
- clusters = {}
61
- if len(articles) > 0:
62
- vectorizer = TfidfVectorizer(stop_words='english')
63
- X = vectorizer.fit_transform(list(articles.values()))
64
- n_clusters = min(3, len(articles))
65
- kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
66
- for idx, label in enumerate(kmeans.labels_):
67
- clusters.setdefault(int(label), []).append(urls[idx])
68
-
69
-
70
- # -----------------------------
71
- # Summarize each article using NLTK simple summary
72
- # -----------------------------
73
- summaries = {}
74
- for url, text in articles.items():
75
- summaries[url] = simple_summary(text, max_sentences=3)
76
-
77
- return jsonify({
78
- "articles": articles,
79
- "comparisons": comparison_results,
80
- "clusters": clusters,
81
- "summaries": summaries
82
- })
83
-
84
- if __name__ == "__main__":
85
- app.run(debug=True)
 
 
 
1
+ from flask import Flask, request, render_template, jsonify
2
+ from nltk.tokenize import word_tokenize, sent_tokenize
3
+ from urllib.request import urlopen
4
+ from bs4 import BeautifulSoup
5
+ import nltk
6
+ from difflib import SequenceMatcher
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.cluster import KMeans
9
+
10
# Fetch the sentence-tokenizer data used by sent_tokenize() at import time.
# Older NLTK releases load the pickled "punkt" model, while current releases
# look for the "punkt_tab" resource instead and raise LookupError without it,
# so both are requested to keep the app working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# The Flask application object that the route decorators register against.
app = Flask(__name__)
13
+
14
def simple_summary(text, max_sentences=3):
    """Return the leading sentences of *text* as a naive summary.

    The text is split with ``sent_tokenize`` and the first *max_sentences*
    sentences are joined with single spaces.  When the tokenizer yields no
    sentences, the first 200 characters of *text* followed by ``"..."`` are
    returned instead.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return text[:200] + "..."
    leading = sentences[:max_sentences]
    return " ".join(leading)
17
+
18
@app.route("/")
def home():
    """Handle ``/`` by rendering the ``index.html`` template."""
    landing_page = render_template("index.html")
    return landing_page
21
+
22
# Two sentences whose SequenceMatcher ratio reaches this value are treated as
# the same sentence when comparing articles.
_SIMILARITY_THRESHOLD = 0.8

# Upper bound, in seconds, on each blocking network operation of a download.
_FETCH_TIMEOUT_SECONDS = 20


def _fetch_article_text(url):
    """Download *url* and return the text content of the parsed HTML.

    Raises:
        ValueError: if *url* is not an ``http://`` or ``https://`` URL.
        Exception: whatever ``urlopen`` or the parser raises on failure.
    """
    # The URL comes straight from the submitted form; refuse schemes such as
    # file:// or ftp:// so the endpoint cannot be used to read local files.
    if not url.lower().startswith(("http://", "https://")):
        raise ValueError("only http:// and https:// URLs are supported")
    # The context manager closes the connection; BeautifulSoup reads the
    # whole response inside the constructor, so nothing is needed afterwards.
    with urlopen(url, timeout=_FETCH_TIMEOUT_SECONDS) as page:
        soup = BeautifulSoup(page, "html.parser")
    return soup.get_text(separator=" ")


def _has_close_match(sentence, candidates):
    """Return True if any of *candidates* is similar enough to *sentence*."""
    for candidate in candidates:
        matcher = SequenceMatcher(None, sentence, candidate)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so they can only rule a pair out early; the final decision
        # is always the one ratio() would make.
        if (matcher.real_quick_ratio() >= _SIMILARITY_THRESHOLD
                and matcher.quick_ratio() >= _SIMILARITY_THRESHOLD
                and matcher.ratio() >= _SIMILARITY_THRESHOLD):
            return True
    return False


def _cluster_by_topic(urls, texts):
    """Group *urls* into at most three KMeans clusters of their TF-IDF vectors.

    *urls* and *texts* are parallel sequences.  Returns a dict mapping an
    integer cluster label to the list of URLs assigned to it; the dict is
    empty when there is nothing usable to cluster.
    """
    if not texts:
        return {}
    try:
        X = TfidfVectorizer(stop_words='english').fit_transform(texts)
    except ValueError:
        # Raised when every document is empty or contains only stop words
        # (empty vocabulary); there are no topics to cluster in that case.
        return {}
    n_clusters = min(3, len(texts))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
    clusters = {}
    for url, label in zip(urls, kmeans.labels_):
        clusters.setdefault(int(label), []).append(url)
    return clusters


@app.route("/process_urls", methods=["POST"])
def process_urls():
    """Fetch the submitted URLs, then compare, cluster and summarize them.

    Reads the ``sites`` list from the POSTed form and returns a JSON object
    with four keys:

    * ``articles``: URL -> page text, or ``"Error fetching: ..."`` when the
      download failed.
    * ``comparisons``: one entry per pair of successfully fetched articles,
      listing the sentences of ``site1`` that have no close match in
      ``site2``.
    * ``clusters``: cluster label -> URLs of successfully fetched articles.
    * ``summaries``: URL -> first sentences of the corresponding
      ``articles`` entry.

    NOTE: the form's ``mode`` field is not used by this handler.
    """
    selected_sites = request.form.getlist("sites")

    articles = {}
    fetched = {}  # only the pages that were actually downloaded
    # dict.fromkeys() drops repeated URLs while keeping submission order, so
    # each page is downloaded once.
    for url in dict.fromkeys(selected_sites):
        try:
            text = _fetch_article_text(url)
        except Exception as e:
            # Boundary handler: one unreachable site must not fail the whole
            # request, so the failure is reported in place of the text.
            articles[url] = f"Error fetching: {str(e)}"
        else:
            articles[url] = text
            fetched[url] = text

    # -----------------------------
    # Compare articles side-by-side
    # -----------------------------
    # Failed downloads are skipped: diffing an error message against an
    # article says nothing about the articles.
    urls = list(fetched)
    # Tokenize each article once rather than once per pair.
    sentences = {url: sent_tokenize(text) for url, text in fetched.items()}
    comparison_results = []
    for i, site1 in enumerate(urls):
        for site2 in urls[i + 1:]:
            # A sentence counts as different only when *no* sentence of the
            # other article is similar to it.
            diff_a = [
                sent for sent in sentences[site1]
                if not _has_close_match(sent, sentences[site2])
            ]
            comparison_results.append({
                "site1": site1,
                "site2": site2,
                "diff_sentences_site1": diff_a
            })

    # -----------------------------
    # Cluster articles by topic
    # -----------------------------
    clusters = _cluster_by_topic(urls, [fetched[url] for url in urls])

    # -----------------------------
    # Summarize each article using NLTK simple summary
    # -----------------------------
    summaries = {
        url: simple_summary(text, max_sentences=3)
        for url, text in articles.items()
    }

    return jsonify({
        "articles": articles,
        "comparisons": comparison_results,
        "clusters": clusters,
        "summaries": summaries
    })
83
+
84
if __name__ == "__main__":
    # Imported here because the module's import block does not provide it;
    # without this the call below fails with NameError at startup.
    import socket

    # Process-wide default so blocking socket operations cannot hang forever.
    socket.setdefaulttimeout(20)
    # Listen on all interfaces, port 7860, with the debugger disabled.
    app.run(host="0.0.0.0", port=7860, debug=False)
87
+