simar007 commited on
Commit
ae4572b
·
verified ·
1 Parent(s): e2729a7

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +184 -0
  2. requirements.txt +9 -0
  3. scraper.py +48 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Content Extractor - Hugging Face Version
3
+ --------------------------------------------
4
+ ✅ Flask + BeautifulSoup + NLTK
5
+ ✅ Extracts headings, paragraphs, links, images
6
+ ✅ Performs NLP analysis (word counts, frequency, stopwords)
7
+ ✅ Auto language detection
8
+ """
9
+
10
+ from flask import Flask, render_template, request, jsonify
11
+ from flask_cors import CORS
12
+ import os
13
+ import requests
14
+ from bs4 import BeautifulSoup
15
+ import nltk
16
+ from nltk.corpus import stopwords
17
+ from nltk.probability import FreqDist
18
+ from nltk.tokenize import word_tokenize, sent_tokenize
19
+ import re
20
+ from langdetect import detect, DetectorFactory
21
+
22
# Flask setup: create the WSGI app and allow cross-origin requests
# (the front end may be served from a different origin on Spaces).
app = Flask(__name__)
CORS(app)

# Fix random seed for langdetect so detect() is deterministic
# (langdetect is probabilistic by default).
DetectorFactory.seed = 0

# Download required NLTK resources (with full compatibility).
# "punkt_tab" is needed by newer NLTK releases alongside "punkt";
# failures are logged but non-fatal so the app can still start offline.
for pkg in ["punkt", "punkt_tab", "stopwords"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"⚠️ Could not download {pkg}: {e}")
35
+
36
+ # ---------------------------------------------------------------
37
+ # 1️⃣ Extract Web Content
38
+ # ---------------------------------------------------------------
39
def extract_content(url):
    """Fetch *url* and extract structured content from its HTML.

    Returns a dict with keys ``headings``, ``paragraphs``, ``images``,
    ``links``, ``text`` and ``language`` on success, or ``None`` when
    the page cannot be fetched (errors are printed, not raised).
    """
    try:
        print("\n🌐 Fetching website content...")

        # Browser-like User-Agent: many sites reject requests' default
        # client string with a 403.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html5lib")

        # Collect h1..h6 headings, grouped by level.
        headings = []
        for i in range(1, 7):
            tag = f'h{i}'
            headings += [h.get_text(strip=True) for h in soup.find_all(tag)]

        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]
        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]

        text = soup.get_text(separator=' ', strip=True)

        # Detect language from a short prefix; langdetect raises on
        # empty/ambiguous input.  FIX: was a bare `except:` which also
        # swallowed KeyboardInterrupt/SystemExit.
        try:
            lang = detect(text[:500]) if text else "unknown"
        except Exception:
            lang = "unknown"

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
            "language": lang
        }

    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ General error while fetching webpage: {e}")

    return None
91
+
92
+ # ---------------------------------------------------------------
93
+ # 2️⃣ NLP Text Analysis
94
+ # ---------------------------------------------------------------
95
def analyze_text(text, lang="english"):
    """Run a simple NLTK frequency analysis over *text*.

    ``lang`` is an NLTK stopwords fileid (e.g. ``"english"``); unknown
    values fall back to English.  Returns ``None`` for empty input,
    otherwise a dict of word/sentence counts, the 10 most frequent
    non-stopwords, and a sample of the filtered tokens.
    """
    if not text:
        return None

    print("\n🧠 Analyzing text using NLTK...")

    # Keep only ASCII letters for word tokenization; sentence
    # tokenization runs on the raw text so punctuation is preserved.
    cleaned = re.sub(r'[^A-Za-z ]', ' ', text)

    try:
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    except LookupError:
        # Tokenizer data missing (e.g. fresh container) — fetch and retry.
        nltk.download("punkt_tab", quiet=True)
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)

    # FIX: was a bare `except:`, which also caught KeyboardInterrupt/
    # SystemExit.  Unknown language ids fall back to English stopwords.
    try:
        sw = stopwords.words(lang)
    except Exception:
        sw = stopwords.words("english")

    filtered = [w.lower() for w in words if w.lower() not in sw and len(w) > 2]
    freq = FreqDist(filtered)
    top_words = freq.most_common(10)

    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        "unique_words": len(set(filtered)),
        "top_words": top_words,
        "stopword_count": len(words) - len(filtered),
        "filtered_words": filtered[:50]
    }
128
+
129
+ # ---------------------------------------------------------------
130
+ # 3️⃣ Flask Routes
131
+ # ---------------------------------------------------------------
132
@app.route('/')
def index():
    """Serve the single-page front end."""
    page = render_template('index.html')
    return page
135
+
136
@app.route('/extract', methods=['POST'])
def extract_route():
    """POST /extract — fetch a URL and return extracted content + analysis.

    JSON body: ``{"url": "...", "tag": "all" | "h1" | "p" | "img" | "a"}``.
    Returns the full content dict for ``tag == "all"``, otherwise only
    the requested element list.  400 for missing URL / fetch failure,
    500 for unexpected errors.
    """
    try:
        # FIX: get_json() returns None (or raises) for non-JSON bodies,
        # which made data.get() crash into the 500 path.  silent=True
        # plus an empty-dict fallback routes that case to the 400 below.
        data = request.get_json(silent=True) or {}
        url = data.get('url')
        tag = data.get('tag', 'all')

        if not url:
            return jsonify({"error": "No URL provided"}), 400

        # Default to HTTPS when no scheme was supplied.
        if not url.startswith("http"):
            url = "https://" + url

        content = extract_content(url)
        if not content:
            return jsonify({"error": "Failed to fetch content"}), 400

        analysis = analyze_text(content.get("text", ""))
        content["analysis"] = analysis

        if tag != "all":
            # Map the requested HTML tag onto the extracted-content key.
            tag_map = {
                "h1": "headings",
                "p": "paragraphs",
                "img": "images",
                "a": "links"
            }
            result = content.get(tag_map.get(tag, ""), [])
            return jsonify({
                "tag": tag,
                "results": result,
                "language": content.get("language"),
                "analysis": analysis
            })

        return jsonify(content)

    except Exception as e:
        print("❌ Backend Error:", e)
        return jsonify({"error": str(e)}), 500
176
+
177
+ # ---------------------------------------------------------------
178
+ # 4️⃣ Run Flask App (Hugging Face compatible)
179
+ # ---------------------------------------------------------------
180
if __name__ == "__main__":
    # Startup banner, then bind to all interfaces on the port Hugging
    # Face Spaces provides via $PORT (7860 when running locally).
    separator = "=" * 70
    print(separator)
    print("🚀 Hugging Face Web Content Extractor running...")
    print(separator)
    port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=port)
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ flask
2
+ flask-cors
3
+ beautifulsoup4
4
+ html5lib
5
+ requests
6
+ nltk
7
+ langdetect
8
+ gunicorn
9
+
scraper.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # scraper.py
2
+ import urllib.request
3
+ from bs4 import BeautifulSoup
4
+
5
def extract_content(url, timeout=10):
    """Extract HTML content from *url*.

    Returns a dict with keys ``headings`` (h1-h6 texts), ``paragraphs``,
    ``images`` (src URLs), ``links`` (href URLs) and ``text`` (all
    visible text), or ``None`` on any error (errors are printed).

    ``timeout`` (seconds) bounds the network read.  FIX: the original
    left the HTTP response unclosed and used no timeout, so a stalled
    server could hang the call and leak the connection.
    """
    try:
        # Fetch webpage — context manager guarantees the response is
        # closed even when read()/parsing raises.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            page_data = response.read()
        soup = BeautifulSoup(page_data, "html5lib")

        # Headings h1..h6
        headings = []
        for i in range(1, 7):
            tag = f'h{i}'
            headings += [h.get_text(strip=True) for h in soup.find_all(tag)]

        # Paragraphs (skip empty ones)
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]

        # Images
        images = [img['src'] for img in soup.find_all('img', src=True)]

        # Hyperlinks
        links = [a['href'] for a in soup.find_all('a', href=True)]

        # Visible text
        text = soup.get_text(separator=' ', strip=True)

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text
        }

    except Exception as e:
        print("❌ Error while fetching webpage:", e)
        return None