Upload 3 files
- app.py +184 -0
- requirements.txt +9 -0
- scraper.py +48 -0
app.py
ADDED
@@ -0,0 +1,184 @@
"""
Web Content Extractor - Hugging Face Version
--------------------------------------------
✅ Flask + BeautifulSoup + NLTK
✅ Extracts headings, paragraphs, links, images
✅ Performs NLP analysis (word counts, frequency, stopwords)
✅ Auto language detection
"""

from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from langdetect import detect, DetectorFactory

# Flask setup
app = Flask(__name__)
CORS(app)

# Fix random seed so langdetect gives deterministic results
DetectorFactory.seed = 0

# Download required NLTK resources (punkt_tab is needed by newer NLTK releases)
for pkg in ["punkt", "punkt_tab", "stopwords"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"⚠️ Could not download {pkg}: {e}")

# ---------------------------------------------------------------
# 1️⃣ Extract Web Content
# ---------------------------------------------------------------
def extract_content(url):
    try:
        print("\n🌐 Fetching website content...")

        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html5lib")

        # Headings h1-h6
        headings = []
        for i in range(1, 7):
            tag = f'h{i}'
            headings += [h.get_text(strip=True) for h in soup.find_all(tag)]

        # Non-empty paragraphs, image sources, and link targets
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]
        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]

        text = soup.get_text(separator=' ', strip=True)

        # Detect language from the first 500 characters
        try:
            lang = detect(text[:500]) if text else "unknown"
        except Exception:
            lang = "unknown"

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
            "language": lang
        }

    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ General error while fetching webpage: {e}")

    return None

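# Illustrative usage (not part of the app's flow): calling the extractor from
# a Python shell should return a dict with six keys, e.g.
#   content = extract_content("https://example.com")
#   content["language"]         # ISO code from langdetect, e.g. "en"
#   len(content["paragraphs"])  # number of non-empty <p> tags
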
# ---------------------------------------------------------------
# 2️⃣ NLP Text Analysis
# ---------------------------------------------------------------
def analyze_text(text, lang="english"):
    if not text:
        return None

    print("\n🧠 Analyzing text using NLTK...")

    # Keep only ASCII letters for word-level statistics
    cleaned = re.sub(r'[^A-Za-z ]', ' ', text)

    try:
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    except LookupError:
        # Tokenizer data missing at runtime: fetch it once and retry
        nltk.download("punkt_tab", quiet=True)
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)

    # NLTK stopword lists are keyed by language name ("english"), not ISO code
    try:
        sw = stopwords.words(lang)
    except Exception:
        sw = stopwords.words("english")

    filtered = [w.lower() for w in words if w.lower() not in sw and len(w) > 2]
    freq = FreqDist(filtered)
    top_words = freq.most_common(10)

    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        "unique_words": len(set(filtered)),
        "top_words": top_words,
        "stopword_count": len(words) - len(filtered),
        "filtered_words": filtered[:50]
    }

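# Illustrative result shape (exact counts depend on the tokenizer version):
#   analyze_text("The quick brown fox jumps over the lazy dog. It was quick.")
#   -> {"word_count": ..., "sentence_count": 2,
#       "top_words": [("quick", 2), ...], ...}
# Note: the /extract route below calls analyze_text with the default
# lang="english" rather than the detected language, so non-English pages
# are filtered against English stopwords.
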
# ---------------------------------------------------------------
# 3️⃣ Flask Routes
# ---------------------------------------------------------------
@app.route('/')
def index():
    return render_template('index.html')

@app.route('/extract', methods=['POST'])
def extract_route():
    try:
        data = request.get_json()
        url = data.get('url')
        tag = data.get('tag', 'all')

        if not url:
            return jsonify({"error": "No URL provided"}), 400

        if not url.startswith("http"):
            url = "https://" + url

        content = extract_content(url)
        if not content:
            return jsonify({"error": "Failed to fetch content"}), 400

        analysis = analyze_text(content.get("text", ""))
        content["analysis"] = analysis

        if tag != "all":
            # "h1" returns every heading level (h1-h6), not just h1
            tag_map = {
                "h1": "headings",
                "p": "paragraphs",
                "img": "images",
                "a": "links"
            }
            result = content.get(tag_map.get(tag, ""), [])
            return jsonify({
                "tag": tag,
                "results": result,
                "language": content.get("language"),
                "analysis": analysis
            })

        return jsonify(content)

    except Exception as e:
        print("❌ Backend Error:", e)
        return jsonify({"error": str(e)}), 500

# ---------------------------------------------------------------
# 4️⃣ Run Flask App (Hugging Face compatible)
# ---------------------------------------------------------------
if __name__ == "__main__":
    print("=" * 70)
    print("🚀 Hugging Face Web Content Extractor running...")
    print("=" * 70)
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
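
A minimal client sketch for the /extract route once the Space is up (the localhost address and port are assumptions; on Hugging Face you would use the Space's public URL instead):

import requests

resp = requests.post(
    "http://localhost:7860/extract",  # assumed local address; adjust for your Space
    json={"url": "https://example.com", "tag": "p"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json().get("results", [])[:3])  # first few extracted paragraphs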
requirements.txt
ADDED
@@ -0,0 +1,9 @@
flask
flask-cors
beautifulsoup4
html5lib
requests
nltk
langdetect
gunicorn
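
Note: gunicorn is listed, but this commit includes no Dockerfile or start command, so how the Space actually launches the app is not shown; a typical invocation for a Flask app like this would be `gunicorn --bind 0.0.0.0:7860 app:app` (an assumption, not confirmed by the commit). The `if __name__ == "__main__"` block in app.py also lets it run under the Flask dev server.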
scraper.py
ADDED
@@ -0,0 +1,48 @@
# scraper.py
import urllib.request
from bs4 import BeautifulSoup

def extract_content(url):
    """
    Extracts HTML content from a URL and returns:
    - all headings (h1-h6)
    - all paragraph texts
    - all image URLs
    - all hyperlinks
    - all visible text
    """
    try:
        # Fetch webpage (10 s timeout so a dead host can't hang the caller)
        response = urllib.request.urlopen(url, timeout=10)
        page_data = response.read()
        soup = BeautifulSoup(page_data, "html5lib")

        # Headings h1-h6
        headings = []
        for i in range(1, 7):
            tag = f'h{i}'
            headings += [h.get_text(strip=True) for h in soup.find_all(tag)]

        # Paragraphs (skip empty ones)
        paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]

        # Images
        images = [img['src'] for img in soup.find_all('img', src=True)]

        # Hyperlinks
        links = [a['href'] for a in soup.find_all('a', href=True)]

        # Visible text
        text = soup.get_text(separator=' ', strip=True)

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text
        }

    except Exception as e:
        print("❌ Error while fetching webpage:", e)
        return None
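
scraper.py duplicates the extraction logic without the Flask layer, presumably for standalone use; a minimal sketch (the import assumes scraper.py sits on the Python path, e.g. the Space's working directory):

from scraper import extract_content

data = extract_content("https://example.com")
if data:
    print(len(data["headings"]), "headings,", len(data["links"]), "links")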