Spaces:

simar007
/

web-scraper

Sleeping

File size: 5,604 Bytes

ae4572b

"""
Web Content Extractor - Hugging Face Version
--------------------------------------------
✅ Flask + BeautifulSoup + NLTK
✅ Extracts headings, paragraphs, links, images
✅ Performs NLP analysis (word counts, frequency, stopwords)
✅ Auto language detection
"""

from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from langdetect import detect, DetectorFactory

# Flask setup: create the application object and enable CORS on it.
app = Flask(__name__)
CORS(app)

# Fix random seed for langdetect so repeated detections of the same text
# give the same answer.
DetectorFactory.seed = 0

# Download required NLTK resources (with full compatibility).
# Both "punkt" and "punkt_tab" are requested so tokenization works whichever
# of the two the installed NLTK release looks for.
for pkg in ["punkt", "punkt_tab", "stopwords"]:
    try:
        # nltk.download() reports most failures (no network, unknown package)
        # by returning False rather than raising, and quiet=True hides its
        # own error output, so the return value must be checked explicitly.
        if not nltk.download(pkg, quiet=True):
            print(f"⚠️ Could not download {pkg}: download reported failure")
    except Exception as e:
        # Best effort: the app should still start if a download blows up.
        print(f"⚠️ Could not download {pkg}: {e}")

# ---------------------------------------------------------------
# 1️⃣ Extract Web Content
# ---------------------------------------------------------------
def extract_content(url):
    """Download a web page and extract its main elements.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A dict with the keys ``"headings"`` (text of all h1..h6 tags, grouped
        by level), ``"paragraphs"`` (non-empty ``<p>`` texts), ``"images"``
        (``src`` values), ``"links"`` (``href`` values), ``"text"`` (the
        page's visible text) and ``"language"`` (langdetect code or
        ``"unknown"``). Returns ``None`` when fetching or parsing fails; the
        reason is printed.
    """
    try:
        print("\n🌐 Fetching website content...")

        # Browser-like User-Agent header sent with the request.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        }

        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the raw bytes instead of response.text: when the server sends
        # no charset, requests falls back to ISO-8859-1 for text/* and UTF-8
        # pages turn into mojibake. A charset that IS declared in the header
        # is passed on; otherwise the parser sniffs the document itself.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = (
            response.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, "html5lib", from_encoding=declared_encoding
        )

        # Extract various elements
        headings = []
        for level in range(1, 7):
            headings.extend(
                h.get_text(strip=True) for h in soup.find_all(f"h{level}")
            )

        paragraphs = []
        for p in soup.find_all('p'):
            p_text = p.get_text(strip=True)
            if p_text:
                paragraphs.append(p_text)

        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]

        # Remove script/style nodes so JavaScript or CSS source can never end
        # up in the text used for language detection and word statistics.
        for node in soup(["script", "style"]):
            node.decompose()

        text = soup.get_text(separator=' ', strip=True)

        # Try to detect language from the first 500 characters; detection
        # failure (e.g. no usable characters) is not fatal.
        try:
            lang = detect(text[:500]) if text else "unknown"
        except Exception:
            lang = "unknown"

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
            "language": lang
        }

    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ General error while fetching webpage: {e}")

    # Reached only after one of the handlers above has reported a failure.
    return None

# ---------------------------------------------------------------
# 2️⃣ NLP Text Analysis
# ---------------------------------------------------------------
def analyze_text(text, lang="english"):
    """Compute simple word statistics for *text* with NLTK.

    Args:
        text: Raw text to analyse.
        lang: Name of the NLTK stopword list to use (e.g. ``"english"``).
            If that list cannot be loaded, the English list is used.

    Returns:
        ``None`` when *text* is empty. Otherwise a dict with:
        ``"word_count"`` (tokens after non-letters are stripped),
        ``"sentence_count"``, ``"unique_words"`` (distinct filtered words),
        ``"top_words"`` (the 10 most common filtered words as
        ``(word, count)`` pairs), ``"stopword_count"`` (tokens found in the
        stopword list) and ``"filtered_words"`` (first 50 filtered words).
        A "filtered" word is lower-cased, not a stopword, and longer than
        two characters.
    """
    if not text:
        return None

    print("\n🧠 Analyzing text using NLTK...")

    # Replace everything that is not a letter with a space. The pattern is
    # Unicode-aware so accented and non-Latin letters survive ("café" must
    # not become "caf").
    cleaned = re.sub(r'[\W\d_]', ' ', text)

    try:
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    except LookupError:
        # Tokenizer data is missing: fetch it and retry once.
        for pkg in ("punkt", "punkt_tab"):
            nltk.download(pkg, quiet=True)
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)

    # A set makes the per-word membership test below O(1).
    try:
        sw = set(stopwords.words(lang))
    except Exception:
        # Unknown language name: fall back to English.
        sw = set(stopwords.words("english"))

    filtered = []
    stopword_count = 0
    for w in words:
        lowered = w.lower()
        if lowered in sw:
            # Count real stopwords only; short non-stopwords are dropped
            # from `filtered` below but are not stopwords.
            stopword_count += 1
        elif len(w) > 2:
            filtered.append(lowered)

    freq = FreqDist(filtered)
    top_words = freq.most_common(10)

    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        "unique_words": len(set(filtered)),
        "top_words": top_words,
        "stopword_count": stopword_count,
        "filtered_words": filtered[:50]
    }

# ---------------------------------------------------------------
# 3️⃣ Flask Routes
# ---------------------------------------------------------------
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page

@app.route('/extract', methods=['POST'])
def extract_route():
    """Handle ``POST /extract``: scrape a URL and return content plus analysis.

    Expects a JSON object with ``"url"`` (required) and ``"tag"`` (optional,
    default ``"all"``; one of ``"h1"``, ``"p"``, ``"img"``, ``"a"`` to return
    a single element list).

    Returns:
        * 200 with the full content dict (including ``"analysis"``) when
          ``tag`` is ``"all"``.
        * 200 with ``{"tag", "results", "language", "analysis"}`` for any
          other tag; an unrecognised tag yields an empty ``"results"`` list.
        * 400 with ``{"error": ...}`` when the body or URL is missing or
          invalid, or the page could not be fetched.
        * 500 with ``{"error": ...}`` on an unexpected failure.
    """
    try:
        # silent=True makes a missing or malformed JSON body come back as
        # None instead of raising, so the client gets a 400 rather than the
        # generic 500 from the handler at the bottom.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({"error": "Invalid or missing JSON body"}), 400

        url = data.get('url')
        tag = data.get('tag', 'all')

        if not isinstance(url, str) or not url.strip():
            return jsonify({"error": "No URL provided"}), 400
        url = url.strip()

        # Add a scheme only when none is present. The check is explicit and
        # case-insensitive so hosts like "httpbin.org" still get a scheme
        # and "HTTP://..." is not prefixed a second time.
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url

        # NOTE(review): this fetches an arbitrary client-supplied URL from
        # the server (SSRF surface). Consider restricting targets if this is
        # deployed anywhere that can reach internal services.
        content = extract_content(url)
        if not content:
            return jsonify({"error": "Failed to fetch content"}), 400

        analysis = analyze_text(content.get("text", ""))
        content["analysis"] = analysis

        if tag != "all":
            # Maps the requested tag to the key of the matching result list.
            tag_map = {
                "h1": "headings",
                "p": "paragraphs",
                "img": "images",
                "a": "links"
            }
            # A non-string tag (e.g. a list) is unhashable; treat it like an
            # unknown tag instead of letting the lookup raise.
            section = tag_map.get(tag, "") if isinstance(tag, str) else ""
            result = content.get(section, [])
            return jsonify({
                "tag": tag,
                "results": result,
                "language": content.get("language"),
                "analysis": analysis
            })

        return jsonify(content)

    except Exception as e:
        # Last-resort boundary: report the failure as JSON instead of an
        # HTML error page.
        print("❌ Backend Error:", e)
        return jsonify({"error": str(e)}), 500

# ---------------------------------------------------------------
# 4️⃣ Run Flask App (Hugging Face compatible)
# ---------------------------------------------------------------
if __name__ == "__main__":
    # Print a startup banner, then serve on all interfaces. The port comes
    # from the PORT environment variable and falls back to 7860 when unset.
    banner = "=" * 70
    for line in (banner, "🚀 Hugging Face Web Content Extractor running...", banner):
        print(line)
    listen_port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port)