"""
Web Content Extractor - Hugging Face Version
--------------------------------------------
✅ Flask + BeautifulSoup + NLTK
✅ Extracts headings, paragraphs, links, images
✅ Performs NLP analysis (word counts, frequency, stopwords)
✅ Auto language detection
"""

from flask import Flask, render_template, request, jsonify
from flask_cors import CORS
import os
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from langdetect import detect, DetectorFactory

# Flask setup
app = Flask(__name__)
CORS(app)

# Fix random seed for langdetect
DetectorFactory.seed = 0

# Download required NLTK resources (with full compatibility)
for pkg in ["punkt", "punkt_tab", "stopwords"]:
    try:
        nltk.download(pkg, quiet=True)
    except Exception as e:
        print(f"⚠️ Could not download {pkg}: {e}")


# ---------------------------------------------------------------
# 1️⃣ Extract Web Content
# ---------------------------------------------------------------
def extract_content(url):
    """Fetch *url* and extract its headings, paragraphs, images, links and text.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A dict with the keys ``headings``, ``paragraphs``, ``images``,
        ``links``, ``text`` and ``language``, or ``None`` when the page could
        not be fetched or parsed (the error is printed, not raised).
    """
    try:
        print("\n🌐 Fetching website content...")
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html5lib")

        # Extract various elements.
        # Headings are grouped by level (all h1 first, then h2, ...),
        # not returned in document order.
        headings = []
        for level in range(1, 7):
            headings += [
                h.get_text(strip=True) for h in soup.find_all(f"h{level}")
            ]

        # Skip paragraphs that are empty after stripping whitespace.
        paragraphs = [
            p.get_text(strip=True)
            for p in soup.find_all('p')
            if p.get_text(strip=True)
        ]
        images = [img['src'] for img in soup.find_all('img', src=True)]
        links = [a['href'] for a in soup.find_all('a', href=True)]
        text = soup.get_text(separator=' ', strip=True)

        # Try to detect language from the first 500 characters only.
        # Detection is best-effort: any failure falls back to "unknown",
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            lang = detect(text[:500]) if text else "unknown"
        except Exception:
            lang = "unknown"

        return {
            "headings": headings,
            "paragraphs": paragraphs,
            "images": images,
            "links": links,
            "text": text,
            "language": lang
        }

    except requests.exceptions.HTTPError as e:
        print(f"❌ HTTP error: {e}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Network error: {e}")
    except Exception as e:
        print(f"❌ General error while fetching webpage: {e}")
    return None


# ---------------------------------------------------------------
# 2️⃣ NLP Text Analysis
# ---------------------------------------------------------------
def analyze_text(text, lang="english"):
    """Compute simple word statistics for *text* using NLTK.

    Args:
        text: Raw text to analyze.
        lang: Name of the NLTK stopword list to use. If that list cannot be
            loaded, the English list is used instead.

    Returns:
        ``None`` for empty input, otherwise a dict with the keys
        ``word_count``, ``sentence_count``, ``unique_words``, ``top_words``
        (the 10 most common filtered words with their counts),
        ``stopword_count`` and ``filtered_words`` (first 50 filtered words).
    """
    if not text:
        return None

    print("\n🧠 Analyzing text using NLTK...")

    # Words are tokenized from a copy with every non-ASCII-letter replaced by
    # a space; sentences are tokenized from the untouched text.
    cleaned = re.sub(r'[^A-Za-z ]', ' ', text)

    try:
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)
    except LookupError:
        # Tokenizer data missing: download it and retry once.
        nltk.download("punkt_tab", quiet=True)
        words = word_tokenize(cleaned)
        sentences = sent_tokenize(text)

    # A set gives O(1) membership tests for the per-word checks below.
    try:
        sw = set(stopwords.words(lang))
    except Exception:
        sw = set(stopwords.words("english"))

    lowered = [w.lower() for w in words]
    # Keep words that are not stopwords and are longer than two characters.
    filtered = [w for w in lowered if w not in sw and len(w) > 2]
    # Count real stopword hits; short non-stopword tokens dropped by the
    # length filter above are not stopwords and are not counted here.
    stopword_count = sum(1 for w in lowered if w in sw)

    freq = FreqDist(filtered)
    top_words = freq.most_common(10)

    return {
        "word_count": len(words),
        "sentence_count": len(sentences),
        "unique_words": len(set(filtered)),
        "top_words": top_words,
        "stopword_count": stopword_count,
        "filtered_words": filtered[:50]
    }


# ---------------------------------------------------------------
# 3️⃣ Flask Routes
# ---------------------------------------------------------------
@app.route('/')
def index():
    """Serve the front-end page."""
    return render_template('index.html')


@app.route('/extract', methods=['POST'])
def extract_route():
    """Extract and analyze the page named in the JSON request body.

    Reads ``url`` (required) and ``tag`` (optional, default ``"all"``) from
    the JSON body.

    Returns:
        * 400 with an ``error`` message when no usable URL is supplied or the
          page cannot be fetched.
        * For ``tag == "all"``: the full extraction dict plus ``analysis``.
        * For any other tag: ``tag``, ``results``, ``language`` and
          ``analysis``. ``results`` is empty for a tag that is not one of
          ``h1``, ``p``, ``img`` or ``a``.
        * 500 with the exception text on any unexpected error.
    """
    try:
        # silent=True makes a missing/malformed JSON body yield None instead
        # of raising, so the client gets a 400 below rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}

        url = data.get('url')
        tag = data.get('tag', 'all')

        if not isinstance(url, str) or not url.strip():
            return jsonify({"error": "No URL provided"}), 400
        url = url.strip()

        # Check for a real scheme prefix; a bare "http" prefix would wrongly
        # match hosts such as "httpbin.org".
        if not url.lower().startswith(("http://", "https://")):
            url = "https://" + url

        # NOTE(review): the server fetches an arbitrary user-supplied URL
        # (SSRF risk). Consider rejecting private/loopback addresses if this
        # is deployed where internal services are reachable.
        content = extract_content(url)
        if not content:
            return jsonify({"error": "Failed to fetch content"}), 400

        analysis = analyze_text(content.get("text", ""))
        content["analysis"] = analysis

        if tag != "all":
            # "h1" selects the combined list of all heading levels.
            tag_map = {
                "h1": "headings",
                "p": "paragraphs",
                "img": "images",
                "a": "links"
            }
            result = content.get(tag_map.get(tag, ""), [])
            return jsonify({
                "tag": tag,
                "results": result,
                "language": content.get("language"),
                "analysis": analysis
            })

        return jsonify(content)

    except Exception as e:
        print("❌ Backend Error:", e)
        return jsonify({"error": str(e)}), 500


# ---------------------------------------------------------------
# 4️⃣ Run Flask App (Hugging Face compatible)
# ---------------------------------------------------------------
if __name__ == "__main__":
    print("=" * 70)
    print("🚀 Hugging Face Web Content Extractor running...")
    print("=" * 70)
    # Port comes from the PORT environment variable, defaulting to 7860.
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))