# WebCrawler / Dataset Builder — Gradio Space
# (provenance: uploaded by ghost613, "Upload 2 files", commit 7b91172 verified)
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import json
import csv
import io
import re
from collections import deque
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# Fixed seed makes langdetect's probabilistic detector deterministic across runs.
DetectorFactory.seed = 42
# ISO 639-1 code -> human-readable name. Used both to populate the language
# dropdown and to render the filter name in the summary table. langdetect
# distinguishes simplified/traditional Chinese via region suffixes, hence
# "zh-cn" / "zh-tw".
LANGUAGE_NAMES = {
    "en": "English", "fr": "French", "de": "German", "es": "Spanish",
    "it": "Italian", "pt": "Portuguese", "nl": "Dutch", "pl": "Polish",
    "ru": "Russian", "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
    "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "hi": "Hindi",
    "tr": "Turkish", "sv": "Swedish", "da": "Danish", "fi": "Finnish",
    "no": "Norwegian", "cs": "Czech", "hu": "Hungarian", "ro": "Romanian",
    "uk": "Ukrainian", "vi": "Vietnamese", "th": "Thai", "id": "Indonesian",
    "ms": "Malay", "bg": "Bulgarian", "hr": "Croatian", "sk": "Slovak",
    "sl": "Slovenian", "lt": "Lithuanian", "lv": "Latvian", "et": "Estonian",
    "he": "Hebrew", "fa": "Persian", "bn": "Bengali", "ur": "Urdu",
    "sw": "Swahili", "ca": "Catalan", "af": "Afrikaans",
}
# Sent with every HTTP request: identifies the crawler honestly (with a
# contact URL, per crawler etiquette) and asks for English content negotiation.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DatasetCrawler/1.0; +https://huggingface.co/spaces)",
    "Accept-Language": "en-US,en;q=0.9",
}
def clean_text(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into a
    single space and strip leading/trailing whitespace.

    The original implementation ran a second substitution
    (``[^\\S\\n]+`` -> ``" "``) afterwards, but after the first pass no
    newlines or multi-character whitespace runs can remain, so that line
    was a no-op and has been removed.
    """
    return re.sub(r'\s+', ' ', text).strip()
def extract_text_from_page(html, url):
    """Parse *html* and return ``(title, paragraphs)``.

    Boilerplate containers (scripts, styles, navigation, footers, ...) are
    stripped first, then text is collected from content-bearing tags,
    keeping only fragments longer than 50 characters. *url* is accepted
    for interface compatibility but not used here.
    """
    soup = BeautifulSoup(html, "html.parser")
    boilerplate = ["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]
    for junk in soup(boilerplate):
        junk.decompose()
    content_tags = ["p", "article", "section", "main", "h1", "h2", "h3", "blockquote", "li"]
    paragraphs = [
        cleaned
        for node in soup.find_all(content_tags)
        if len(cleaned := clean_text(node.get_text(separator=" "))) > 50
    ]
    title_node = soup.find("title")
    title = "" if title_node is None else clean_text(title_node.get_text())
    return title, paragraphs
def detect_language(text):
    """Best-effort language detection on the first 500 characters of *text*.

    Returns a langdetect language code, or ``"unknown"`` when detection
    fails (e.g. empty or non-linguistic input).
    """
    sample = text[:500]
    try:
        result = detect(sample)
    except LangDetectException:
        result = "unknown"
    return result
def topic_matches(text, keywords):
    """Return True when *text* contains any of *keywords*, case-insensitively.

    An empty keyword list means "no topic filter" and always matches.
    """
    if not keywords:
        return True
    haystack = text.lower()
    for keyword in keywords:
        if keyword.lower() in haystack:
            return True
    return False
def crawl(
    seed_urls_text,
    target_language,
    topic_keywords_text,
    max_pages,
    max_depth,
    delay,
    min_text_length,
    progress=gr.Progress(track_tqdm=False),
):
    """Breadth-first, same-domain web crawl that builds a text dataset.

    Pages are fetched from the seed URLs outward (links are followed only
    within each page's own domain), filtered by minimum text length,
    detected language and topic keywords, then exported as JSON and CSV.

    Parameters
    ----------
    seed_urls_text : str
        Newline-separated start URLs.
    target_language : str
        Language code to keep, or "any" to disable the language filter.
    topic_keywords_text : str
        Comma-separated keywords; empty means no topic filter.
    max_pages, max_depth, delay, min_text_length :
        Crawl limits: page budget, link depth, politeness delay in
        seconds, and minimum extracted-text length in characters.
    progress : gr.Progress
        Gradio progress tracker (injected by the framework).

    Returns
    -------
    tuple
        (stats markdown, log text, JSON file path, CSV file path,
        preview rows) — five values, matching the five wired outputs.
    """
    seed_urls = [u.strip() for u in seed_urls_text.strip().split("\n") if u.strip()]
    if not seed_urls:
        # BUG FIX: this handler feeds five output components, so the early
        # return must also produce five values (it previously returned four,
        # which makes Gradio raise a wrong-number-of-outputs error).
        return "⚠️ Please provide at least one seed URL.", "", None, None, []
    topic_keywords = [k.strip() for k in topic_keywords_text.split(",") if k.strip()] if topic_keywords_text.strip() else []
    lang_filter = target_language if target_language != "any" else None
    visited = set()
    queue = deque()
    for url in seed_urls:
        queue.append((url, 0))  # (url, depth) pairs; seeds start at depth 0
    collected = []
    logs = []
    page_count = 0
    progress(0, desc="Starting crawl...")
    while queue and page_count < max_pages:
        url, depth = queue.popleft()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if "text/html" not in resp.headers.get("content-type", ""):
                continue
            # Servers frequently mislabel charsets; trust the sniffed encoding.
            resp.encoding = resp.apparent_encoding
            html = resp.text
        except Exception as e:  # network errors, timeouts, invalid URLs, ...
            logs.append(f"❌ Failed: {url} — {e}")
            continue
        title, paragraphs = extract_text_from_page(html, url)
        full_text = " ".join(paragraphs)
        if len(full_text) < min_text_length:
            logs.append(f"⏭ Skipped (too short): {url}")
            continue
        detected_lang = detect_language(full_text)
        if lang_filter and detected_lang != lang_filter:
            logs.append(f"⏭ Skipped (lang={detected_lang}): {url}")
            continue
        if not topic_matches(full_text, topic_keywords):
            logs.append(f"⏭ Skipped (topic mismatch): {url}")
            continue
        collected.append({
            "url": url,
            "title": title,
            "language": detected_lang,
            "word_count": len(full_text.split()),
            "paragraphs": paragraphs,
            "text": full_text,
        })
        page_count += 1
        logs.append(f"✅ [{page_count}/{max_pages}] {title[:60] or url} (lang={detected_lang}, words={len(full_text.split())})")
        progress(page_count / max_pages, desc=f"Crawled {page_count}/{max_pages} pages")
        # Enqueue same-domain links for the next depth level.
        if depth < max_depth:
            try:
                soup = BeautifulSoup(html, "html.parser")
                base_domain = urlparse(url).netloc
                for a in soup.find_all("a", href=True):
                    href = urljoin(url, a["href"])
                    parsed = urlparse(href)
                    if parsed.scheme in ("http", "https") and parsed.netloc == base_domain:
                        # Drop the #fragment so the same page isn't queued twice.
                        clean = parsed._replace(fragment="").geturl()
                        if clean not in visited:
                            queue.append((clean, depth + 1))
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. Link extraction remains
                # deliberately best-effort.
                pass
        time.sleep(delay)  # politeness delay between requests
    # Build outputs
    stats = f"""## 📊 Crawl Complete
| Metric | Value |
|--------|-------|
| Pages crawled | {page_count} |
| URLs visited | {len(visited)} |
| Text samples collected | {len(collected)} |
| Total words | {sum(d['word_count'] for d in collected):,} |
| Language filter | {LANGUAGE_NAMES.get(lang_filter, lang_filter) if lang_filter else 'Any'} |
| Topic keywords | {', '.join(topic_keywords) if topic_keywords else 'None (all topics)'} |
"""
    log_text = "\n".join(logs[-200:])  # last 200 log lines
    # JSON output (full text, human-readable, UTF-8 preserved)
    json_data = json.dumps(
        [{"url": d["url"], "title": d["title"], "language": d["language"], "text": d["text"]} for d in collected],
        ensure_ascii=False,
        indent=2
    )
    # CSV output (text truncated to 5000 chars per row to keep rows manageable)
    csv_buf = io.StringIO()
    writer = csv.DictWriter(csv_buf, fieldnames=["url", "title", "language", "word_count", "text"])
    writer.writeheader()
    for d in collected:
        writer.writerow({"url": d["url"], "title": d["title"], "language": d["language"], "word_count": d["word_count"], "text": d["text"][:5000]})
    csv_data = csv_buf.getvalue()
    # Save files so Gradio's File components can serve them for download
    json_path = "/tmp/crawled_dataset.json"
    csv_path = "/tmp/crawled_dataset.csv"
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(json_data)
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write(csv_data)
    preview_rows = []
    for d in collected[:5]:
        preview_rows.append([d["url"], d["title"][:50], d["language"], d["word_count"], d["text"][:200] + "..."])
    return stats, log_text, json_path, csv_path, preview_rows
# ── UI ───────────────────────────────────────────────────────────────────────
# Dark "terminal" theme built on gr.themes.Base: emerald/teal accents, zinc
# neutrals, IBM Plex fonts, with explicit near-black fills for body, blocks
# and inputs layered on via .set().
THEME = gr.themes.Base(
    primary_hue="emerald",
    secondary_hue="teal",
    neutral_hue="zinc",
    font=[gr.themes.GoogleFont("IBM Plex Mono"), gr.themes.GoogleFont("IBM Plex Sans"), "sans-serif"],
).set(
    body_background_fill="#0f1117",
    body_text_color="#e2e8f0",
    block_background_fill="#1a1f2e",
    block_border_color="#2d3748",
    input_background_fill="#0d1117",
    input_border_color="#374151",
)
css = """
:root {
--accent: #10b981;
--accent-dim: #065f46;
--bg-card: #1a1f2e;
--text-muted: #6b7280;
}
.gradio-container { max-width: 1200px !important; margin: auto; }
h1.title {
font-family: 'IBM Plex Mono', monospace !important;
font-size: 2rem;
color: #10b981;
letter-spacing: -0.03em;
text-align: center;
margin: 1rem 0 0.25rem;
}
.subtitle {
text-align: center;
color: #6b7280;
font-family: 'IBM Plex Sans', sans-serif;
font-size: 0.9rem;
margin-bottom: 1.5rem;
}
.section-label {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: #10b981;
margin-bottom: 0.25rem;
}
.crawl-btn {
background: linear-gradient(135deg, #10b981, #059669) !important;
color: white !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 1rem !important;
letter-spacing: 0.05em !important;
border-radius: 4px !important;
height: 48px !important;
}
.crawl-btn:hover {
background: linear-gradient(135deg, #059669, #047857) !important;
transform: translateY(-1px);
box-shadow: 0 4px 20px rgba(16,185,129,0.3) !important;
}
.stop-btn {
font-family: 'IBM Plex Mono', monospace !important;
}
footer { display: none !important; }
"""
# Dropdown choices: "Any Language" first, then every supported language
# sorted alphabetically by its display name.
lang_choices = [("Any Language", "any")] + [(f"{v} ({k})", k) for k, v in sorted(LANGUAGE_NAMES.items(), key=lambda x: x[1])]
# BUG FIX: THEME and css were previously passed to demo.launch(), but
# Blocks.launch() does not accept `theme`/`css` — they are gr.Blocks()
# constructor arguments — so the custom look was never applied (and recent
# Gradio versions raise a TypeError). They now go to the constructor.
# Mojibake in UI strings (e.g. "β–Ά" for "▶", "Β·" for "·") has also been
# repaired to the intended UTF-8 characters.
with gr.Blocks(title="WebCrawler · Dataset Builder", theme=THEME, css=css) as demo:
    gr.HTML("""
    <h1 class='title'>▸ WebCrawler / Dataset Builder</h1>
    <p class='subtitle'>Crawl the web and extract text datasets filtered by language or topic — ready for NLP & LLM training.</p>
    """)
    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>🌐 Seed URLs</div>")
            seed_urls = gr.Textbox(
                label="Seed URLs (one per line)",
                placeholder="https://example.com\nhttps://another-site.org",
                lines=5,
                value="https://en.wikipedia.org/wiki/Artificial_intelligence",
            )
            gr.HTML("<div class='section-label'>🔤 Language Filter</div>")
            target_lang = gr.Dropdown(
                label="Target Language",
                choices=lang_choices,
                value="any",
            )
            gr.HTML("<div class='section-label'>🏷️ Topic Keywords (optional)</div>")
            topic_kw = gr.Textbox(
                label="Keywords (comma-separated)",
                placeholder="machine learning, neural network, AI",
                lines=2,
            )
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>⚙️ Crawl Settings</div>")
            max_pages = gr.Slider(label="Max Pages", minimum=1, maximum=500, value=20, step=1)
            max_depth = gr.Slider(label="Max Depth", minimum=0, maximum=5, value=2, step=1)
            delay = gr.Slider(label="Delay Between Requests (s)", minimum=0.1, maximum=5.0, value=0.5, step=0.1)
            min_len = gr.Slider(label="Min Text Length (chars)", minimum=100, maximum=5000, value=300, step=100)
    with gr.Row():
        run_btn = gr.Button("▶ START CRAWL", elem_classes="crawl-btn", variant="primary")
        stop_btn = gr.Button("⏹ Stop", elem_classes="stop-btn", variant="stop")
    with gr.Tabs():
        with gr.Tab("📊 Summary"):
            stats_md = gr.Markdown("*Results will appear here after crawling.*")
        with gr.Tab("📋 Preview"):
            preview_table = gr.Dataframe(
                headers=["URL", "Title", "Lang", "Words", "Text Preview"],
                label="First 5 Results",
                wrap=True,
            )
        with gr.Tab("📜 Logs"):
            log_box = gr.Textbox(label="Crawl Log", lines=20, max_lines=30)
        with gr.Tab("💾 Download"):
            gr.Markdown("### Download your dataset")
            with gr.Row():
                json_file = gr.File(label="📄 JSON Dataset", file_types=[".json"])
                csv_file = gr.File(label="📊 CSV Dataset", file_types=[".csv"])
    # Keep a handle on the click event so the Stop button can cancel it.
    crawl_event = run_btn.click(
        fn=crawl,
        inputs=[seed_urls, target_lang, topic_kw, max_pages, max_depth, delay, min_len],
        outputs=[stats_md, log_box, json_file, csv_file, preview_table],
    )
    # fn=None: no server-side handler; the click only cancels the running crawl.
    stop_btn.click(fn=None, cancels=[crawl_event])
if __name__ == "__main__":
    demo.launch()