import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import json
import csv
import io
import re
from collections import deque
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fixed seed so langdetect gives deterministic results across runs.
DetectorFactory.seed = 42

# ISO 639-1 code -> human-readable name (used for the dropdown and the summary).
LANGUAGE_NAMES = {
    "en": "English", "fr": "French", "de": "German", "es": "Spanish",
    "it": "Italian", "pt": "Portuguese", "nl": "Dutch", "pl": "Polish",
    "ru": "Russian", "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
    "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "hi": "Hindi",
    "tr": "Turkish", "sv": "Swedish", "da": "Danish", "fi": "Finnish",
    "no": "Norwegian", "cs": "Czech", "hu": "Hungarian", "ro": "Romanian",
    "uk": "Ukrainian", "vi": "Vietnamese", "th": "Thai", "id": "Indonesian",
    "ms": "Malay", "bg": "Bulgarian", "hr": "Croatian", "sk": "Slovak",
    "sl": "Slovenian", "lt": "Lithuanian", "lv": "Latvian", "et": "Estonian",
    "he": "Hebrew", "fa": "Persian", "bn": "Bengali", "ur": "Urdu",
    "sw": "Swahili", "ca": "Catalan", "af": "Afrikaans",
}

# Honest bot identification; some sites reject requests without a User-Agent.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DatasetCrawler/1.0; +https://huggingface.co/spaces)",
    "Accept-Language": "en-US,en;q=0.9",
}


def clean_text(text):
    """Collapse all runs of whitespace (including newlines) into single spaces and strip."""
    # A single pass suffices: \s+ already covers spaces, tabs, and newlines.
    return re.sub(r'\s+', ' ', text).strip()


def extract_text_from_page(html, url):
    """Parse *html* and return ``(title, paragraphs)``.

    Boilerplate containers (scripts, nav, footers, forms, ...) are removed
    before extraction; only text chunks longer than 50 characters from
    content-bearing tags are kept.  *url* is accepted for interface
    compatibility but not used here.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content elements so they don't pollute the extracted text.
    for tag in soup(["script", "style", "nav", "footer", "header", "aside",
                     "noscript", "form"]):
        tag.decompose()
    paragraphs = []
    for tag in soup.find_all(["p", "article", "section", "main", "h1", "h2",
                              "h3", "blockquote", "li"]):
        text = clean_text(tag.get_text(separator=" "))
        if len(text) > 50:  # skip menu items, captions, and other short fragments
            paragraphs.append(text)
    title_tag = soup.find("title")
    title = clean_text(title_tag.get_text()) if title_tag else ""
    return title, paragraphs


def detect_language(text):
    """Return the ISO 639-1 code langdetect infers for *text*, or "unknown".

    Only the first 500 characters are examined — enough for a reliable
    guess and much faster on long pages.
    """
    try:
        return detect(text[:500])
    except LangDetectException:
        # Raised for empty/ambiguous input (e.g. pure numbers or symbols).
        return "unknown"


def topic_matches(text, keywords):
    """True if *text* contains any keyword (case-insensitive); True when no keywords given."""
    if not keywords:
        return True
    text_lower = text.lower()
    return any(kw.lower() in text_lower for kw in keywords)


def crawl(
    seed_urls_text,
    target_language,
    topic_keywords_text,
    max_pages,
    max_depth,
    delay,
    min_text_length,
    progress=gr.Progress(track_tqdm=False),
):
    """Breadth-first crawl from the seed URLs, collecting filtered text samples.

    Parameters mirror the UI controls: newline-separated seeds, a language
    code (or "any"), comma-separated topic keywords, page/depth limits, a
    politeness delay in seconds, and a minimum text length in characters.

    Returns a 5-tuple matching the click handler's outputs:
    (markdown stats, log text, JSON file path, CSV file path, preview rows).
    """
    seed_urls = [u.strip() for u in seed_urls_text.strip().split("\n") if u.strip()]
    if not seed_urls:
        # Must return one value per wired output component (5), otherwise
        # Gradio fails to unpack the result.
        return "⚠️ Please provide at least one seed URL.", "", None, None, []

    topic_keywords = (
        [k.strip() for k in topic_keywords_text.split(",") if k.strip()]
        if topic_keywords_text.strip() else []
    )
    lang_filter = target_language if target_language != "any" else None

    visited = set()
    queue = deque()
    for url in seed_urls:
        queue.append((url, 0))

    collected = []
    logs = []
    page_count = 0
    progress(0, desc="Starting crawl...")

    while queue and page_count < max_pages:
        url, depth = queue.popleft()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)

        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if "text/html" not in resp.headers.get("content-type", ""):
                continue
            # apparent_encoding (chardet-based) handles pages with wrong or
            # missing charset headers.
            resp.encoding = resp.apparent_encoding
            html = resp.text
        except Exception as e:
            logs.append(f"❌ Failed: {url} — {e}")
            continue

        title, paragraphs = extract_text_from_page(html, url)
        full_text = " ".join(paragraphs)

        # Filter pipeline: length -> language -> topic.  Skipped pages count
        # toward `visited` but not toward `max_pages`, and their links are
        # not followed.
        if len(full_text) < min_text_length:
            logs.append(f"⏭ Skipped (too short): {url}")
            continue

        detected_lang = detect_language(full_text)
        if lang_filter and detected_lang != lang_filter:
            logs.append(f"⏭ Skipped (lang={detected_lang}): {url}")
            continue

        if not topic_matches(full_text, topic_keywords):
            logs.append(f"⏭ Skipped (topic mismatch): {url}")
            continue

        collected.append({
            "url": url,
            "title": title,
            "language": detected_lang,
            "word_count": len(full_text.split()),
            "paragraphs": paragraphs,
            "text": full_text,
        })
        page_count += 1
        logs.append(
            f"✅ [{page_count}/{max_pages}] {title[:60] or url} "
            f"(lang={detected_lang}, words={len(full_text.split())})"
        )
        progress(page_count / max_pages, desc=f"Crawled {page_count}/{max_pages} pages")

        # Enqueue same-domain links found on accepted pages.
        if depth < max_depth:
            try:
                soup = BeautifulSoup(html, "html.parser")
                base_domain = urlparse(url).netloc
                for a in soup.find_all("a", href=True):
                    href = urljoin(url, a["href"])
                    parsed = urlparse(href)
                    # Same-domain http(s) links only; strip fragments so
                    # page#a and page#b aren't crawled twice.
                    if parsed.scheme in ("http", "https") and parsed.netloc == base_domain:
                        clean = parsed._replace(fragment="").geturl()
                        if clean not in visited:
                            queue.append((clean, depth + 1))
            except Exception as e:
                # Best-effort link extraction: log and move on rather than
                # aborting the crawl over one malformed page.
                logs.append(f"⚠️ Link extraction failed: {url} — {e}")

        # Politeness delay between successive fetches.
        time.sleep(delay)

    # Build outputs
    stats = f"""## 📊 Crawl Complete

| Metric | Value |
|--------|-------|
| Pages crawled | {page_count} |
| URLs visited | {len(visited)} |
| Text samples collected | {len(collected)} |
| Total words | {sum(d['word_count'] for d in collected):,} |
| Language filter | {LANGUAGE_NAMES.get(lang_filter, lang_filter) if lang_filter else 'Any'} |
| Topic keywords | {', '.join(topic_keywords) if topic_keywords else 'None (all topics)'} |
"""

    log_text = "\n".join(logs[-200:])  # last 200 log lines

    # JSON output
    json_data = json.dumps(
        [{"url": d["url"], "title": d["title"], "language": d["language"],
          "text": d["text"]} for d in collected],
        ensure_ascii=False, indent=2
    )

    # CSV output (text truncated to 5000 chars per row to keep the file manageable)
    csv_buf = io.StringIO()
    writer = csv.DictWriter(
        csv_buf, fieldnames=["url", "title", "language", "word_count", "text"]
    )
    writer.writeheader()
    for d in collected:
        writer.writerow({
            "url": d["url"],
            "title": d["title"],
            "language": d["language"],
            "word_count": d["word_count"],
            "text": d["text"][:5000],
        })
    csv_data = csv_buf.getvalue()

    # Save files so Gradio's File components can serve them for download.
    json_path = "/tmp/crawled_dataset.json"
    csv_path = "/tmp/crawled_dataset.csv"
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(json_data)
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write(csv_data)

    preview_rows = []
    for d in collected[:5]:
        preview_rows.append([
            d["url"], d["title"][:50], d["language"],
            d["word_count"], d["text"][:200] + "...",
        ])

    return stats, log_text, json_path, csv_path, preview_rows


# ── UI ───────────────────────────────────────────────────────────────────────
# Dark emerald/mono theme for the app.
THEME = gr.themes.Base(
    primary_hue="emerald",
    secondary_hue="teal",
    neutral_hue="zinc",
    font=[gr.themes.GoogleFont("IBM Plex Mono"),
          gr.themes.GoogleFont("IBM Plex Sans"), "sans-serif"],
).set(
    body_background_fill="#0f1117",
    body_text_color="#e2e8f0",
    block_background_fill="#1a1f2e",
    block_border_color="#2d3748",
    input_background_fill="#0d1117",
    input_border_color="#374151",
)

css = """
:root {
  --accent: #10b981;
  --accent-dim: #065f46;
  --bg-card: #1a1f2e;
  --text-muted: #6b7280;
}
.gradio-container { max-width: 1200px !important; margin: auto; }
h1.title {
  font-family: 'IBM Plex Mono', monospace !important;
  font-size: 2rem; color: #10b981; letter-spacing: -0.03em;
  text-align: center; margin: 1rem 0 0.25rem;
}
.subtitle {
  text-align: center; color: #6b7280;
  font-family: 'IBM Plex Sans', sans-serif;
  font-size: 0.9rem; margin-bottom: 1.5rem;
}
.section-label {
  font-family: 'IBM Plex Mono', monospace;
  font-size: 0.7rem; text-transform: uppercase;
  letter-spacing: 0.1em; color: #10b981; margin-bottom: 0.25rem;
}
.crawl-btn {
  background: linear-gradient(135deg, #10b981, #059669) !important;
  color: white !important;
  font-family: 'IBM Plex Mono', monospace !important;
  font-size: 1rem !important; letter-spacing: 0.05em !important;
  border-radius: 4px !important; height: 48px !important;
}
.crawl-btn:hover {
  background: linear-gradient(135deg, #059669, #047857) !important;
  transform: translateY(-1px);
  box-shadow: 0 4px 20px rgba(16,185,129,0.3) !important;
}
.stop-btn { font-family: 'IBM Plex Mono', monospace !important; }
footer { display: none !important; }
"""

# (display label, language code) pairs, sorted by display name, with an
# "Any Language" catch-all first.
lang_choices = [("Any Language", "any")] + [
    (f"{v} ({k})", k) for k, v in sorted(LANGUAGE_NAMES.items(), key=lambda x: x[1])
]

# NOTE(review): theme and css are gr.Blocks(...) constructor parameters, not
# launch() parameters — passing them to launch() raises a TypeError, so they
# are supplied here instead.
with gr.Blocks(title="WebCrawler · Dataset Builder", theme=THEME, css=css) as demo:
    # NOTE(review): the HTML markup below was reconstructed from the visible
    # text; the class names match the selectors defined in `css` above.
    gr.HTML("""
        <h1 class="title">▸ WebCrawler / Dataset Builder</h1>
        <p class="subtitle">Crawl the web and extract text datasets filtered by
        language or topic — ready for NLP &amp; LLM training.</p>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML('<div class="section-label">🌐 Seed URLs</div>')
            seed_urls = gr.Textbox(
                label="Seed URLs (one per line)",
                placeholder="https://example.com\nhttps://another-site.org",
                lines=5,
                value="https://en.wikipedia.org/wiki/Artificial_intelligence",
            )
            gr.HTML('<div class="section-label">🔤 Language Filter</div>')
            target_lang = gr.Dropdown(
                label="Target Language",
                choices=lang_choices,
                value="any",
            )
            gr.HTML('<div class="section-label">🏷️ Topic Keywords (optional)</div>')
            topic_kw = gr.Textbox(
                label="Keywords (comma-separated)",
                placeholder="machine learning, neural network, AI",
                lines=2,
            )
        with gr.Column(scale=1):
            gr.HTML('<div class="section-label">⚙️ Crawl Settings</div>')
            max_pages = gr.Slider(label="Max Pages", minimum=1, maximum=500,
                                  value=20, step=1)
            max_depth = gr.Slider(label="Max Depth", minimum=0, maximum=5,
                                  value=2, step=1)
            delay = gr.Slider(label="Delay Between Requests (s)", minimum=0.1,
                              maximum=5.0, value=0.5, step=0.1)
            min_len = gr.Slider(label="Min Text Length (chars)", minimum=100,
                                maximum=5000, value=300, step=100)

    with gr.Row():
        run_btn = gr.Button("▶ START CRAWL", elem_classes="crawl-btn",
                            variant="primary")
        stop_btn = gr.Button("⏹ Stop", elem_classes="stop-btn", variant="stop")

    with gr.Tabs():
        with gr.Tab("📊 Summary"):
            stats_md = gr.Markdown("*Results will appear here after crawling.*")
        with gr.Tab("📋 Preview"):
            preview_table = gr.Dataframe(
                headers=["URL", "Title", "Lang", "Words", "Text Preview"],
                label="First 5 Results",
                wrap=True,
            )
        with gr.Tab("📜 Logs"):
            log_box = gr.Textbox(label="Crawl Log", lines=20, max_lines=30)
        with gr.Tab("💾 Download"):
            gr.Markdown("### Download your dataset")
            with gr.Row():
                json_file = gr.File(label="📄 JSON Dataset", file_types=[".json"])
                csv_file = gr.File(label="📊 CSV Dataset", file_types=[".csv"])

    # Wire the crawl; keep the event handle so the Stop button can cancel it.
    crawl_event = run_btn.click(
        fn=crawl,
        inputs=[seed_urls, target_lang, topic_kw, max_pages, max_depth, delay, min_len],
        outputs=[stats_md, log_box, json_file, csv_file, preview_table],
    )
    stop_btn.click(fn=None, cancels=[crawl_event])

if __name__ == "__main__":
    demo.launch()