Spaces:

ghost613
/

WebCrawler-Dataset-Builder

Sleeping

File size: 12,110 Bytes

import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import json
import csv
import io
import re
from collections import deque
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

DetectorFactory.seed = 42

LANGUAGE_NAMES = {
    "en": "English", "fr": "French", "de": "German", "es": "Spanish",
    "it": "Italian", "pt": "Portuguese", "nl": "Dutch", "pl": "Polish",
    "ru": "Russian", "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
    "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "hi": "Hindi",
    "tr": "Turkish", "sv": "Swedish", "da": "Danish", "fi": "Finnish",
    "no": "Norwegian", "cs": "Czech", "hu": "Hungarian", "ro": "Romanian",
    "uk": "Ukrainian", "vi": "Vietnamese", "th": "Thai", "id": "Indonesian",
    "ms": "Malay", "bg": "Bulgarian", "hr": "Croatian", "sk": "Slovak",
    "sl": "Slovenian", "lt": "Lithuanian", "lv": "Latvian", "et": "Estonian",
    "he": "Hebrew", "fa": "Persian", "bn": "Bengali", "ur": "Urdu",
    "sw": "Swahili", "ca": "Catalan", "af": "Afrikaans",
}

HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DatasetCrawler/1.0; +https://huggingface.co/spaces)",
    "Accept-Language": "en-US,en;q=0.9",
}

def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[^\S\n]+', ' ', text)
    return text

def extract_text_from_page(html, url):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]):
        tag.decompose()
    paragraphs = []
    for tag in soup.find_all(["p", "article", "section", "main", "h1", "h2", "h3", "blockquote", "li"]):
        text = clean_text(tag.get_text(separator=" "))
        if len(text) > 50:
            paragraphs.append(text)
    title_tag = soup.find("title")
    title = clean_text(title_tag.get_text()) if title_tag else ""
    return title, paragraphs

def detect_language(text):
    try:
        return detect(text[:500])
    except LangDetectException:
        return "unknown"

def topic_matches(text, keywords):
    if not keywords:
        return True
    text_lower = text.lower()
    return any(kw.lower() in text_lower for kw in keywords)

def crawl(
    seed_urls_text,
    target_language,
    topic_keywords_text,
    max_pages,
    max_depth,
    delay,
    min_text_length,
    progress=gr.Progress(track_tqdm=False),
):
    seed_urls = [u.strip() for u in seed_urls_text.strip().split("\n") if u.strip()]
    if not seed_urls:
        return "⚠️ Please provide at least one seed URL.", "", None, None

    topic_keywords = [k.strip() for k in topic_keywords_text.split(",") if k.strip()] if topic_keywords_text.strip() else []
    lang_filter = target_language if target_language != "any" else None

    visited = set()
    queue = deque()
    for url in seed_urls:
        queue.append((url, 0))

    collected = []
    logs = []
    page_count = 0

    progress(0, desc="Starting crawl...")

    while queue and page_count < max_pages:
        url, depth = queue.popleft()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)

        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if "text/html" not in resp.headers.get("content-type", ""):
                continue
            resp.encoding = resp.apparent_encoding
            html = resp.text
        except Exception as e:
            logs.append(f"❌ Failed: {url} — {e}")
            continue

        title, paragraphs = extract_text_from_page(html, url)
        full_text = " ".join(paragraphs)

        if len(full_text) < min_text_length:
            logs.append(f"⏭ Skipped (too short): {url}")
            continue

        detected_lang = detect_language(full_text)

        if lang_filter and detected_lang != lang_filter:
            logs.append(f"⏭ Skipped (lang={detected_lang}): {url}")
            continue

        if not topic_matches(full_text, topic_keywords):
            logs.append(f"⏭ Skipped (topic mismatch): {url}")
            continue

        collected.append({
            "url": url,
            "title": title,
            "language": detected_lang,
            "word_count": len(full_text.split()),
            "paragraphs": paragraphs,
            "text": full_text,
        })
        page_count += 1
        logs.append(f"✅ [{page_count}/{max_pages}] {title[:60] or url} (lang={detected_lang}, words={len(full_text.split())})")

        progress(page_count / max_pages, desc=f"Crawled {page_count}/{max_pages} pages")

        # Enqueue links
        if depth < max_depth:
            try:
                soup = BeautifulSoup(html, "html.parser")
                base_domain = urlparse(url).netloc
                for a in soup.find_all("a", href=True):
                    href = urljoin(url, a["href"])
                    parsed = urlparse(href)
                    if parsed.scheme in ("http", "https") and parsed.netloc == base_domain:
                        clean = parsed._replace(fragment="").geturl()
                        if clean not in visited:
                            queue.append((clean, depth + 1))
            except:
                pass

        time.sleep(delay)

    # Build outputs
    stats = f"""## 📊 Crawl Complete

| Metric | Value |
|--------|-------|
| Pages crawled | {page_count} |
| URLs visited | {len(visited)} |
| Text samples collected | {len(collected)} |
| Total words | {sum(d['word_count'] for d in collected):,} |
| Language filter | {LANGUAGE_NAMES.get(lang_filter, lang_filter) if lang_filter else 'Any'} |
| Topic keywords | {', '.join(topic_keywords) if topic_keywords else 'None (all topics)'} |
"""

    log_text = "\n".join(logs[-200:])  # last 200 log lines

    # JSON output
    json_data = json.dumps(
        [{"url": d["url"], "title": d["title"], "language": d["language"], "text": d["text"]} for d in collected],
        ensure_ascii=False,
        indent=2
    )

    # CSV output
    csv_buf = io.StringIO()
    writer = csv.DictWriter(csv_buf, fieldnames=["url", "title", "language", "word_count", "text"])
    writer.writeheader()
    for d in collected:
        writer.writerow({"url": d["url"], "title": d["title"], "language": d["language"], "word_count": d["word_count"], "text": d["text"][:5000]})
    csv_data = csv_buf.getvalue()

    # Save files
    json_path = "/tmp/crawled_dataset.json"
    csv_path = "/tmp/crawled_dataset.csv"
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(json_data)
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write(csv_data)

    preview_rows = []
    for d in collected[:5]:
        preview_rows.append([d["url"], d["title"][:50], d["language"], d["word_count"], d["text"][:200] + "..."])

    return stats, log_text, json_path, csv_path, preview_rows


# ── UI ───────────────────────────────────────────────────────────────────────

THEME = gr.themes.Base(
    primary_hue="emerald",
    secondary_hue="teal",
    neutral_hue="zinc",
    font=[gr.themes.GoogleFont("IBM Plex Mono"), gr.themes.GoogleFont("IBM Plex Sans"), "sans-serif"],
).set(
    body_background_fill="#0f1117",
    body_text_color="#e2e8f0",
    block_background_fill="#1a1f2e",
    block_border_color="#2d3748",
    input_background_fill="#0d1117",
    input_border_color="#374151",
)

css = """
:root {
    --accent: #10b981;
    --accent-dim: #065f46;
    --bg-card: #1a1f2e;
    --text-muted: #6b7280;
}

.gradio-container { max-width: 1200px !important; margin: auto; }

h1.title {
    font-family: 'IBM Plex Mono', monospace !important;
    font-size: 2rem;
    color: #10b981;
    letter-spacing: -0.03em;
    text-align: center;
    margin: 1rem 0 0.25rem;
}

.subtitle {
    text-align: center;
    color: #6b7280;
    font-family: 'IBM Plex Sans', sans-serif;
    font-size: 0.9rem;
    margin-bottom: 1.5rem;
}

.section-label {
    font-family: 'IBM Plex Mono', monospace;
    font-size: 0.7rem;
    text-transform: uppercase;
    letter-spacing: 0.1em;
    color: #10b981;
    margin-bottom: 0.25rem;
}

.crawl-btn {
    background: linear-gradient(135deg, #10b981, #059669) !important;
    color: white !important;
    font-family: 'IBM Plex Mono', monospace !important;
    font-size: 1rem !important;
    letter-spacing: 0.05em !important;
    border-radius: 4px !important;
    height: 48px !important;
}

.crawl-btn:hover {
    background: linear-gradient(135deg, #059669, #047857) !important;
    transform: translateY(-1px);
    box-shadow: 0 4px 20px rgba(16,185,129,0.3) !important;
}

.stop-btn {
    font-family: 'IBM Plex Mono', monospace !important;
}

footer { display: none !important; }
"""

lang_choices = [("Any Language", "any")] + [(f"{v} ({k})", k) for k, v in sorted(LANGUAGE_NAMES.items(), key=lambda x: x[1])]

with gr.Blocks(title="WebCrawler · Dataset Builder") as demo:
    gr.HTML("""
    <h1 class='title'>▸ WebCrawler / Dataset Builder</h1>
    <p class='subtitle'>Crawl the web and extract text datasets filtered by language or topic — ready for NLP & LLM training.</p>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>🌐 Seed URLs</div>")
            seed_urls = gr.Textbox(
                label="Seed URLs (one per line)",
                placeholder="https://example.com\nhttps://another-site.org",
                lines=5,
                value="https://en.wikipedia.org/wiki/Artificial_intelligence",
            )

            gr.HTML("<div class='section-label'>🔤 Language Filter</div>")
            target_lang = gr.Dropdown(
                label="Target Language",
                choices=lang_choices,
                value="any",
            )

            gr.HTML("<div class='section-label'>🏷️ Topic Keywords (optional)</div>")
            topic_kw = gr.Textbox(
                label="Keywords (comma-separated)",
                placeholder="machine learning, neural network, AI",
                lines=2,
            )

        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>⚙️ Crawl Settings</div>")
            max_pages = gr.Slider(label="Max Pages", minimum=1, maximum=500, value=20, step=1)
            max_depth = gr.Slider(label="Max Depth", minimum=0, maximum=5, value=2, step=1)
            delay = gr.Slider(label="Delay Between Requests (s)", minimum=0.1, maximum=5.0, value=0.5, step=0.1)
            min_len = gr.Slider(label="Min Text Length (chars)", minimum=100, maximum=5000, value=300, step=100)

    with gr.Row():
        run_btn = gr.Button("▶  START CRAWL", elem_classes="crawl-btn", variant="primary")
        stop_btn = gr.Button("⏹ Stop", elem_classes="stop-btn", variant="stop")

    with gr.Tabs():
        with gr.Tab("📊 Summary"):
            stats_md = gr.Markdown("*Results will appear here after crawling.*")

        with gr.Tab("📋 Preview"):
            preview_table = gr.Dataframe(
                headers=["URL", "Title", "Lang", "Words", "Text Preview"],
                label="First 5 Results",
                wrap=True,
            )

        with gr.Tab("📜 Logs"):
            log_box = gr.Textbox(label="Crawl Log", lines=20, max_lines=30)

        with gr.Tab("💾 Download"):
            gr.Markdown("### Download your dataset")
            with gr.Row():
                json_file = gr.File(label="📄 JSON Dataset", file_types=[".json"])
                csv_file = gr.File(label="📊 CSV Dataset", file_types=[".csv"])

    crawl_event = run_btn.click(
        fn=crawl,
        inputs=[seed_urls, target_lang, topic_kw, max_pages, max_depth, delay, min_len],
        outputs=[stats_md, log_box, json_file, csv_file, preview_table],
    )
    stop_btn.click(fn=None, cancels=[crawl_event])

if __name__ == "__main__":
    demo.launch(theme=THEME, css=css)