# WebCrawler / Dataset Builder — Gradio Space
# (provenance: uploaded by ghost613, "Upload 2 files", commit 7b91172 verified)
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
import json
import csv
import io
import re
from collections import deque
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# Fixed seed makes langdetect's probabilistic detector deterministic across runs.
DetectorFactory.seed = 42
# ISO 639-1 code -> human-readable name. Used both to populate the language
# dropdown and to render the filter name in the summary table. langdetect
# distinguishes simplified/traditional Chinese via region suffixes, hence
# "zh-cn" / "zh-tw".
LANGUAGE_NAMES = {
    "en": "English", "fr": "French", "de": "German", "es": "Spanish",
    "it": "Italian", "pt": "Portuguese", "nl": "Dutch", "pl": "Polish",
    "ru": "Russian", "zh-cn": "Chinese (Simplified)", "zh-tw": "Chinese (Traditional)",
    "ja": "Japanese", "ko": "Korean", "ar": "Arabic", "hi": "Hindi",
    "tr": "Turkish", "sv": "Swedish", "da": "Danish", "fi": "Finnish",
    "no": "Norwegian", "cs": "Czech", "hu": "Hungarian", "ro": "Romanian",
    "uk": "Ukrainian", "vi": "Vietnamese", "th": "Thai", "id": "Indonesian",
    "ms": "Malay", "bg": "Bulgarian", "hr": "Croatian", "sk": "Slovak",
    "sl": "Slovenian", "lt": "Lithuanian", "lv": "Latvian", "et": "Estonian",
    "he": "Hebrew", "fa": "Persian", "bn": "Bengali", "ur": "Urdu",
    "sw": "Swahili", "ca": "Catalan", "af": "Afrikaans",
}
# Sent with every HTTP request: identifies the crawler honestly (with a
# contact URL, per crawler etiquette) and asks for English content negotiation.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; DatasetCrawler/1.0; +https://huggingface.co/spaces)",
    "Accept-Language": "en-US,en;q=0.9",
}
def clean_text(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into a
    single space and strip leading/trailing whitespace.

    The original implementation ran a second substitution
    (``[^\\S\\n]+`` -> ``" "``) afterwards, but after the first pass no
    newlines or multi-character whitespace runs can remain, so that line
    was a no-op and has been removed.
    """
    return re.sub(r'\s+', ' ', text).strip()
def extract_text_from_page(html, url):
    """Parse *html* and return ``(title, paragraphs)``.

    Boilerplate containers (scripts, styles, navigation, footers, ...) are
    stripped first, then text is collected from content-bearing tags,
    keeping only fragments longer than 50 characters. *url* is accepted
    for interface compatibility but not used here.
    """
    soup = BeautifulSoup(html, "html.parser")
    boilerplate = ["script", "style", "nav", "footer", "header", "aside", "noscript", "form"]
    for junk in soup(boilerplate):
        junk.decompose()
    content_tags = ["p", "article", "section", "main", "h1", "h2", "h3", "blockquote", "li"]
    paragraphs = [
        cleaned
        for node in soup.find_all(content_tags)
        if len(cleaned := clean_text(node.get_text(separator=" "))) > 50
    ]
    title_node = soup.find("title")
    title = "" if title_node is None else clean_text(title_node.get_text())
    return title, paragraphs
def detect_language(text):
    """Best-effort language detection on the first 500 characters of *text*.

    Returns a langdetect language code, or ``"unknown"`` when detection
    fails (e.g. empty or non-linguistic input).
    """
    sample = text[:500]
    try:
        result = detect(sample)
    except LangDetectException:
        result = "unknown"
    return result
def topic_matches(text, keywords):
    """Return True when *text* contains any of *keywords*, case-insensitively.

    An empty keyword list means "no topic filter" and always matches.
    """
    if not keywords:
        return True
    haystack = text.lower()
    for keyword in keywords:
        if keyword.lower() in haystack:
            return True
    return False
def crawl(
    seed_urls_text,
    target_language,
    topic_keywords_text,
    max_pages,
    max_depth,
    delay,
    min_text_length,
    progress=gr.Progress(track_tqdm=False),
):
    """Breadth-first, same-domain web crawl that builds a text dataset.

    Pages are fetched from the seed URLs outward (links are followed only
    within each page's own domain), filtered by minimum text length,
    detected language and topic keywords, then exported as JSON and CSV.

    Parameters
    ----------
    seed_urls_text : str
        Newline-separated start URLs.
    target_language : str
        Language code to keep, or "any" to disable the language filter.
    topic_keywords_text : str
        Comma-separated keywords; empty means no topic filter.
    max_pages, max_depth, delay, min_text_length :
        Crawl limits: page budget, link depth, politeness delay in
        seconds, and minimum extracted-text length in characters.
    progress : gr.Progress
        Gradio progress tracker (injected by the framework).

    Returns
    -------
    tuple
        (stats markdown, log text, JSON file path, CSV file path,
        preview rows) — five values, matching the five wired outputs.
    """
    seed_urls = [u.strip() for u in seed_urls_text.strip().split("\n") if u.strip()]
    if not seed_urls:
        # BUG FIX: this handler feeds five output components, so the early
        # return must also produce five values (it previously returned four,
        # which makes Gradio raise a wrong-number-of-outputs error).
        return "⚠️ Please provide at least one seed URL.", "", None, None, []
    topic_keywords = [k.strip() for k in topic_keywords_text.split(",") if k.strip()] if topic_keywords_text.strip() else []
    lang_filter = target_language if target_language != "any" else None
    visited = set()
    queue = deque()
    for url in seed_urls:
        queue.append((url, 0))  # (url, depth) pairs; seeds start at depth 0
    collected = []
    logs = []
    page_count = 0
    progress(0, desc="Starting crawl...")
    while queue and page_count < max_pages:
        url, depth = queue.popleft()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if "text/html" not in resp.headers.get("content-type", ""):
                continue
            # Servers frequently mislabel charsets; trust the sniffed encoding.
            resp.encoding = resp.apparent_encoding
            html = resp.text
        except Exception as e:  # network errors, timeouts, invalid URLs, ...
            logs.append(f"❌ Failed: {url} — {e}")
            continue
        title, paragraphs = extract_text_from_page(html, url)
        full_text = " ".join(paragraphs)
        if len(full_text) < min_text_length:
            logs.append(f"⏭ Skipped (too short): {url}")
            continue
        detected_lang = detect_language(full_text)
        if lang_filter and detected_lang != lang_filter:
            logs.append(f"⏭ Skipped (lang={detected_lang}): {url}")
            continue
        if not topic_matches(full_text, topic_keywords):
            logs.append(f"⏭ Skipped (topic mismatch): {url}")
            continue
        collected.append({
            "url": url,
            "title": title,
            "language": detected_lang,
            "word_count": len(full_text.split()),
            "paragraphs": paragraphs,
            "text": full_text,
        })
        page_count += 1
        logs.append(f"✅ [{page_count}/{max_pages}] {title[:60] or url} (lang={detected_lang}, words={len(full_text.split())})")
        progress(page_count / max_pages, desc=f"Crawled {page_count}/{max_pages} pages")
        # Enqueue same-domain links for the next depth level.
        if depth < max_depth:
            try:
                soup = BeautifulSoup(html, "html.parser")
                base_domain = urlparse(url).netloc
                for a in soup.find_all("a", href=True):
                    href = urljoin(url, a["href"])
                    parsed = urlparse(href)
                    if parsed.scheme in ("http", "https") and parsed.netloc == base_domain:
                        # Drop the #fragment so the same page isn't queued twice.
                        clean = parsed._replace(fragment="").geturl()
                        if clean not in visited:
                            queue.append((clean, depth + 1))
            except Exception:
                # BUG FIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit. Link extraction remains
                # deliberately best-effort.
                pass
        time.sleep(delay)  # politeness delay between requests
    # Build outputs
    stats = f"""## 📊 Crawl Complete
| Metric | Value |
|--------|-------|
| Pages crawled | {page_count} |
| URLs visited | {len(visited)} |
| Text samples collected | {len(collected)} |
| Total words | {sum(d['word_count'] for d in collected):,} |
| Language filter | {LANGUAGE_NAMES.get(lang_filter, lang_filter) if lang_filter else 'Any'} |
| Topic keywords | {', '.join(topic_keywords) if topic_keywords else 'None (all topics)'} |
"""
    log_text = "\n".join(logs[-200:])  # last 200 log lines
    # JSON output (full text, human-readable, UTF-8 preserved)
    json_data = json.dumps(
        [{"url": d["url"], "title": d["title"], "language": d["language"], "text": d["text"]} for d in collected],
        ensure_ascii=False,
        indent=2
    )
    # CSV output (text truncated to 5000 chars per row to keep rows manageable)
    csv_buf = io.StringIO()
    writer = csv.DictWriter(csv_buf, fieldnames=["url", "title", "language", "word_count", "text"])
    writer.writeheader()
    for d in collected:
        writer.writerow({"url": d["url"], "title": d["title"], "language": d["language"], "word_count": d["word_count"], "text": d["text"][:5000]})
    csv_data = csv_buf.getvalue()
    # Save files so Gradio's File components can serve them for download
    json_path = "/tmp/crawled_dataset.json"
    csv_path = "/tmp/crawled_dataset.csv"
    with open(json_path, "w", encoding="utf-8") as f:
        f.write(json_data)
    with open(csv_path, "w", encoding="utf-8") as f:
        f.write(csv_data)
    preview_rows = []
    for d in collected[:5]:
        preview_rows.append([d["url"], d["title"][:50], d["language"], d["word_count"], d["text"][:200] + "..."])
    return stats, log_text, json_path, csv_path, preview_rows
# ── UI ───────────────────────────────────────────────────────────────────────
# Dark "terminal" theme built on gr.themes.Base: emerald/teal accents, zinc
# neutrals, IBM Plex fonts, with explicit near-black fills for body, blocks
# and inputs layered on via .set().
THEME = gr.themes.Base(
    primary_hue="emerald",
    secondary_hue="teal",
    neutral_hue="zinc",
    font=[gr.themes.GoogleFont("IBM Plex Mono"), gr.themes.GoogleFont("IBM Plex Sans"), "sans-serif"],
).set(
    body_background_fill="#0f1117",
    body_text_color="#e2e8f0",
    block_background_fill="#1a1f2e",
    block_border_color="#2d3748",
    input_background_fill="#0d1117",
    input_border_color="#374151",
)
css = """
:root {
--accent: #10b981;
--accent-dim: #065f46;
--bg-card: #1a1f2e;
--text-muted: #6b7280;
}
.gradio-container { max-width: 1200px !important; margin: auto; }
h1.title {
font-family: 'IBM Plex Mono', monospace !important;
font-size: 2rem;
color: #10b981;
letter-spacing: -0.03em;
text-align: center;
margin: 1rem 0 0.25rem;
}
.subtitle {
text-align: center;
color: #6b7280;
font-family: 'IBM Plex Sans', sans-serif;
font-size: 0.9rem;
margin-bottom: 1.5rem;
}
.section-label {
font-family: 'IBM Plex Mono', monospace;
font-size: 0.7rem;
text-transform: uppercase;
letter-spacing: 0.1em;
color: #10b981;
margin-bottom: 0.25rem;
}
.crawl-btn {
background: linear-gradient(135deg, #10b981, #059669) !important;
color: white !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 1rem !important;
letter-spacing: 0.05em !important;
border-radius: 4px !important;
height: 48px !important;
}
.crawl-btn:hover {
background: linear-gradient(135deg, #059669, #047857) !important;
transform: translateY(-1px);
box-shadow: 0 4px 20px rgba(16,185,129,0.3) !important;
}
.stop-btn {
font-family: 'IBM Plex Mono', monospace !important;
}
footer { display: none !important; }
"""
# Dropdown choices: "Any Language" first, then every supported language
# sorted alphabetically by its display name.
lang_choices = [("Any Language", "any")] + [(f"{v} ({k})", k) for k, v in sorted(LANGUAGE_NAMES.items(), key=lambda x: x[1])]
# BUG FIX: THEME and css were previously passed to demo.launch(), but
# Blocks.launch() does not accept `theme`/`css` — they are gr.Blocks()
# constructor arguments — so the custom look was never applied (and recent
# Gradio versions raise a TypeError). They now go to the constructor.
# Mojibake in UI strings (e.g. "β–Ά" for "▶", "Β·" for "·") has also been
# repaired to the intended UTF-8 characters.
with gr.Blocks(title="WebCrawler · Dataset Builder", theme=THEME, css=css) as demo:
    gr.HTML("""
    <h1 class='title'>▸ WebCrawler / Dataset Builder</h1>
    <p class='subtitle'>Crawl the web and extract text datasets filtered by language or topic — ready for NLP & LLM training.</p>
    """)
    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>🌐 Seed URLs</div>")
            seed_urls = gr.Textbox(
                label="Seed URLs (one per line)",
                placeholder="https://example.com\nhttps://another-site.org",
                lines=5,
                value="https://en.wikipedia.org/wiki/Artificial_intelligence",
            )
            gr.HTML("<div class='section-label'>🔤 Language Filter</div>")
            target_lang = gr.Dropdown(
                label="Target Language",
                choices=lang_choices,
                value="any",
            )
            gr.HTML("<div class='section-label'>🏷️ Topic Keywords (optional)</div>")
            topic_kw = gr.Textbox(
                label="Keywords (comma-separated)",
                placeholder="machine learning, neural network, AI",
                lines=2,
            )
        with gr.Column(scale=1):
            gr.HTML("<div class='section-label'>⚙️ Crawl Settings</div>")
            max_pages = gr.Slider(label="Max Pages", minimum=1, maximum=500, value=20, step=1)
            max_depth = gr.Slider(label="Max Depth", minimum=0, maximum=5, value=2, step=1)
            delay = gr.Slider(label="Delay Between Requests (s)", minimum=0.1, maximum=5.0, value=0.5, step=0.1)
            min_len = gr.Slider(label="Min Text Length (chars)", minimum=100, maximum=5000, value=300, step=100)
    with gr.Row():
        run_btn = gr.Button("▶ START CRAWL", elem_classes="crawl-btn", variant="primary")
        stop_btn = gr.Button("⏹ Stop", elem_classes="stop-btn", variant="stop")
    with gr.Tabs():
        with gr.Tab("📊 Summary"):
            stats_md = gr.Markdown("*Results will appear here after crawling.*")
        with gr.Tab("📋 Preview"):
            preview_table = gr.Dataframe(
                headers=["URL", "Title", "Lang", "Words", "Text Preview"],
                label="First 5 Results",
                wrap=True,
            )
        with gr.Tab("📜 Logs"):
            log_box = gr.Textbox(label="Crawl Log", lines=20, max_lines=30)
        with gr.Tab("💾 Download"):
            gr.Markdown("### Download your dataset")
            with gr.Row():
                json_file = gr.File(label="📄 JSON Dataset", file_types=[".json"])
                csv_file = gr.File(label="📊 CSV Dataset", file_types=[".csv"])
    # Keep a handle on the click event so the Stop button can cancel it.
    crawl_event = run_btn.click(
        fn=crawl,
        inputs=[seed_urls, target_lang, topic_kw, max_pages, max_depth, delay, min_len],
        outputs=[stats_md, log_box, json_file, csv_file, preview_table],
    )
    # fn=None: no server-side handler; the click only cancels the running crawl.
    stop_btn.click(fn=None, cancels=[crawl_event])
if __name__ == "__main__":
    demo.launch()