Spaces:

letitbE
/

image2wiki

Sleeping

File size: 7,449 Bytes

ece157f

"""
Сбор данных с ru.wikipedia.org для fine-tuning CLIP.
Собирает случайные статьи с изображениями — пары (картинка, текст).

Использование:
    python collect_data.py
    python collect_data.py --max-total 10000
    python collect_data.py --max-total 10000 --resume
"""

import argparse
import hashlib
import json
import time
from pathlib import Path
from urllib.parse import unquote

import requests
from tqdm import tqdm

API_URL = "https://ru.wikipedia.org/w/api.php"
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "https://ru.wikipedia.org/",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
})

SKIP_IMAGE_EXTENSIONS = {".svg", ".gif", ".ogg", ".ogv", ".webm", ".pdf", ".djvu"}

DATA_DIR = Path("data")
IMAGES_DIR = DATA_DIR / "images"
METADATA_FILE = DATA_DIR / "metadata.jsonl"
CHECKPOINT_FILE = DATA_DIR / "checkpoint.json"


def api_query(**params):
    """Запрос к MediaWiki API с rate limiting."""
    params.setdefault("format", "json")
    params.setdefault("action", "query")
    time.sleep(0.1)
    resp = SESSION.get(API_URL, params=params, timeout=30)
    resp.raise_for_status()
    return resp.json()


def get_random_titles(count: int = 20) -> list[str]:
    """Получить случайные заголовки статей (namespace 0 = основные статьи)."""
    data = api_query(list="random", rnnamespace=0, rnlimit=count)
    return [p["title"] for p in data.get("query", {}).get("random", [])]


def get_article_data(titles: list[str]) -> dict:
    """Получить extract + thumbnail для пачки статей (до 20)."""
    data = api_query(
        titles="|".join(titles),
        prop="extracts|pageimages",
        exintro=True,
        explaintext=True,
        exsectionformat="plain",
        piprop="thumbnail",
        pithumbsize=512,
        pilimit="max",
    )
    pages = data.get("query", {}).get("pages", {})
    results = {}
    for page_id, page in pages.items():
        if int(page_id) < 0:
            continue
        title = page.get("title", "")
        extract = page.get("extract", "").strip()
        thumb = page.get("thumbnail", {})
        image_url = thumb.get("source", "")
        results[title] = {"extract": extract, "image_url": image_url}
    return results


def download_image(url: str, save_path: Path, max_retries: int = 3) -> bool:
    """Скачать изображение с retry и exponential backoff."""
    for attempt in range(max_retries):
        try:
            time.sleep(0.2 + attempt * 2)
            resp = SESSION.get(url, timeout=30, stream=True)
            if resp.status_code == 429:
                wait = int(resp.headers.get("Retry-After", 5 * (attempt + 1)))
                tqdm.write(f"  ⏳ Rate limited, waiting {wait}s...")
                time.sleep(wait)
                continue
            resp.raise_for_status()
            with open(save_path, "wb") as f:
                for chunk in resp.iter_content(8192):
                    f.write(chunk)
            return True
        except requests.exceptions.HTTPError as e:
            if "429" in str(e) and attempt < max_retries - 1:
                time.sleep(5 * (attempt + 1))
                continue
            tqdm.write(f"  ⚠ Download failed: {e}")
            return False
        except Exception as e:
            tqdm.write(f"  ⚠ Download failed: {e}")
            return False
    return False


def image_filename(title: str, url: str) -> str:
    ext = Path(unquote(url)).suffix.lower().split("?")[0]
    if not ext or len(ext) > 5:
        ext = ".jpg"
    safe_name = hashlib.md5(title.encode()).hexdigest()[:12]
    return f"{safe_name}{ext}"


def load_checkpoint() -> set[str]:
    if CHECKPOINT_FILE.exists():
        with open(CHECKPOINT_FILE) as f:
            return set(json.load(f).get("collected_titles", []))
    return set()


def save_checkpoint(collected: set[str]):
    with open(CHECKPOINT_FILE, "w") as f:
        json.dump({"collected_titles": list(collected)}, f, ensure_ascii=False)


def main():
    parser = argparse.ArgumentParser(description="Collect random Wikipedia image-text pairs")
    parser.add_argument("--max-total", type=int, default=10000, help="Total pairs to collect")
    parser.add_argument("--resume", action="store_true", help="Resume from checkpoint")
    args = parser.parse_args()

    IMAGES_DIR.mkdir(parents=True, exist_ok=True)

    collected = load_checkpoint() if args.resume else set()
    mode = "a" if args.resume and METADATA_FILE.exists() else "w"

    total = len(collected)
    skipped = 0
    pbar = tqdm(total=args.max_total, initial=total, desc="Collecting")

    with open(METADATA_FILE, mode, encoding="utf-8") as meta_f:
        while total < args.max_total:
            # Берём пачку случайных статей
            random_titles = get_random_titles(20)
            # Фильтруем уже собранные
            new_titles = [t for t in random_titles if t not in collected]
            if not new_titles:
                continue

            # Получаем данные статей
            article_data = get_article_data(new_titles)

            for title, info in article_data.items():
                if total >= args.max_total:
                    break
                if title in collected:
                    continue

                extract = info["extract"]
                image_url = info["image_url"]

                # Пропуск статей без текста или картинки
                if not extract or len(extract) < 50:
                    skipped += 1
                    continue
                if not image_url:
                    skipped += 1
                    continue

                # Пропуск не-фото форматов
                ext = Path(unquote(image_url)).suffix.lower().split("?")[0]
                if ext in SKIP_IMAGE_EXTENSIONS:
                    skipped += 1
                    continue

                # Скачиваем
                fname = image_filename(title, image_url)
                img_path = IMAGES_DIR / fname

                if not img_path.exists():
                    if not download_image(image_url, img_path):
                        skipped += 1
                        continue

                record = {
                    "title": title,
                    "text": extract,
                    "image_path": str(img_path),
                    "image_url": image_url,
                }
                meta_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                meta_f.flush()

                collected.add(title)
                total += 1
                pbar.update(1)

            # Checkpoint каждые 100 статей
            if total % 100 < 20:
                save_checkpoint(collected)
                pbar.set_postfix(skipped=skipped)

    save_checkpoint(collected)
    pbar.close()
    print(f"\nDone! Collected {total} pairs (skipped {skipped} without image/text).")
    print(f"Images: {IMAGES_DIR}")
    print(f"Metadata: {METADATA_FILE}")


if __name__ == "__main__":
    main()