ml-agent / eval /scrape_discussions /discussions_scraper.py
akseljoonas's picture
akseljoonas HF Staff
thinking if we want eval or not
2b6a536
import sys
import time
from pathlib import Path
import requests
from tenacity import (
retry,
retry_if_exception_type,
stop_after_attempt,
wait_exponential,
)
# Add parent directory to path to import models
sys.path.insert(0, str(Path(__file__).parent.parent))
from models import Discussion, QuestionAndSolution
# Root URL of the Hugging Face discussion forum; the search and topic URLs
# used in this module are built from it.
BASE_URL = "https://discuss.huggingface.co"
# Retry failed requests (HTTP error statuses) with exponential backoff,
# giving up after 5 attempts.
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=1, max=60),
    retry=retry_if_exception_type(requests.HTTPError),
)
def safe_get(url, **kwargs):
    """GET *url* and return the response, raising on HTTP error statuses.

    On a 429 (Too Many Requests) response the function sleeps for the
    number of seconds given by the ``Retry-After`` header (30 seconds when
    the header is missing or is not a plain number) before raising, so the
    surrounding ``@retry`` decorator re-issues the request only after the
    server-requested pause.

    Args:
        url: The URL to fetch.
        **kwargs: Passed through unchanged to ``requests.get``.

    Returns:
        The ``requests.Response`` for a successful request.

    Raises:
        requests.HTTPError: If the response has an error status; the
            decorator retries on this exception.
    """
    resp = requests.get(url, **kwargs)
    if resp.status_code == 429:
        retry_after = resp.headers.get("Retry-After")
        try:
            delay = float(retry_after) if retry_after else 30
        except ValueError:
            # Retry-After may also be an HTTP-date; fall back to the default
            # guess instead of crashing with an exception that is not retried.
            delay = 30
        print(f"429 hit — waiting {delay} seconds...")
        time.sleep(delay)
    # Raises for 429 as well as every other 4xx/5xx status.
    resp.raise_for_status()
    return resp
def get_solved_discussions(n_posts: int = 50):
    """Collect up to *n_posts* solved topics from the forum search endpoint.

    Pages through ``search.json`` with the ``status:solved order:latest``
    query, starting at page 1, until enough topics are gathered or a page
    comes back without topics.

    Args:
        n_posts: Maximum number of discussions to return. Values <= 0 yield
            an empty list without issuing any request.

    Returns:
        A list of ``Discussion`` objects, at most *n_posts* long.

    Raises:
        requests.HTTPError: Propagated from ``safe_get`` once its retries
            are exhausted.
    """
    page = 1
    discussions = []
    while len(discussions) < n_posts:
        url = f"{BASE_URL}/search.json?q=status:solved+order:latest&page={page}"
        resp = safe_get(url)
        # Treat a missing "topics" key the same as an empty list: both mean
        # there is nothing more to collect.
        topics = resp.json().get("topics")
        if not topics:
            break
        # Only take as many topics from this page as are still needed.
        remaining = n_posts - len(discussions)
        for post in topics[:remaining]:
            discussions.append(
                Discussion(
                    title=post["fancy_title"],
                    url=f"{BASE_URL}/t/{post['slug']}/{post['id']}",
                    topic_id=post["id"],
                    category=post["category_id"],
                    created_at=post["created_at"],
                )
            )
        page += 1
        time.sleep(0.5)  # simple pacing to avoid bursts
    return discussions
def get_qa_pair(discussions, start_idx: int = 0):
    """Yield a ``QuestionAndSolution`` for each discussion from *start_idx* on.

    For every discussion the topic JSON (``<url>.json``) is fetched; the
    first post in the stream is used as the question and the post flagged
    as the accepted answer as the solution.

    Args:
        discussions: Sequence of objects exposing ``url``, ``title``,
            ``topic_id``, ``category`` and ``created_at``.
        start_idx: Index of the first discussion to process.

    Yields:
        One ``QuestionAndSolution`` per processed discussion.

    Raises:
        requests.HTTPError: Propagated from ``safe_get`` once its retries
            are exhausted.
        KeyError: If the topic JSON lacks ``post_stream`` or
            ``accepted_answer``.
        IndexError: If the topic JSON contains no posts.
    """
    for discussion in discussions[start_idx:]:
        resp = safe_get(discussion.url + ".json")
        data = resp.json()
        posts = data["post_stream"]["posts"]
        accepted_number = data["accepted_answer"]["post_number"]

        # Locate the accepted answer by its post_number rather than assuming
        # post_number - 1 is its position in the list: the two differ as soon
        # as the numbering has gaps.
        accepted_post = next(
            (p for p in posts if p.get("post_number") == accepted_number),
            None,
        )
        if accepted_post is None:
            # Accepted post is not among the loaded posts; fall back to the
            # clamped positional guess so a record is still produced.
            fallback_idx = min(max(accepted_number - 1, 0), len(posts) - 1)
            accepted_post = posts[fallback_idx]

        yield QuestionAndSolution(
            discussion_title=discussion.title,
            discussion_url=discussion.url,
            discussion_topic_id=discussion.topic_id,
            discussion_category=discussion.category,
            discussion_created_at=discussion.created_at,
            question=posts[0]["cooked"],
            solution=accepted_post["cooked"],
            thread=posts,
        )
        # Pace requests; runs when the consumer asks for the next item.
        time.sleep(0.5)
if __name__ == "__main__":
    # Script entry point: fetch up to 300 solved discussions and append one
    # JSON line per question/solution pair to qa_pairs.jsonl.
    discussions = get_solved_discussions(n_posts=300)
    print(f"Fetched {len(discussions)} discussions")
    # Explicit UTF-8 so non-ASCII forum text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open("qa_pairs.jsonl", "a", encoding="utf-8") as f:
        for qa_pair in get_qa_pair(discussions):
            f.write(qa_pair.model_dump_json() + "\n")