Spaces:
Running
Running
| import sys | |
| import time | |
| from pathlib import Path | |
| import requests | |
| from tenacity import ( | |
| retry, | |
| retry_if_exception_type, | |
| stop_after_attempt, | |
| wait_exponential, | |
| ) | |
| # Add parent directory to path to import models | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from models import Discussion, QuestionAndSolution | |
| BASE_URL = "https://discuss.huggingface.co" | |
| # configure retry decorator for your requests | |
def safe_get(url, **kwargs):
    """GET ``url`` with ``requests``, waiting and retrying when rate limited.

    On HTTP 429 (Too Many Requests) the function sleeps for the number of
    seconds given by the ``Retry-After`` response header (30 seconds when the
    header is missing or is not a plain number) and then issues the request
    again, up to a fixed number of attempts.

    Args:
        url: Address to fetch.
        **kwargs: Forwarded unchanged to ``requests.get``. A ``timeout`` of
            30 seconds is supplied when the caller does not pass one, so a
            stalled connection cannot block forever.

    Returns:
        The ``requests`` response object of the first non-error reply.

    Raises:
        requests.HTTPError: For any 4xx/5xx reply, including a 429 that is
            still returned on the last attempt.
    """
    max_attempts = 5
    fallback_delay = 30  # seconds to wait when Retry-After is unusable

    kwargs.setdefault("timeout", 30)

    for attempt in range(1, max_attempts + 1):
        resp = requests.get(url, **kwargs)
        # Only a rate-limit reply is worth waiting for; on the last attempt
        # fall through so raise_for_status() reports the failure.
        if resp.status_code != 429 or attempt == max_attempts:
            break

        retry_after = resp.headers.get("Retry-After")
        try:
            delay = float(retry_after) if retry_after else fallback_delay
        except ValueError:
            # Retry-After may also be an HTTP-date; use the fallback then.
            delay = fallback_delay
        delay = max(delay, 0)  # time.sleep() rejects negative values

        print(f"429 hit — waiting {delay} seconds...")
        time.sleep(delay)

    resp.raise_for_status()
    return resp
def get_solved_discussions(n_posts: int = 50) -> list:
    """Collect up to ``n_posts`` solved topics from the forum search endpoint.

    Pages of ``{BASE_URL}/search.json`` (query ``status:solved order:latest``)
    are requested one after another, starting at page 1, until enough topics
    have been gathered or a page yields no topics.

    Args:
        n_posts: Maximum number of discussions to return.

    Returns:
        A list of ``Discussion`` objects, at most ``n_posts`` long, in the
        order the search endpoint returned them.

    Raises:
        requests.HTTPError: Propagated from ``safe_get`` on an error reply.
    """
    page = 1
    discussions = []
    while len(discussions) < n_posts:
        url = f"{BASE_URL}/search.json?q=status:solved+order:latest&page={page}"
        resp = safe_get(url)
        # NOTE(review): a page past the last result may omit "topics"
        # altogether instead of sending an empty list — treat both as "done".
        topics = resp.json().get("topics") or []
        if not topics:
            break

        # Take only as many topics as are still needed to reach n_posts.
        still_needed = n_posts - len(discussions)
        for post in topics[:still_needed]:
            discussions.append(
                Discussion(
                    title=post["fancy_title"],
                    url=f"{BASE_URL}/t/{post['slug']}/{post['id']}",
                    topic_id=post["id"],
                    category=post["category_id"],
                    created_at=post["created_at"],
                )
            )

        page += 1
        time.sleep(0.5)  # simple pacing to avoid bursts
    return discussions
def get_qa_pair(discussions, start_idx: int = 0):
    """Yield one ``QuestionAndSolution`` per discussion, fetched lazily.

    For each discussion from ``start_idx`` onward the topic JSON
    (``discussion.url + ".json"``) is downloaded. The first post in the
    returned post stream is used as the question and the accepted answer as
    the solution.

    Args:
        discussions: Sequence of objects exposing ``url``, ``title``,
            ``topic_id``, ``category`` and ``created_at``.
        start_idx: Index of the first discussion to process; earlier entries
            are skipped.

    Yields:
        ``QuestionAndSolution`` built from the discussion metadata, the HTML
        (``"cooked"``) of the question and solution posts, and the list of
        posts from the response as ``thread``.

    Raises:
        requests.HTTPError: Propagated from ``safe_get`` on an error reply.
        KeyError: If the topic JSON lacks ``post_stream`` or
            ``accepted_answer``.
    """
    for discussion in discussions[start_idx:]:
        resp = safe_get(discussion.url + ".json")
        data = resp.json()
        posts = data["post_stream"]["posts"]
        accepted_number = data["accepted_answer"]["post_number"]

        # Match on the post's own number rather than its list position:
        # the two differ whenever post numbers are not contiguous.
        accepted_post = next(
            (p for p in posts if p.get("post_number") == accepted_number),
            None,
        )
        if accepted_post is None:
            # Accepted post is not among the returned posts; fall back to the
            # nearest valid list position, clamped to the available range.
            fallback_idx = min(max(accepted_number - 1, 0), len(posts) - 1)
            accepted_post = posts[fallback_idx]

        question = posts[0]["cooked"]
        solution = accepted_post["cooked"]
        yield QuestionAndSolution(
            discussion_title=discussion.title,
            discussion_url=discussion.url,
            discussion_topic_id=discussion.topic_id,
            discussion_category=discussion.category,
            discussion_created_at=discussion.created_at,
            question=question,
            solution=solution,
            thread=posts,
        )
        # Pause before the next topic request (runs when the consumer asks
        # for the next item).
        time.sleep(0.5)
if __name__ == "__main__":
    # Script entry point: fetch up to 300 solved discussions, then append one
    # JSON document per question/solution pair to qa_pairs.jsonl.
    discussions = get_solved_discussions(n_posts=300)
    print(f"Fetched {len(discussions)} discussions")
    # Explicit UTF-8 so non-ASCII forum text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open("qa_pairs.jsonl", "a", encoding="utf-8") as f:
        # get_qa_pair is a generator, so each pair is written as soon as its
        # topic has been downloaded.
        for qa_pair in get_qa_pair(discussions):
            f.write(qa_pair.model_dump_json() + "\n")