# scripts/scrape_discourse.py
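"""Scrape posts from the IITM Discourse TDS knowledge-base category.

Uses Playwright with a saved auth state (auth.json) to fetch the category
and topic JSON endpoints, keeps topics created between DATE_FROM and
DATE_TO, and writes the plain-text posts to data/discourse_posts.json.
"""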
import os
import json
from datetime import datetime

from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = "https://discourse.onlinedegree.iitm.ac.in"
CATEGORY_ID = 34
CATEGORY_JSON_URL = f"{BASE_URL}/c/courses/tds-kb/{CATEGORY_ID}.json"
AUTH_STATE_FILE = "auth.json"

# Only topics created inside this window are kept.
DATE_FROM = datetime(2025, 1, 1)
DATE_TO = datetime(2025, 4, 14)
def parse_date(date_str):
    """Parse Discourse ISO timestamps, with or without fractional seconds."""
    try:
        return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    except ValueError:
        return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
def login_and_save_auth(playwright):
    """Open a visible browser so the user can log in, then save the session."""
    print("🔐 Login required. Opening browser...")
    browser = playwright.chromium.launch(headless=False)
    context = browser.new_context()
    page = context.new_page()
    page.goto(f"{BASE_URL}/login")
    print("➡️ Log in using Google (IITM email), then press ▶️ in Playwright.")
    page.pause()  # blocks until the user resumes from the Playwright inspector
    context.storage_state(path=AUTH_STATE_FILE)
    print("✅ Login state saved.")
    browser.close()
def is_authenticated(page):
    """Return True if the saved session can still fetch the category JSON."""
    try:
        page.goto(CATEGORY_JSON_URL, timeout=10000)
        page.wait_for_selector("pre", timeout=5000)
        json.loads(page.inner_text("pre"))
        return True
    except (PlaywrightTimeoutError, json.JSONDecodeError):
        return False
def scrape_posts(playwright):
    os.makedirs("data", exist_ok=True)
    browser = playwright.chromium.launch(headless=True)
    context = browser.new_context(storage_state=AUTH_STATE_FILE)
    page = context.new_page()

    # Page through the category JSON until an empty topic list comes back.
    all_topics = []
    page_num = 0
    while True:
        paginated_url = f"{CATEGORY_JSON_URL}?page={page_num}"
        print(f"📦 Fetching page {page_num}")
        page.goto(paginated_url)
        try:
            # Chromium usually renders raw JSON inside a <pre> element.
            data = json.loads(page.inner_text("pre"))
        except Exception:
            data = json.loads(page.content())
        topics = data.get("topic_list", {}).get("topics", [])
        if not topics:
            break
        all_topics.extend(topics)
        page_num += 1

    print(f"✅ Found {len(all_topics)} topics")

    # Keep topics created inside the date window, then pull every post in them.
    filtered_posts = []
    for topic in all_topics:
        created_at = parse_date(topic["created_at"])
        if DATE_FROM <= created_at <= DATE_TO:
            topic_url = f"{BASE_URL}/t/{topic['slug']}/{topic['id']}.json"
            page.goto(topic_url)
            try:
                topic_data = json.loads(page.inner_text("pre"))
            except Exception:
                topic_data = json.loads(page.content())
            posts = topic_data.get("post_stream", {}).get("posts", [])
            for post in posts:
                filtered_posts.append({
                    "topic_id": topic["id"],
                    "topic_title": topic.get("title"),
                    "author": post["username"],
                    "created_at": post["created_at"],
                    # "cooked" is the rendered HTML body; strip it down to text.
                    "content": BeautifulSoup(post["cooked"], "html.parser").get_text()
                })

    with open("data/discourse_posts.json", "w", encoding="utf-8") as f:
        json.dump(filtered_posts, f, indent=2)
    print(f"✅ Saved {len(filtered_posts)} posts")
    browser.close()
def main():
    with sync_playwright() as p:
        # Reuse the saved session if it exists and still works; otherwise log in again.
        if not os.path.exists(AUTH_STATE_FILE):
            login_and_save_auth(p)
        else:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(storage_state=AUTH_STATE_FILE)
            page = context.new_page()
            if not is_authenticated(page):
                browser.close()
                login_and_save_auth(p)
            else:
                browser.close()
        scrape_posts(p)


if __name__ == "__main__":
    main()
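
# A minimal run sketch (assumed setup, not part of the original file):
#   pip install playwright beautifulsoup4
#   playwright install chromium
#   python scripts/scrape_discourse.py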