Spaces:
Sleeping
Sleeping
| import httpx | |
| from typing import Dict, Any, Optional | |
| import logging | |
| from src.extractor import extract_from_apollo_state | |
| from src.config import Config | |
| logger = logging.getLogger("AuthorProfile") | |
| async def scrape_author_profile(username: str) -> Optional[Dict[str, Any]]: | |
| """ | |
| Actively scrapes the author's profile and 'About' page to build a DNA profile. | |
| """ | |
| if not username: | |
| return None | |
| # Remove @ if present | |
| username = username.replace("@", "") | |
| profile_data = { | |
| "username": username, | |
| "about_page_data": None, | |
| "latest_headlines": [] | |
| } | |
| # 1. Scrape About Page | |
| about_url = f"https://medium.com/@{username}/about" | |
| try: | |
| async with httpx.AsyncClient(timeout=Config.TIMEOUT_MS/1000, follow_redirects=True) as client: | |
| resp = await client.get(about_url, headers=Config.get_headers()) | |
| if resp.status_code == 200: | |
| # Try Apollo first | |
| apollo_data = extract_from_apollo_state(resp.text) | |
| if apollo_data: | |
| # In Apollo state of About page, we look for the User object | |
| # The extract_from_apollo_state function is tuned for Articles (looking for Post). | |
| # We might need to extract User data manually here if the helper fails. | |
| # But let's see if we can reuse the raw json_state. | |
| raw_state = apollo_data.get("json_state") # This is actually the Post object in the helper. | |
| # We need the FULL state. | |
| pass | |
| # If helper didn't give us what we want (because it looks for Post), | |
| # we should probably parse the raw Apollo state ourselves here for User data. | |
| # Let's do a custom extraction for User. | |
| import re | |
| import json | |
| pattern = r'window\.__APOLLO_STATE__\s*=\s*' | |
| match = re.search(pattern, resp.text) | |
| if match: | |
| json_str = resp.text[match.end():] | |
| data, _ = json.JSONDecoder().raw_decode(json_str) | |
| # Find User object | |
| user_obj = None | |
| for key, value in data.items(): | |
| if key.startswith("User:") and value.get("username") == username: | |
| user_obj = value | |
| break | |
| if user_obj: | |
| profile_data["bio"] = user_obj.get("bio") | |
| profile_data["followerCount"] = user_obj.get("socialStats", {}).get("followerCount") | |
| profile_data["createdAt"] = user_obj.get("createdAt") | |
| profile_data["isBookAuthor"] = user_obj.get("isBookAuthor") | |
| except Exception as e: | |
| logger.warning(f"Failed to scrape About page for {username}: {e}") | |
| # 2. Scrape Profile Page for Headlines | |
| profile_url = f"https://medium.com/@{username}" | |
| try: | |
| async with httpx.AsyncClient(timeout=Config.TIMEOUT_MS/1000, follow_redirects=True) as client: | |
| resp = await client.get(profile_url, headers=Config.get_headers()) | |
| if resp.status_code == 200: | |
| import re | |
| import json | |
| pattern = r'window\.__APOLLO_STATE__\s*=\s*' | |
| match = re.search(pattern, resp.text) | |
| if match: | |
| json_str = resp.text[match.end():] | |
| data, _ = json.JSONDecoder().raw_decode(json_str) | |
| # Find Posts | |
| headlines = [] | |
| for key, value in data.items(): | |
| if key.startswith("Post:") and value.get("title"): | |
| headlines.append(value.get("title")) | |
| profile_data["latest_headlines"] = headlines[:10] | |
| except Exception as e: | |
| logger.warning(f"Failed to scrape Profile page for {username}: {e}") | |
| return profile_data | |