import re
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from api import (
    PKL_FILE,
    audio_text,
    generate_item,
    get_embedding,
    load_pkl,
    save_pkl,
)
from prompts import SEGMENT_Prompt, STYLE_Prompt, SUMMARY_Prompt

# Base URL against which relative article links on the old blog are resolved.
OLD_BASE_URL = "https://saratoga623.hatenablog.com/"
# Paged contents endpoint; the URL ends in "page=", so a page number is
# presumably appended by the caller -- TODO confirm against the call site.
NEW_API_URL = "https://note.com/api/v2/creators/saratoga623/contents?kind=note&page="


def get_page_content(page_url, timeout=30):
    """Download *page_url* and return its body as text.

    The response encoding is overridden with the encoding that requests
    detects from the body (``apparent_encoding``) before the text is decoded.

    Args:
        page_url: URL to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The decoded response body.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout elapses.
    """
    response = requests.get(page_url, timeout=timeout)
    response.encoding = response.apparent_encoding
    return response.text


def parse_homepage(html):
    soup = BeautifulSoup(html, "html.parser")
    articles = soup.find_all("article", class_="entry")
    blog_infos = []
    for article in articles:
        title_tag = article.find("h1", class_="entry-title")
        if title_tag and title_tag.find("a"):
            link = urljoin(OLD_BASE_URL, title_tag.find("a")["href"])
        else:
            continue
        #