Spaces:
Sleeping
Sleeping
add new blogs
Browse files- .gitignore +4 -1
- api.py +88 -0
- app.py +2 -1
- blog_class.py +12 -232
- design.py +1 -1
- get_blog.py +289 -0
- prompts.py +39 -32
- requirements.txt +2 -1
- resource/2025-04-08-変化を阻害するもの.mp3 +3 -0
- resource/2025-04-09-目的を意識するのはスキルです.mp3 +3 -0
- resource/2025-04-10-文章をダイエットさせる.mp3 +3 -0
- resource/2025-04-11-仕事は決めてなんぼ.mp3 +3 -0
- resource/2025-04-12-精神論も大事です.mp3 +3 -0
- resource/2025-04-14-ヒューマンエラーはゼロになりません.mp3 +3 -0
- resource/2025-04-15-良い行動で良い結果を出す.mp3 +3 -0
- resource/2025-04-16-新幹線の運休と障害訓練.mp3 +3 -0
- resource/2025-04-17-バッファと保険の違い.mp3 +3 -0
- resource/2025-04-18-無理難題に対する対処.mp3 +3 -0
- resource/2025-04-19-2位じゃダメなんでしょうか?.mp3 +3 -0
- resource/2025-04-21-例え話でピンとくる説明をAIで実現.mp3 +3 -0
- resource/2025-04-22-人を変えるのではなく、関わり方を変える.mp3 +3 -0
- resource/2025-04-23-ひとつの経験から多くを学ぶ.mp3 +3 -0
- resource/2025-04-24-失敗の目的.mp3 +3 -0
- resource/2025-04-25-計画書ではふわっとした表現は禁止です.mp3 +3 -0
- resource/2025-04-28-シンプルな判断・決断が出来るように.mp3 +3 -0
- resource/2025-04-28-使える情報と使えない情報.mp3 +3 -0
- resource/2025-04-30-資料は紙芝居のように。ストーリーを作る重要性。.mp3 +3 -0
- resource/2025-05-01-マネジメントが難しい理由.mp3 +3 -0
- resource/2025-05-02-情報を使える情報にする方法.mp3 +3 -0
- resource/2025-05-07-お知らせとお詫び.mp3 +3 -0
- resource/2025-05-07-深く考えることは脳にとって重労働。でも大事なこと。.mp3 +3 -0
- resource/2025-05-08-脱!上司と部下の板挟み.mp3 +3 -0
- resource/2025-05-09-ゴールから逆引きする発想.mp3 +3 -0
- resource/2025-05-12-伝わらない前提で考える.mp3 +3 -0
- resource/2025-05-13-イライラする時は自分の強みを見つけるチャンス.mp3 +3 -0
- resource/knowledge_data.pkl +2 -2
.gitignore
CHANGED
|
@@ -2,4 +2,7 @@ __pycache__
|
|
| 2 |
old
|
| 3 |
.env
|
| 4 |
resource/QA*
|
| 5 |
-
resource.zip
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
old
|
| 3 |
.env
|
| 4 |
resource/QA*
|
| 5 |
+
resource.zip
|
| 6 |
+
faq
|
| 7 |
+
.gradio
|
| 8 |
+
*.old
|
api.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from fish_audio_sdk import Session, TTSRequest
|
| 7 |
+
from openai import OpenAI
|
| 8 |
+
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 9 |
+
|
| 10 |
+
# Load a sibling .env file (when python-dotenv is installed) BEFORE reading any
# environment variable below; otherwise keys that exist only in .env are still
# unset at the moment the API clients are constructed.
try:
    from dotenv import load_dotenv

    _early_env_path = Path(os.path.dirname(__file__)) / ".env"
    if _early_env_path.exists():
        load_dotenv(_early_env_path)
except ImportError:
    # python-dotenv is optional; fall back to the existing process environment.
    pass

# Reference id passed to the TTS request in audio_text().
INFO_audio_ID = os.getenv("INFO_audio_ID")
# OpenAI client; the API key is read from the "gpt" environment variable.
client = OpenAI(api_key=os.getenv("gpt"))
# Fish Audio session; the API key is read from the "audio" environment variable.
audio_client = Session(os.getenv("audio"))

# Location of the pickled knowledge base.
PKL_FILE = "./resource/knowledge_data.pkl"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def load_pkl(file_path):
    """Return the object pickled at ``file_path``, or ``{}`` if the path does not exist.

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only point
    this at files written by this application.
    """
    if not os.path.exists(file_path):
        return {}
    with open(file_path, "rb") as fh:
        stored = pickle.load(fh)
    return stored
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def save_pkl(file_name, data):
    """Pickle ``data`` to ``file_name`` with entries ordered by their date prefix.

    Keys are expected to look like ``YYYY-MM-DD-<title>`` and are written in
    ascending date order. A key whose first three ``-``-separated parts do not
    form a valid date (for example ``unknown_date-<title>``) is placed after
    all dated entries instead of raising ``ValueError``. Entries with the same
    date keep their relative order.

    The pickle is first written to a temporary sibling file and then moved over
    the destination, so an interrupted write cannot leave a truncated file.

    Args:
        file_name: Destination path of the pickle file.
        data: Mapping of entry key -> entry payload.
    """

    def _date_of(entry):
        key = entry[0]
        try:
            return datetime.strptime("-".join(key.split("-")[0:3]), "%Y-%m-%d")
        except (ValueError, AttributeError):
            # Undated (or non-string) keys sort last rather than aborting the save.
            return datetime.max

    sorted_data = dict(sorted(data.items(), key=_date_of))
    tmp_name = f"{file_name}.tmp"
    with open(tmp_name, "wb") as f:
        pickle.dump(sorted_data, f)
    # Replace the destination in one step once the dump is complete.
    os.replace(tmp_name, file_name)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Optional .env support: when python-dotenv is installed and a .env file sits
# next to this module, load it into the process environment.
try:
    from dotenv import load_dotenv

    env_path = Path(os.path.dirname(__file__), ".env")
    if env_path.exists():
        load_dotenv(env_path)
except ImportError:
    # python-dotenv is not installed; rely on the existing environment.
    pass
|
| 45 |
+
# params
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@retry(
    wait=wait_exponential(multiplier=1, min=2, max=10),
    stop=stop_after_attempt(5),
)
def generate_item(prompt, sys_prompt, model="gpt-4.1-mini", temperature=1.0):
    """Run one chat completion and return the reply text with surrounding whitespace removed.

    ``sys_prompt`` is sent as the system message and ``prompt`` as the user
    message. The ``retry`` decorator above re-runs the call on failure: at most
    five attempts with exponential back-off.
    """
    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=conversation,
    )
    reply = completion.choices[0].message.content
    return reply.strip()
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
@retry(
    wait=wait_exponential(multiplier=1, min=2, max=10),
    stop=stop_after_attempt(5),
)
def get_embedding(text):
    """Embed ``text`` with ``text-embedding-3-small`` and return one vector per item in the response.

    Retried by the decorator above (at most five attempts, exponential back-off).
    """
    result = client.embeddings.create(model="text-embedding-3-small", input=text)
    return [item.embedding for item in result.data]
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@retry(
    wait=wait_exponential(multiplier=1, min=2, max=10),
    stop=stop_after_attempt(5),
)
def audio_text(text, audio_path):
    """Synthesize ``text`` to speech, write the streamed bytes to ``audio_path``, and return that path.

    The file is opened in ``"wb"`` mode on every attempt, so a retry starts
    again from an empty file.
    """
    with open(audio_path, "wb") as out:
        stream = audio_client.tts(TTSRequest(reference_id=INFO_audio_ID, text=text))
        out.writelines(stream)
    return audio_path
|
app.py
CHANGED
|
@@ -6,7 +6,8 @@ import time
|
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
-
from
|
|
|
|
| 10 |
from design import custom_css, custom_head, js_func, params_text, title_html, title_text
|
| 11 |
|
| 12 |
knowledge_cls = knowledge_class()
|
|
|
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
+
from api import audio_text
|
| 10 |
+
from blog_class import knowledge_class, save_feedback
|
| 11 |
from design import custom_css, custom_head, js_func, params_text, title_html, title_text
|
| 12 |
|
| 13 |
knowledge_cls = knowledge_class()
|
blog_class.py
CHANGED
|
@@ -1,18 +1,9 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import pickle
|
| 3 |
-
import re
|
| 4 |
import time
|
| 5 |
-
from datetime import datetime
|
| 6 |
-
from pathlib import Path
|
| 7 |
-
from urllib.parse import urljoin
|
| 8 |
|
| 9 |
import numpy as np
|
| 10 |
-
import requests
|
| 11 |
-
from bs4 import BeautifulSoup
|
| 12 |
-
from fish_audio_sdk import Session, TTSRequest
|
| 13 |
-
from openai import OpenAI
|
| 14 |
-
from tenacity import retry, stop_after_attempt, wait_exponential
|
| 15 |
|
|
|
|
|
|
|
| 16 |
from prompts import (
|
| 17 |
Common_text,
|
| 18 |
Creative_text,
|
|
@@ -21,116 +12,10 @@ from prompts import (
|
|
| 21 |
QA_Prompt_template,
|
| 22 |
REWRITE_Prompt,
|
| 23 |
REWRITE_SYS_Prompt,
|
| 24 |
-
SEGMENT_Prompt,
|
| 25 |
Short_text,
|
| 26 |
-
STYLE_Prompt,
|
| 27 |
-
SUMMARY_Prompt,
|
| 28 |
SYS_Prompt,
|
| 29 |
)
|
| 30 |
|
| 31 |
-
try:
|
| 32 |
-
from dotenv import load_dotenv
|
| 33 |
-
|
| 34 |
-
env_path = Path(os.path.dirname(__file__)) / ".env"
|
| 35 |
-
if env_path.exists():
|
| 36 |
-
load_dotenv(env_path)
|
| 37 |
-
except ImportError:
|
| 38 |
-
pass
|
| 39 |
-
# params
|
| 40 |
-
PKL_FILE = "./resource/knowledge_data.pkl"
|
| 41 |
-
INFO_audio_ID = os.getenv("INFO_audio_ID")
|
| 42 |
-
BASE_URL = "https://saratoga623.hatenablog.com/"
|
| 43 |
-
|
| 44 |
-
client = OpenAI(api_key=os.getenv("gpt"))
|
| 45 |
-
|
| 46 |
-
#
|
| 47 |
-
audio_client = Session(os.getenv("audio"))
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
def load_pkl(file_path):
|
| 51 |
-
if os.path.exists(file_path):
|
| 52 |
-
with open(file_path, "rb") as f:
|
| 53 |
-
return pickle.load(f)
|
| 54 |
-
else:
|
| 55 |
-
return {}
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def save_pkl(file_name, data):
|
| 59 |
-
with open(file_name, "wb") as f:
|
| 60 |
-
pickle.dump(data, f)
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
def get_page_content(page_url):
|
| 64 |
-
response = requests.get(page_url)
|
| 65 |
-
response.encoding = response.apparent_encoding
|
| 66 |
-
return response.text
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
def parse_homepage(html):
|
| 70 |
-
|
| 71 |
-
soup = BeautifulSoup(html, "html.parser")
|
| 72 |
-
articles = soup.find_all("article", class_="entry")
|
| 73 |
-
blog_infos = []
|
| 74 |
-
for article in articles:
|
| 75 |
-
title_tag = article.find("h1", class_="entry-title")
|
| 76 |
-
if title_tag and title_tag.find("a"):
|
| 77 |
-
link = urljoin(BASE_URL, title_tag.find("a")["href"])
|
| 78 |
-
else:
|
| 79 |
-
continue
|
| 80 |
-
# <time> タグ内から公開日を取得する
|
| 81 |
-
time_tag = article.find("time")
|
| 82 |
-
if time_tag:
|
| 83 |
-
year_tag = time_tag.find("span", class_="date-year")
|
| 84 |
-
month_tag = time_tag.find("span", class_="date-month")
|
| 85 |
-
day_tag = time_tag.find("span", class_="date-day")
|
| 86 |
-
if year_tag and month_tag and day_tag:
|
| 87 |
-
pub_date = f"{year_tag.get_text(strip=True)}-{month_tag.get_text(strip=True)}-{day_tag.get_text(strip=True)}"
|
| 88 |
-
else:
|
| 89 |
-
pub_date = "unknown_date"
|
| 90 |
-
else:
|
| 91 |
-
pub_date = "unknown_date"
|
| 92 |
-
blog_infos.append({"date": pub_date, "link": link})
|
| 93 |
-
return blog_infos, soup
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
def get_next_page_url(soup):
|
| 97 |
-
next_page_tag = soup.find("a", string="次のページ")
|
| 98 |
-
if next_page_tag and next_page_tag.has_attr("href"):
|
| 99 |
-
return urljoin(BASE_URL, next_page_tag["href"])
|
| 100 |
-
return None
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
def sort_key(info):
|
| 104 |
-
try:
|
| 105 |
-
return datetime.strptime(info["date"], "%Y-%m-%d")
|
| 106 |
-
except Exception:
|
| 107 |
-
return datetime.max
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
def fetch_blog_text(url):
|
| 111 |
-
response = requests.get(url)
|
| 112 |
-
response.encoding = response.apparent_encoding
|
| 113 |
-
soup = BeautifulSoup(response.text, "html.parser")
|
| 114 |
-
|
| 115 |
-
title_tag = soup.find("h1", class_="entry-title")
|
| 116 |
-
title = title_tag.get_text(strip=True) if title_tag else "no_title"
|
| 117 |
-
|
| 118 |
-
content_tag = soup.find("div", class_="entry-content")
|
| 119 |
-
if not content_tag:
|
| 120 |
-
content_tag = soup.find("div", class_="hatenablog-entry")
|
| 121 |
-
if content_tag:
|
| 122 |
-
|
| 123 |
-
for a in content_tag.find_all("a", class_="keyword"):
|
| 124 |
-
a.unwrap()
|
| 125 |
-
content = content_tag.get_text(strip=False).strip()
|
| 126 |
-
else:
|
| 127 |
-
content = ""
|
| 128 |
-
return title, content
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
def sanitize_filename(filename):
|
| 132 |
-
return re.sub(r'[\\/*?:"<>|]', "", filename)
|
| 133 |
-
|
| 134 |
|
| 135 |
def max_cosine_similarity(v1, v2_list):
|
| 136 |
""" """
|
|
@@ -152,49 +37,6 @@ def max_cosine_similarity(v1, v2_list):
|
|
| 152 |
return np.max(similarities)
|
| 153 |
|
| 154 |
|
| 155 |
-
@retry(
|
| 156 |
-
stop=stop_after_attempt(5),
|
| 157 |
-
wait=wait_exponential(multiplier=1, min=2, max=10),
|
| 158 |
-
)
|
| 159 |
-
def generate_item(prompt, sys_prompt, model="gpt-4o-mini", temperature=1.0):
|
| 160 |
-
print("prompt=", prompt)
|
| 161 |
-
print("temperature=", temperature)
|
| 162 |
-
response = client.chat.completions.create(
|
| 163 |
-
model=model,
|
| 164 |
-
temperature=temperature,
|
| 165 |
-
messages=[
|
| 166 |
-
{
|
| 167 |
-
"role": "system",
|
| 168 |
-
"content": sys_prompt,
|
| 169 |
-
},
|
| 170 |
-
{"role": "user", "content": prompt},
|
| 171 |
-
],
|
| 172 |
-
)
|
| 173 |
-
return response.choices[0].message.content.strip()
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
@retry(
|
| 177 |
-
stop=stop_after_attempt(5),
|
| 178 |
-
wait=wait_exponential(multiplier=1, min=2, max=10),
|
| 179 |
-
)
|
| 180 |
-
def get_embedding(text):
|
| 181 |
-
response = client.embeddings.create(input=text, model="text-embedding-3-small")
|
| 182 |
-
return [emb.embedding for emb in response.data]
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
@retry(
|
| 186 |
-
stop=stop_after_attempt(5),
|
| 187 |
-
wait=wait_exponential(multiplier=1, min=2, max=10),
|
| 188 |
-
)
|
| 189 |
-
def audio_text(text, audio_path):
|
| 190 |
-
with open(audio_path, "wb") as f:
|
| 191 |
-
for chunk in audio_client.tts(
|
| 192 |
-
TTSRequest(reference_id=INFO_audio_ID, text=text)
|
| 193 |
-
):
|
| 194 |
-
f.write(chunk)
|
| 195 |
-
return audio_path
|
| 196 |
-
|
| 197 |
-
|
| 198 |
def save_feedback(value, liked):
|
| 199 |
if liked:
|
| 200 |
md_text = "text:\n" + value + "\n" + "liked"
|
|
@@ -209,6 +51,7 @@ def save_feedback(value, liked):
|
|
| 209 |
class knowledge_class:
|
| 210 |
def __init__(self):
|
| 211 |
self.knowledge_data = load_pkl(PKL_FILE)
|
|
|
|
| 212 |
self.reference_dict = self.get_reference_dict()
|
| 213 |
# q_v = self.knowledge_data["2024-10-09-プロジェクト計画で重要視すること"][
|
| 214 |
# "vector"
|
|
@@ -239,75 +82,12 @@ class knowledge_class:
|
|
| 239 |
return reference_dict
|
| 240 |
|
| 241 |
def get_new_knowledge(self):
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
while page_url:
|
| 249 |
-
try:
|
| 250 |
-
html = get_page_content(page_url)
|
| 251 |
-
except Exception as e:
|
| 252 |
-
print(f"{page_url} の取得時にエラーが発生しました: {e}")
|
| 253 |
-
break
|
| 254 |
-
infos, soup = parse_homepage(html)
|
| 255 |
-
new_infos = [info for info in infos if info["link"] not in processed_urls]
|
| 256 |
-
new_blog_infos.extend(new_infos)
|
| 257 |
-
page_url = get_next_page_url(soup)
|
| 258 |
-
|
| 259 |
-
if not new_blog_infos:
|
| 260 |
-
print("新しい記事は見つかりませんでした。")
|
| 261 |
-
else:
|
| 262 |
-
print(
|
| 263 |
-
f"新規記事 {len(new_blog_infos)} 件を検出しました。処理を開始します..."
|
| 264 |
-
)
|
| 265 |
-
new_blog_infos.sort(key=sort_key)
|
| 266 |
-
|
| 267 |
-
for info in new_blog_infos:
|
| 268 |
-
blog_url = info["link"]
|
| 269 |
-
pub_date = info["date"]
|
| 270 |
-
print(f"記事を処理中: {blog_url}")
|
| 271 |
-
try:
|
| 272 |
-
title, content = fetch_blog_text(blog_url)
|
| 273 |
-
key_name = f"{pub_date}-{sanitize_filename(title)}"
|
| 274 |
-
print("記事:", key_name)
|
| 275 |
-
summary_text = generate_item(content, SUMMARY_Prompt)
|
| 276 |
-
style_text = generate_item(content, STYLE_Prompt)
|
| 277 |
-
segment_texts = [
|
| 278 |
-
seg.strip()
|
| 279 |
-
for seg in generate_item(content, SEGMENT_Prompt).split("\n\n")
|
| 280 |
-
if seg.strip()
|
| 281 |
-
]
|
| 282 |
-
texts_vector = get_embedding([title, summary_text] + segment_texts)
|
| 283 |
-
audio_path = f"./resource/{key_name}.mp3"
|
| 284 |
-
audio_path = audio_text(summary_text, audio_path)
|
| 285 |
-
dict_item = {
|
| 286 |
-
key_name: {
|
| 287 |
-
"title": title,
|
| 288 |
-
"text": content,
|
| 289 |
-
"url": blog_url,
|
| 290 |
-
"style": style_text,
|
| 291 |
-
"summary": summary_text,
|
| 292 |
-
"audio": audio_path,
|
| 293 |
-
"segments": segment_texts,
|
| 294 |
-
"vector": texts_vector,
|
| 295 |
-
}
|
| 296 |
-
}
|
| 297 |
-
|
| 298 |
-
except Exception as e:
|
| 299 |
-
print(f"{blog_url} の処理中にエラーが発生しました: {e}")
|
| 300 |
-
raise
|
| 301 |
-
|
| 302 |
-
self.knowledge_data.update(dict_item)
|
| 303 |
-
time.sleep(1)
|
| 304 |
-
save_pkl(PKL_FILE, self.knowledge_data)
|
| 305 |
-
time.sleep(1)
|
| 306 |
-
self.knowledge_data = load_pkl(PKL_FILE)
|
| 307 |
-
self.reference_dict = self.get_reference_dict()
|
| 308 |
-
time.sleep(1)
|
| 309 |
-
|
| 310 |
-
print(f"PKLファイルの更新が完了しました。新規記事数: {len(new_blog_infos)}")
|
| 311 |
|
| 312 |
def find_top_info(self, question_vector):
|
| 313 |
results = []
|
|
@@ -351,7 +131,7 @@ class knowledge_class:
|
|
| 351 |
s_text=full_prompt,
|
| 352 |
)
|
| 353 |
answer_text = generate_item(
|
| 354 |
-
user_prompt, SYS_Prompt, model="gpt-
|
| 355 |
)
|
| 356 |
md_text = user_prompt + "\n 応答: \n" + answer_text
|
| 357 |
timestamp = int(time.time() * 1000)
|
|
@@ -414,7 +194,7 @@ class knowledge_class:
|
|
| 414 |
# "./resource/2025-03-03-機嫌良く働くと仕事は上手く進む.mp3",
|
| 415 |
# )
|
| 416 |
|
| 417 |
-
rewrite_question = generate_item(rw_prompt, REWRITE_SYS_Prompt, model="gpt-
|
| 418 |
print("rewrite_question:", rewrite_question)
|
| 419 |
# prompt = DEFAULT_TEMPLATE.format(chat_history=chat_history, question=query)
|
| 420 |
# get rewrite question
|
|
@@ -433,7 +213,7 @@ class knowledge_class:
|
|
| 433 |
answer_text = generate_item(
|
| 434 |
user_prompt,
|
| 435 |
SYS_Prompt,
|
| 436 |
-
model="gpt-
|
| 437 |
temperature=temperature,
|
| 438 |
)
|
| 439 |
md_text = user_prompt + "\n 応答: \n" + answer_text
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import time
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
from api import PKL_FILE, generate_item, get_embedding, load_pkl, save_pkl
|
| 6 |
+
from get_blog import get_new_blog_content, get_old_blog_content
|
| 7 |
from prompts import (
|
| 8 |
Common_text,
|
| 9 |
Creative_text,
|
|
|
|
| 12 |
QA_Prompt_template,
|
| 13 |
REWRITE_Prompt,
|
| 14 |
REWRITE_SYS_Prompt,
|
|
|
|
| 15 |
Short_text,
|
|
|
|
|
|
|
| 16 |
SYS_Prompt,
|
| 17 |
)
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
def max_cosine_similarity(v1, v2_list):
|
| 21 |
""" """
|
|
|
|
| 37 |
return np.max(similarities)
|
| 38 |
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
def save_feedback(value, liked):
|
| 41 |
if liked:
|
| 42 |
md_text = "text:\n" + value + "\n" + "liked"
|
|
|
|
| 51 |
class knowledge_class:
|
| 52 |
def __init__(self):
|
| 53 |
self.knowledge_data = load_pkl(PKL_FILE)
|
| 54 |
+
# print(self.knowledge_data)
|
| 55 |
self.reference_dict = self.get_reference_dict()
|
| 56 |
# q_v = self.knowledge_data["2024-10-09-プロジェクト計画で重要視すること"][
|
| 57 |
# "vector"
|
|
|
|
| 82 |
return reference_dict
|
| 83 |
|
| 84 |
def get_new_knowledge(self):
    """Pull new articles from both blog sources and refresh the in-memory state.

    Runs the old-blog crawler and then the new-blog crawler over the current
    knowledge data, writes the result to ``PKL_FILE``, reloads it, and rebuilds
    ``reference_dict``.
    """
    # Old source first, then the new one; each result is stored immediately.
    for fetch in (get_old_blog_content, get_new_blog_content):
        self.knowledge_data = fetch(self.knowledge_data)
    save_pkl(PKL_FILE, self.knowledge_data)
    self.knowledge_data = load_pkl(PKL_FILE)
    self.reference_dict = self.get_reference_dict()
    print("PKLファイルの更新が完了しました。")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
def find_top_info(self, question_vector):
|
| 93 |
results = []
|
|
|
|
| 131 |
s_text=full_prompt,
|
| 132 |
)
|
| 133 |
answer_text = generate_item(
|
| 134 |
+
user_prompt, SYS_Prompt, model="gpt-4.1", temperature=temperature
|
| 135 |
)
|
| 136 |
md_text = user_prompt + "\n 応答: \n" + answer_text
|
| 137 |
timestamp = int(time.time() * 1000)
|
|
|
|
| 194 |
# "./resource/2025-03-03-機嫌良く働くと仕事は上手く進む.mp3",
|
| 195 |
# )
|
| 196 |
|
| 197 |
+
rewrite_question = generate_item(rw_prompt, REWRITE_SYS_Prompt, model="gpt-4.1")
|
| 198 |
print("rewrite_question:", rewrite_question)
|
| 199 |
# prompt = DEFAULT_TEMPLATE.format(chat_history=chat_history, question=query)
|
| 200 |
# get rewrite question
|
|
|
|
| 213 |
answer_text = generate_item(
|
| 214 |
user_prompt,
|
| 215 |
SYS_Prompt,
|
| 216 |
+
model="gpt-4.1",
|
| 217 |
temperature=temperature,
|
| 218 |
)
|
| 219 |
md_text = user_prompt + "\n 応答: \n" + answer_text
|
design.py
CHANGED
|
@@ -72,7 +72,7 @@ title_html = """
|
|
| 72 |
<div style="display: flex; align-items: center; justify-content: center;">
|
| 73 |
<img src="https://cdn.profile-image.st-hatena.com/users/saratoga623/profile.png?1728512391" style="width:50px; height:50px; margin-right:10px;">
|
| 74 |
<h1 style="margin: 0;">
|
| 75 |
-
<a href="https://
|
| 76 |
プロジェクトマネジメント勉強会 <i class='fa fa-lightbulb-o'></i>
|
| 77 |
</a>
|
| 78 |
</h1>
|
|
|
|
| 72 |
<div style="display: flex; align-items: center; justify-content: center;">
|
| 73 |
<img src="https://cdn.profile-image.st-hatena.com/users/saratoga623/profile.png?1728512391" style="width:50px; height:50px; margin-right:10px;">
|
| 74 |
<h1 style="margin: 0;">
|
| 75 |
+
<a href="https://note.com/saratoga623" target="_blank" style="text-decoration: none; color: inherit;">
|
| 76 |
プロジェクトマネジメント勉強会 <i class='fa fa-lightbulb-o'></i>
|
| 77 |
</a>
|
| 78 |
</h1>
|
get_blog.py
ADDED
|
@@ -0,0 +1,289 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import time
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
from urllib.parse import urljoin
|
| 5 |
+
|
| 6 |
+
import requests
|
| 7 |
+
from bs4 import BeautifulSoup
|
| 8 |
+
|
| 9 |
+
from api import PKL_FILE, audio_text, generate_item, get_embedding, load_pkl, save_pkl
|
| 10 |
+
from prompts import SEGMENT_Prompt, STYLE_Prompt, SUMMARY_Prompt
|
| 11 |
+
|
| 12 |
+
OLD_BASE_URL = "https://saratoga623.hatenablog.com/"
|
| 13 |
+
NEW_API_URL = "https://note.com/api/v2/creators/saratoga623/contents?kind=note&page="
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_page_content(page_url):
    """Download ``page_url`` and return its body as text.

    The text encoding is taken from ``apparent_encoding`` rather than from the
    response headers.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    response = requests.get(page_url, timeout=10)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def parse_homepage(html):
    """Extract article links and publication dates from a blog index page.

    Args:
        html: HTML text of an index page.

    Returns:
        A ``(blog_infos, soup)`` tuple. ``blog_infos`` is a list of
        ``{"date": ..., "link": ...}`` dicts, one per ``<article class="entry">``
        that has a linked title. ``date`` is ``"<year>-<month>-<day>"`` built
        from the date spans, or ``"unknown_date"`` when they are missing.
        ``soup`` is the parsed document.
    """
    soup = BeautifulSoup(html, "html.parser")
    blog_infos = []
    for article in soup.find_all("article", class_="entry"):
        title_tag = article.find("h1", class_="entry-title")
        anchor = title_tag.find("a") if title_tag else None
        href = anchor.get("href") if anchor else None
        if not href:
            # No usable title link (including an <a> without href): skip the entry
            # instead of raising KeyError and aborting the whole crawl.
            continue
        link = urljoin(OLD_BASE_URL, href)

        # The publication date is read from the <span> children of the <time> tag.
        pub_date = "unknown_date"
        time_tag = article.find("time")
        if time_tag:
            parts = [
                time_tag.find("span", class_=f"date-{unit}")
                for unit in ("year", "month", "day")
            ]
            if all(parts):
                pub_date = "-".join(part.get_text(strip=True) for part in parts)
        blog_infos.append({"date": pub_date, "link": link})
    return blog_infos, soup
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def get_next_page_url(soup):
    """Return the absolute URL behind the "次のページ" link, or ``None`` when there is none."""
    pager_link = soup.find("a", string="次のページ")
    if not pager_link or not pager_link.has_attr("href"):
        return None
    return urljoin(OLD_BASE_URL, pager_link["href"])
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def sort_key(info):
    """Return the ``datetime`` parsed from ``info["date"]`` (``%Y-%m-%d``).

    Anything that cannot be parsed yields ``datetime.max`` so that such entries
    sort after every dated one.
    """
    fallback = datetime.max
    try:
        parsed = datetime.strptime(info["date"], "%Y-%m-%d")
    except Exception:
        return fallback
    return parsed
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def fetch_blog_text(url):
    """Fetch one article page and return ``(title, content)``.

    ``title`` falls back to ``"no_title"`` and ``content`` to ``""`` when the
    corresponding element is not found in the page.

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status.
    """
    # Bound the wait and reject HTTP error pages, which would otherwise be
    # parsed as an article with no title and no content.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, "html.parser")

    title_tag = soup.find("h1", class_="entry-title")
    title = title_tag.get_text(strip=True) if title_tag else "no_title"

    # Primary container first, then the alternative class name.
    content_tag = soup.find("div", class_="entry-content") or soup.find(
        "div", class_="hatenablog-entry"
    )
    if not content_tag:
        return title, ""

    # Drop the <a class="keyword"> wrappers but keep the text they enclose.
    for a in content_tag.find_all("a", class_="keyword"):
        a.unwrap()
    return title, content_tag.get_text(strip=False).strip()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def sanitize_filename(filename):
    """Return ``filename`` with the characters ``\\ / * ? : " < > |`` removed."""
    forbidden = '\\/*?:"<>|'
    return "".join(ch for ch in filename if ch not in forbidden)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def get_article_content(url):
    """Fetch an article page and return the text of its body container.

    Returns:
        The body text with text nodes joined by newlines, or ``None`` when the
        request fails or no body container is found. Both failures are printed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"{url} error: {e}")
        return None

    soup = BeautifulSoup(response.text, "lxml")

    # Two known body containers are tried in order.
    content_div = soup.find(
        "div", {"data-name": "body", "class": "note-common-styles__textnote-body"}
    ) or soup.find("div", class_="o-noteContent__body")

    if not content_div:
        # Needs the f-prefix; without it the literal text "{url}" was printed.
        print(f"{url} can not find content")
        return None
    return content_div.get_text(separator="\n", strip=True)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def get_old_blog_content(knowledge_data):
    """Crawl the old blog at ``OLD_BASE_URL`` and add articles not yet stored.

    Index pages are walked through the next-page link. Every article whose URL
    is not already recorded in ``knowledge_data`` is fetched, summarized,
    style-analyzed, segmented, embedded and voiced, then stored under the key
    ``"<date>-<sanitized title>"``. The pickle file is rewritten after each
    article so progress survives an interruption.

    Args:
        knowledge_data: Mapping of entry key -> entry dict; the ``"url"`` field
            of each entry is used to detect already-processed articles.

    Returns:
        The updated knowledge mapping (as reloaded from ``PKL_FILE`` when at
        least one article was added).

    Raises:
        Exception: any error while processing an article is printed and
            re-raised.
    """
    print(f"[{datetime.now()}]: {OLD_BASE_URL}中に、記事の検出を開始します...")
    processed_urls = {v.get("url", "") for v in knowledge_data.values()}

    # Phase 1: collect unseen article links from every index page.
    new_blog_infos = []
    page_url = OLD_BASE_URL
    while page_url:
        try:
            html = get_page_content(page_url)
        except Exception as e:
            print(f"{page_url} の取得時にエラーが発生しました: {e}")
            break
        infos, soup = parse_homepage(html)
        new_blog_infos.extend(
            info for info in infos if info["link"] not in processed_urls
        )
        page_url = get_next_page_url(soup)
        time.sleep(1)  # pause between index-page requests

    if not new_blog_infos:
        print(f"{OLD_BASE_URL}中に、新しい記事は見つかりませんでした。")
        return knowledge_data

    print(
        f"{OLD_BASE_URL}中に、新規記事 {len(new_blog_infos)} 件を検出しました。処理を開始します..."
    )
    # Phase 2: process oldest first; undated entries go last (see sort_key).
    new_blog_infos.sort(key=sort_key)
    for info in new_blog_infos:
        blog_url = info["link"]
        pub_date = info["date"]
        print(f"記事を処理中: {blog_url}")
        try:
            title, content = fetch_blog_text(blog_url)
            if not content:
                # Nothing to summarize or voice; skip instead of storing an
                # empty entry, as get_new_blog_content does for empty articles.
                print(f"{blog_url} can not find content")
                continue
            key_name = f"{pub_date}-{sanitize_filename(title)}"
            print("記事:", key_name)
            summary_text = generate_item(content, SUMMARY_Prompt)
            style_text = generate_item(content, STYLE_Prompt)
            segment_texts = [
                seg.strip()
                for seg in generate_item(content, SEGMENT_Prompt).split("\n\n")
                if seg.strip()
            ]
            # One embedding each for the title, the summary and every segment.
            texts_vector = get_embedding([title, summary_text] + segment_texts)
            audio_path = f"./resource/{key_name}.mp3"
            audio_path = audio_text(summary_text, audio_path)
            entry = {
                "title": title,
                "text": content,
                "url": blog_url,
                "style": style_text,
                "summary": summary_text,
                "audio": audio_path,
                "segments": segment_texts,
                "vector": texts_vector,
            }
        except Exception as e:
            print(f"{blog_url} の処理中にエラーが発生しました: {e}")
            raise

        knowledge_data[key_name] = entry
        time.sleep(1)
        # Checkpoint after every article, then continue from the saved state.
        save_pkl(PKL_FILE, knowledge_data)
        time.sleep(1)
        knowledge_data = load_pkl(PKL_FILE)
        time.sleep(1)
    print(
        f"{OLD_BASE_URL}中に、新規記事の更新が完了しました。記事数: {len(new_blog_infos)}"
    )
    return knowledge_data
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def get_new_blog_content(knowledge_data):
    """Collect articles from the paginated ``NEW_API_URL`` listing and add unseen ones.

    Listing pages are requested with ``page=1, 2, ...`` until a page reports
    ``isLastPage`` or has no contents. Every listed article whose title is not
    already recorded in ``knowledge_data`` is fetched, summarized,
    style-analyzed, segmented, embedded and voiced, then stored under
    ``"<first 10 chars of publishAt>-<sanitized title>"``. The pickle file is
    rewritten after each article so progress survives an interruption.

    Args:
        knowledge_data: Mapping of entry key -> entry dict; the ``"title"``
            field of each entry is used to detect already-processed articles.

    Returns:
        The updated knowledge mapping (as reloaded from ``PKL_FILE`` when at
        least one article was added).

    Raises:
        Exception: any error while processing an article is printed and
            re-raised.
    """
    print(f"[{datetime.now()}]: {NEW_API_URL}中に、記事の検出を開始します...")
    processed_title = {v.get("title", "") for v in knowledge_data.values()}

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Phase 1: walk the listing pages and collect unseen articles.
    all_articles = []
    page = 1
    while True:
        api_url = f"{NEW_API_URL}{page}"
        try:
            response = requests.get(api_url, headers=headers, timeout=10)
            response.raise_for_status()
            # Decoded inside the try so a non-JSON body is reported like any
            # other fetch failure instead of escaping as an unhandled error.
            json_data = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"{api_url} の取得時にエラーが発生しました: {e}")
            break

        data = json_data.get("data", {})
        notes = data.get("contents", [])
        if not notes:
            break

        for item in notes:
            if not isinstance(item, dict):
                continue
            note_url = item.get("noteUrl")
            title = item.get("name")
            publish_at = item.get("publishAt")
            if title in processed_title:
                continue
            if note_url and title:
                print(f"Note URL: {note_url}, Title: {title}, Time: {publish_at}")
                all_articles.append(
                    {
                        "title": title,
                        "url": note_url,
                        "timestamp": publish_at,  # keep the publish timestamp
                    }
                )

        # Test the flag only AFTER collecting this page: the last page still
        # carries articles, and breaking earlier would never ingest them.
        if data.get("isLastPage", True):
            break
        page += 1
        time.sleep(1)  # pause between listing requests

    if not all_articles:
        print(f"{NEW_API_URL}中に、新しい記事は見つかりませんでした。")
        return knowledge_data

    print(
        f"{NEW_API_URL}中に、新規記事 {len(all_articles)} 件を検出しました。処理を開始します..."
    )
    # Phase 2: build and store one knowledge entry per collected article.
    for article in all_articles:
        title = article["title"]
        timestamp = article["timestamp"]
        url = article["url"]
        print(f"記事を処理中: {url}")
        try:
            content = get_article_content(url)
            if not content:
                # Fetch failed or the body was empty; the helper already printed why.
                continue
            # The first 10 characters of the timestamp are used as the date prefix.
            key_name = f"{timestamp[:10]}-{sanitize_filename(title)}"
            print("記事:", key_name)
            summary_text = generate_item(content, SUMMARY_Prompt)
            style_text = generate_item(content, STYLE_Prompt)
            segment_texts = [
                seg.strip()
                for seg in generate_item(content, SEGMENT_Prompt).split("\n\n")
                if seg.strip()
            ]
            # One embedding each for the title, the summary and every segment.
            texts_vector = get_embedding([title, summary_text] + segment_texts)
            audio_path = f"./resource/{key_name}.mp3"
            audio_path = audio_text(summary_text, audio_path)
            entry = {
                "title": title,
                "text": content,
                "url": url,
                "style": style_text,
                "summary": summary_text,
                "audio": audio_path,
                "segments": segment_texts,
                "vector": texts_vector,
            }
        except Exception as e:
            print(f"{url} の処理中にエラーが発生しました: {e}")
            raise

        knowledge_data[key_name] = entry
        time.sleep(1)
        # Checkpoint after every article, then continue from the saved state.
        save_pkl(PKL_FILE, knowledge_data)
        time.sleep(1)
        knowledge_data = load_pkl(PKL_FILE)
        time.sleep(1)
    print(
        f"{NEW_API_URL}中に、新規記事の更新が完了しました。記事数: {len(all_articles)}"
    )
    return knowledge_data
|
prompts.py
CHANGED
|
@@ -1,38 +1,41 @@
|
|
| 1 |
SYS_Prompt = """
|
| 2 |
-
|
| 3 |
"""
|
| 4 |
|
| 5 |
SUMMARY_Prompt = """
|
| 6 |
-
|
| 7 |
-
以下の文章の要約を作成してください。要約は100字以内で簡潔にまとめてください。
|
| 8 |
"""
|
| 9 |
|
| 10 |
STYLE_Prompt = """
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
"""
|
|
|
|
| 16 |
SEGMENT_Prompt = """
|
| 17 |
-
|
| 18 |
-
以下の文章を内容に基づいて意味的に段落に分割してください。各段落は独立した文章で、段落ごとに改行を入れてください。
|
| 19 |
"""
|
| 20 |
|
| 21 |
QA_Prompt_template = """
|
| 22 |
-
|
| 23 |
-
検索された情報の内容、文脈、コミュニケーションスタイルに適した応答を提供してください。
|
| 24 |
|
| 25 |
-
|
| 26 |
{q_text}
|
| 27 |
-
|
|
|
|
| 28 |
{r_text}
|
| 29 |
-
|
|
|
|
| 30 |
- {c_text}
|
| 31 |
-
-
|
| 32 |
-
-
|
| 33 |
- {s_text}
|
| 34 |
-
-
|
|
|
|
| 35 |
"""
|
|
|
|
|
|
|
| 36 |
REWRITE_SYS_Prompt = """
|
| 37 |
Given a conversation (between Human and Assistant) and a follow up message from Human, \
|
| 38 |
rewrite the message to be a standalone question that captures all relevant context \
|
|
@@ -48,28 +51,32 @@ REWRITE_Prompt = """
|
|
| 48 |
<Standalone question>
|
| 49 |
"""
|
| 50 |
QA_chat_Prompt_template = """
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
【ユーザーの会話履歴】
|
| 54 |
{h_text}
|
| 55 |
-
|
| 56 |
{q_text}
|
| 57 |
-
|
| 58 |
{r_text}
|
| 59 |
-
|
| 60 |
- {c_text}
|
| 61 |
-
-
|
| 62 |
-
-
|
| 63 |
- {s_text}
|
| 64 |
-
-
|
|
|
|
| 65 |
"""
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
"""
|
| 70 |
-
|
|
|
|
|
|
|
| 71 |
"""
|
| 72 |
-
Short_text = """
|
|
|
|
| 73 |
"""
|
| 74 |
-
Full_text = """
|
|
|
|
| 75 |
"""
|
|
|
|
| 1 |
SYS_Prompt = """
|
| 2 |
+
You are a specialist in linguistic analysis, skilled at identifying, emulating, and adapting diverse communication styles and patterns of thought. Always respond in Japanese.
|
| 3 |
"""
|
| 4 |
|
| 5 |
SUMMARY_Prompt = """
|
| 6 |
+
You are an expert in text analysis. Summarize the following passage in Japanese, using no more than 100 characters.
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
STYLE_Prompt = """
|
| 10 |
+
You are an expert in text analysis. For the text below, identify:
|
| 11 |
+
1. Key features of the author’s writing style (e.g., honorific usage, tone, sentence structure, terminology).
|
| 12 |
+
2. The author’s patterns of reasoning (e.g., logical organization, inference methods, attitude).
|
| 13 |
+
Present your analysis in Japanese.
|
| 14 |
"""
|
| 15 |
+
|
| 16 |
SEGMENT_Prompt = """
|
| 17 |
+
You are an expert in text analysis. Divide the following text into semantically coherent paragraphs—each representing a distinct idea. Separate paragraphs with a blank line. Provide the output in Japanese.
|
|
|
|
| 18 |
"""
|
| 19 |
|
| 20 |
QA_Prompt_template = """
|
| 21 |
+
Fine-tune your wording to match the user’s question precisely, creating a natural and seamless conversational flow. Provide answers that are appropriate to the content, context, and communication style of the retrieved information.
|
|
|
|
| 22 |
|
| 23 |
+
User Question:
|
| 24 |
{q_text}
|
| 25 |
+
|
| 26 |
+
Retrieved Information:
|
| 27 |
{r_text}
|
| 28 |
+
|
| 29 |
+
Task Requirements:
|
| 30 |
- {c_text}
|
| 31 |
+
- Emulate the original author’s writing style.
|
| 32 |
+
- Emulate the original author’s thought process.
|
| 33 |
- {s_text}
|
| 34 |
+
- Respond in a dialogue format rather than as a narrative.
|
| 35 |
+
- Preserve the output format: reply only with the response text in Japanese, without any labels such as “ユーザー:” or “AI:”.
|
| 36 |
"""
|
| 37 |
+
|
| 38 |
+
|
| 39 |
REWRITE_SYS_Prompt = """
|
| 40 |
Given a conversation (between Human and Assistant) and a follow up message from Human, \
|
| 41 |
rewrite the message to be a standalone question that captures all relevant context \
|
|
|
|
| 51 |
<Standalone question>
|
| 52 |
"""
|
| 53 |
QA_chat_Prompt_template = """
|
| 54 |
+
Based on the user’s latest query, dynamically adjust your phrasing to ensure natural, seamless conversation flow. Tailor your response to the content, context, and communication style of the retrieved information.
|
| 55 |
+
<Conversation History>
|
|
|
|
| 56 |
{h_text}
|
| 57 |
+
<User’s Latest Question>
|
| 58 |
{q_text}
|
| 59 |
+
<Retrieved Information>
|
| 60 |
{r_text}
|
| 61 |
+
<Task Requirements>
|
| 62 |
- {c_text}
|
| 63 |
+
- Imitate the original author’s writing style.
|
| 64 |
+
- Mirror the original author’s train of thought.
|
| 65 |
- {s_text}
|
| 66 |
+
- Respond purely in dialogue form—do not include labels such as “User:” or “AI:”; return only the reply.
|
| 67 |
+
Ensure your answer itself is written in Japanese.
|
| 68 |
"""
|
| 69 |
+
|
| 70 |
+
Creative_text = """
|
| 71 |
+
Use the retrieved information as a reference for your conversational reply. Feel free to go beyond the source material by incorporating original ideas or perspectives where appropriate. Emphasize creativity to enrich the dialogue.
|
| 72 |
"""
|
| 73 |
+
|
| 74 |
+
Common_text = """
|
| 75 |
+
Use the retrieved information as a reference and answer in dialogue form.
|
| 76 |
"""
|
| 77 |
+
Short_text = """
|
| 78 |
+
Keep your response concise. Aim for 3–5 sentences that capture the key points.
|
| 79 |
"""
|
| 80 |
+
Full_text = """
|
| 81 |
+
Provide a comprehensive and thorough answer without gaps or ambiguity. Include any necessary background or assumptions, structure your response into clear thematic paragraphs, and ensure logical progression. Anticipate and address potential follow-up questions so the reader feels fully informed.
|
| 82 |
"""
|
requirements.txt
CHANGED
|
@@ -3,4 +3,5 @@ openai
|
|
| 3 |
# zyphra
|
| 4 |
tenacity
|
| 5 |
beautifulsoup4
|
| 6 |
-
fish-audio-sdk
|
|
|
|
|
|
| 3 |
# zyphra
|
| 4 |
tenacity
|
| 5 |
beautifulsoup4
|
| 6 |
+
fish-audio-sdk
|
| 7 |
+
lxml
|
resource/2025-04-08-変化を阻害するもの.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff86e41a2c419f8286f7a48736e4142d3bf5c0819ae834a1fe650e12a06b709a
|
| 3 |
+
size 221100
|
resource/2025-04-09-目的を意識するのはスキルです.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2605e3afaa416572dc75364dc79e1c7c68cc86d5a8b44625cecd0ba19aac49bc
|
| 3 |
+
size 258298
|
resource/2025-04-10-文章をダイエットさせる.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:586e275ea22a3a9eb4964945e94cb6ead76fbe5e1062d7354bf0e558ccf2c9c3
|
| 3 |
+
size 190171
|
resource/2025-04-11-仕事は決めてなんぼ.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9948713ab28621a872e0b84dc1d4b33dd547a501cf3477ceeff215adf22d5973
|
| 3 |
+
size 242834
|
resource/2025-04-12-精神論も大事です.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aa8040cbf2b3c9ae7b482cb333e11cf3ef00974f64e073e35732f5ae6d980332
|
| 3 |
+
size 247431
|
resource/2025-04-14-ヒューマンエラーはゼロになりません.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:569d89b9692bf1d2f3ad1b67d8afb3a996149fee1541e1066655db10d1b9e048
|
| 3 |
+
size 224444
|
resource/2025-04-15-良い行動で良い結果を出す.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a476984f8f79892a7bc5cbd95a6bae7bf579a14b514d538cea108ed86b1a449
|
| 3 |
+
size 266657
|
resource/2025-04-16-新幹線の運休と障害訓練.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dcd9956b30632bdcc5590309d1bd50944ca53fc841439f2d6226addd08fb40ca
|
| 3 |
+
size 183484
|
resource/2025-04-17-バッファと保険の違い.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eeb419576c21edaf02ebdd7bc58ff8dccd90f3e7ca808dbcb1cf431f328069d3
|
| 3 |
+
size 228623
|
resource/2025-04-18-無理難題に対する対処.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a369971d3b6b626adbf37bdfcf836164266b6f48b05fb2b4926552100578bb2
|
| 3 |
+
size 274599
|
resource/2025-04-19-2位じゃダメなんでしょうか?.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34765042bdef737a0d7274525ab70f70515cc7da0c20a57b2d932136de4a97b1
|
| 3 |
+
size 224862
|
resource/2025-04-21-例え話でピンとくる説明をAIで実現.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2de3a8ba7824c9bfe1a6c931b64286593d6fa740d94541e7f35b73645925f185
|
| 3 |
+
size 239908
|
resource/2025-04-22-人を変えるのではなく、関わり方を変える.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3acf7f665821a6b09634451971ad2be0f15e4ac235a9bb2d7464b0ac9673a47d
|
| 3 |
+
size 191425
|
resource/2025-04-23-ひとつの経験から多くを学ぶ.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c58f535b88c8f9c9a48e206f4cc0c4a4f8b8c159536da741fbe8dba1c995b3e
|
| 3 |
+
size 223608
|
resource/2025-04-24-失敗の目的.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:411825a6b0eb816f6b43ae7363d6c270a53b92acd57e24ddf307fd0cd215413a
|
| 3 |
+
size 213159
|
resource/2025-04-25-計画書ではふわっとした表現は禁止です.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d5bd87b1653ed1ef852e9e016ba04df33a7e6dce7a5c96a551ffaea35488c8c4
|
| 3 |
+
size 271255
|
resource/2025-04-28-シンプルな判断・決断が出来るように.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46ab982dafc173cdd9ca1e2e5b8a015921eeea701e2967309b7842da656c5fdc
|
| 3 |
+
size 161750
|
resource/2025-04-28-使える情報と使えない情報.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6d0a0bd0c9e40dadf4ed2902fe85c91961246fe6faaf9fb815049a98271bca86
|
| 3 |
+
size 215249
|
resource/2025-04-30-資料は紙芝居のように。ストーリーを作る重要性。.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d74ff1cfc9a013ad4149f0575001b73d8ac4a47ccdc853e7e3a21ac37da4e02
|
| 3 |
+
size 264986
|
resource/2025-05-01-マネジメントが難しい理由.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22926c0aff32767900e6cb2678542bcb68478205ce8ed22afe45d20162dccaae
|
| 3 |
+
size 199784
|
resource/2025-05-02-情報を使える情報にする方法.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5b10762ddcc2b4af7133dad1eaeceb2ef0312d07056658c537bc771f5c1afd59
|
| 3 |
+
size 230295
|
resource/2025-05-07-お知らせとお詫び.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b1f96bbf44c58e4d092b9df39ad8000a150ae0c7dccff1db593a9fb83dfdc52d
|
| 3 |
+
size 204800
|
resource/2025-05-07-深く考えることは脳にとって重労働。でも大事なこと。.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:63c992a6f7c411a40c7fa079b7b14c64e65aa0c598e8be1def671482a93c141a
|
| 3 |
+
size 243670
|
resource/2025-05-08-脱!上司と部下の板挟み.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9f79f2e79020bb052bf6cfe445502341435cd51726fa0eac76e378c8ed9a61be
|
| 3 |
+
size 241998
|
resource/2025-05-09-ゴールから逆引きする発想.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30fe1e280281396c6f2ec4fbf6e8b8814036d4248f94ac18c753859308e6b5a0
|
| 3 |
+
size 280032
|
resource/2025-05-12-伝わらない前提で考える.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2cd3e6df79252edd81d5e42465e4ab749c941da8bfea733a3a3e8ba6e3e648c4
|
| 3 |
+
size 231549
|
resource/2025-05-13-イライラする時は自分の強みを見つけるチャンス.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6cabb8e05785f213d944aff385bea13e6158a5ef73b54c97d309c070b534da67
|
| 3 |
+
size 209397
|
resource/knowledge_data.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7a6b00127c041be6e19f00b16d16dd5236ed5d03ec65273cc5f6b13c40f013b3
|
| 3 |
+
size 21481467
|