Spaces:

yeelou
/

FAQ-workshop

Sleeping

App Files Files Community

yeelou committed on May 13, 2025

Commit

53d54bd

1 Parent(s): f64b1f0

add new blogs

Browse files

Files changed (36) hide show

.gitignore +4 -1
api.py +88 -0
app.py +2 -1
blog_class.py +12 -232
design.py +1 -1
get_blog.py +289 -0
prompts.py +39 -32
requirements.txt +2 -1
resource/2025-04-08-変化を阻害するもの.mp3 +3 -0
resource/2025-04-09-目的を意識するのはスキルです.mp3 +3 -0
resource/2025-04-10-文章をダイエットさせる.mp3 +3 -0
resource/2025-04-11-仕事は決めてなんぼ.mp3 +3 -0
resource/2025-04-12-精神論も大事です.mp3 +3 -0
resource/2025-04-14-ヒューマンエラーはゼロになりません.mp3 +3 -0
resource/2025-04-15-良い行動で良い結果を出す.mp3 +3 -0
resource/2025-04-16-新幹線の運休と障害訓練.mp3 +3 -0
resource/2025-04-17-バッファと保険の違い.mp3 +3 -0
resource/2025-04-18-無理難題に対する対処.mp3 +3 -0
resource/2025-04-19-2位じゃダメなんでしょうか？.mp3 +3 -0
resource/2025-04-21-例え話でピンとくる説明をAIで実現.mp3 +3 -0
resource/2025-04-22-人を変えるのではなく、関わり方を変える.mp3 +3 -0
resource/2025-04-23-ひとつの経験から多くを学ぶ.mp3 +3 -0
resource/2025-04-24-失敗の目的.mp3 +3 -0
resource/2025-04-25-計画書ではふわっとした表現は禁止です.mp3 +3 -0
resource/2025-04-28-シンプルな判断・決断が出来るように.mp3 +3 -0
resource/2025-04-28-使える情報と使えない情報.mp3 +3 -0
resource/2025-04-30-資料は紙芝居のように。ストーリーを作る重要性。.mp3 +3 -0
resource/2025-05-01-マネジメントが難しい理由.mp3 +3 -0
resource/2025-05-02-情報を使える情報にする方法.mp3 +3 -0
resource/2025-05-07-お知らせとお詫び.mp3 +3 -0
resource/2025-05-07-深く考えることは脳にとって重労働。でも大事なこと。.mp3 +3 -0
resource/2025-05-08-脱！上司と部下の板挟み.mp3 +3 -0
resource/2025-05-09-ゴールから逆引きする発想.mp3 +3 -0
resource/2025-05-12-伝わらない前提で考える.mp3 +3 -0
resource/2025-05-13-イライラする時は自分の強みを見つけるチャンス.mp3 +3 -0
resource/knowledge_data.pkl +2 -2

.gitignore CHANGED Viewed

@@ -2,4 +2,7 @@ __pycache__
 old
 .env
 resource/QA*
-resource.zip

 old
 .env
 resource/QA*
+resource.zip
+faq
+.gradio
+*.old

api.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import os
+import pickle
+from datetime import datetime
+from pathlib import Path
+from fish_audio_sdk import Session, TTSRequest
+from openai import OpenAI
+from tenacity import retry, stop_after_attempt, wait_exponential
+INFO_audio_ID = os.getenv("INFO_audio_ID")
+client = OpenAI(api_key=os.getenv("gpt"))
+#
+audio_client = Session(os.getenv("audio"))
+PKL_FILE = "./resource/knowledge_data.pkl"
+def load_pkl(file_path):
+    if os.path.exists(file_path):
+        with open(file_path, "rb") as f:
+            return pickle.load(f)
+    else:
+        return {}
+def save_pkl(file_name, data):
+    sorted_data = dict(
+        sorted(
+            data.items(),
+            key=lambda x: datetime.strptime("-".join(x[0].split("-")[0:3]), "%Y-%m-%d"),
+        )
+    )
+    with open(file_name, "wb") as f:
+        pickle.dump(sorted_data, f)
+try:
+    from dotenv import load_dotenv
+    env_path = Path(os.path.dirname(__file__)) / ".env"
+    if env_path.exists():
+        load_dotenv(env_path)
+except ImportError:
+    pass
+# params
+@retry(
+    stop=stop_after_attempt(5),
+    wait=wait_exponential(multiplier=1, min=2, max=10),
+)
+def generate_item(prompt, sys_prompt, model="gpt-4.1-mini", temperature=1.0):
+    # print("prompt=", prompt)
+    # print("temperature=", temperature)
+    response = client.chat.completions.create(
+        model=model,
+        temperature=temperature,
+        messages=[
+            {
+                "role": "system",
+                "content": sys_prompt,
+            },
+            {"role": "user", "content": prompt},
+        ],
+    )
+    return response.choices[0].message.content.strip()
+@retry(
+    stop=stop_after_attempt(5),
+    wait=wait_exponential(multiplier=1, min=2, max=10),
+)
+def get_embedding(text):
+    response = client.embeddings.create(input=text, model="text-embedding-3-small")
+    return [emb.embedding for emb in response.data]
+@retry(
+    stop=stop_after_attempt(5),
+    wait=wait_exponential(multiplier=1, min=2, max=10),
+)
+def audio_text(text, audio_path):
+    with open(audio_path, "wb") as f:
+        for chunk in audio_client.tts(
+            TTSRequest(reference_id=INFO_audio_ID, text=text)
+        ):
+            f.write(chunk)
+    return audio_path

app.py CHANGED Viewed

@@ -6,7 +6,8 @@ import time
 import gradio as gr
-from blog_class import audio_text, knowledge_class, save_feedback
 from design import custom_css, custom_head, js_func, params_text, title_html, title_text
 knowledge_cls = knowledge_class()

 import gradio as gr
+from api import audio_text
+from blog_class import knowledge_class, save_feedback
 from design import custom_css, custom_head, js_func, params_text, title_html, title_text
 knowledge_cls = knowledge_class()

blog_class.py CHANGED Viewed

@@ -1,18 +1,9 @@
-import os
-import pickle
-import re
 import time
-from datetime import datetime
-from pathlib import Path
-from urllib.parse import urljoin
 import numpy as np
-import requests
-from bs4 import BeautifulSoup
-from fish_audio_sdk import Session, TTSRequest
-from openai import OpenAI
-from tenacity import retry, stop_after_attempt, wait_exponential
 from prompts import (
     Common_text,
     Creative_text,
@@ -21,116 +12,10 @@ from prompts import (
     QA_Prompt_template,
     REWRITE_Prompt,
     REWRITE_SYS_Prompt,
-    SEGMENT_Prompt,
     Short_text,
-    STYLE_Prompt,
-    SUMMARY_Prompt,
     SYS_Prompt,
 )
-try:
-    from dotenv import load_dotenv
-    env_path = Path(os.path.dirname(__file__)) / ".env"
-    if env_path.exists():
-        load_dotenv(env_path)
-except ImportError:
-    pass
-# params
-PKL_FILE = "./resource/knowledge_data.pkl"
-INFO_audio_ID = os.getenv("INFO_audio_ID")
-BASE_URL = "https://saratoga623.hatenablog.com/"
-client = OpenAI(api_key=os.getenv("gpt"))
-#
-audio_client = Session(os.getenv("audio"))
-def load_pkl(file_path):
-    if os.path.exists(file_path):
-        with open(file_path, "rb") as f:
-            return pickle.load(f)
-    else:
-        return {}
-def save_pkl(file_name, data):
-    with open(file_name, "wb") as f:
-        pickle.dump(data, f)
-def get_page_content(page_url):
-    response = requests.get(page_url)
-    response.encoding = response.apparent_encoding
-    return response.text
-def parse_homepage(html):
-    soup = BeautifulSoup(html, "html.parser")
-    articles = soup.find_all("article", class_="entry")
-    blog_infos = []
-    for article in articles:
-        title_tag = article.find("h1", class_="entry-title")
-        if title_tag and title_tag.find("a"):
-            link = urljoin(BASE_URL, title_tag.find("a")["href"])
-        else:
-            continue
-        # <time> タグ内から公開日を取得する
-        time_tag = article.find("time")
-        if time_tag:
-            year_tag = time_tag.find("span", class_="date-year")
-            month_tag = time_tag.find("span", class_="date-month")
-            day_tag = time_tag.find("span", class_="date-day")
-            if year_tag and month_tag and day_tag:
-                pub_date = f"{year_tag.get_text(strip=True)}-{month_tag.get_text(strip=True)}-{day_tag.get_text(strip=True)}"
-            else:
-                pub_date = "unknown_date"
-        else:
-            pub_date = "unknown_date"
-        blog_infos.append({"date": pub_date, "link": link})
-    return blog_infos, soup
-def get_next_page_url(soup):
-    next_page_tag = soup.find("a", string="次のページ")
-    if next_page_tag and next_page_tag.has_attr("href"):
-        return urljoin(BASE_URL, next_page_tag["href"])
-    return None
-def sort_key(info):
-    try:
-        return datetime.strptime(info["date"], "%Y-%m-%d")
-    except Exception:
-        return datetime.max
-def fetch_blog_text(url):
-    response = requests.get(url)
-    response.encoding = response.apparent_encoding
-    soup = BeautifulSoup(response.text, "html.parser")
-    title_tag = soup.find("h1", class_="entry-title")
-    title = title_tag.get_text(strip=True) if title_tag else "no_title"
-    content_tag = soup.find("div", class_="entry-content")
-    if not content_tag:
-        content_tag = soup.find("div", class_="hatenablog-entry")
-    if content_tag:
-        for a in content_tag.find_all("a", class_="keyword"):
-            a.unwrap()
-        content = content_tag.get_text(strip=False).strip()
-    else:
-        content = ""
-    return title, content
-def sanitize_filename(filename):
-    return re.sub(r'[\\/*?:"<>|]', "", filename)
 def max_cosine_similarity(v1, v2_list):
     """ """
@@ -152,49 +37,6 @@ def max_cosine_similarity(v1, v2_list):
     return np.max(similarities)
-@retry(
-    stop=stop_after_attempt(5),
-    wait=wait_exponential(multiplier=1, min=2, max=10),
-)
-def generate_item(prompt, sys_prompt, model="gpt-4o-mini", temperature=1.0):
-    print("prompt=", prompt)
-    print("temperature=", temperature)
-    response = client.chat.completions.create(
-        model=model,
-        temperature=temperature,
-        messages=[
-            {
-                "role": "system",
-                "content": sys_prompt,
-            },
-            {"role": "user", "content": prompt},
-        ],
-    )
-    return response.choices[0].message.content.strip()
-@retry(
-    stop=stop_after_attempt(5),
-    wait=wait_exponential(multiplier=1, min=2, max=10),
-)
-def get_embedding(text):
-    response = client.embeddings.create(input=text, model="text-embedding-3-small")
-    return [emb.embedding for emb in response.data]
-@retry(
-    stop=stop_after_attempt(5),
-    wait=wait_exponential(multiplier=1, min=2, max=10),
-)
-def audio_text(text, audio_path):
-    with open(audio_path, "wb") as f:
-        for chunk in audio_client.tts(
-            TTSRequest(reference_id=INFO_audio_ID, text=text)
-        ):
-            f.write(chunk)
-    return audio_path
 def save_feedback(value, liked):
     if liked:
         md_text = "text:\n" + value + "\n" + "liked"
@@ -209,6 +51,7 @@ def save_feedback(value, liked):
 class knowledge_class:
     def __init__(self):
         self.knowledge_data = load_pkl(PKL_FILE)
         self.reference_dict = self.get_reference_dict()
         # q_v = self.knowledge_data["2024-10-09-プロジェクト計画で重要視すること"][
         #     "vector"
@@ -239,75 +82,12 @@ class knowledge_class:
         return reference_dict
     def get_new_knowledge(self):
-        print(f"[{datetime.now()}] 記事の検出を開始します...")
-        processed_urls = set()
-        for _, v in self.knowledge_data.items():
-            processed_urls.add(v.get("url", ""))
-        new_blog_infos = []
-        page_url = BASE_URL
-        while page_url:
-            try:
-                html = get_page_content(page_url)
-            except Exception as e:
-                print(f"{page_url} の取得時にエラーが発生しました: {e}")
-                break
-            infos, soup = parse_homepage(html)
-            new_infos = [info for info in infos if info["link"] not in processed_urls]
-            new_blog_infos.extend(new_infos)
-            page_url = get_next_page_url(soup)
-        if not new_blog_infos:
-            print("新しい記事は見つかりませんでした。")
-        else:
-            print(
-                f"新規記事 {len(new_blog_infos)} 件を検出しました。処理を開始します..."
-            )
-            new_blog_infos.sort(key=sort_key)
-            for info in new_blog_infos:
-                blog_url = info["link"]
-                pub_date = info["date"]
-                print(f"記事を処理中: {blog_url}")
-                try:
-                    title, content = fetch_blog_text(blog_url)
-                    key_name = f"{pub_date}-{sanitize_filename(title)}"
-                    print("記事:", key_name)
-                    summary_text = generate_item(content, SUMMARY_Prompt)
-                    style_text = generate_item(content, STYLE_Prompt)
-                    segment_texts = [
-                        seg.strip()
-                        for seg in generate_item(content, SEGMENT_Prompt).split("\n\n")
-                        if seg.strip()
-                    ]
-                    texts_vector = get_embedding([title, summary_text] + segment_texts)
-                    audio_path = f"./resource/{key_name}.mp3"
-                    audio_path = audio_text(summary_text, audio_path)
-                    dict_item = {
-                        key_name: {
-                            "title": title,
-                            "text": content,
-                            "url": blog_url,
-                            "style": style_text,
-                            "summary": summary_text,
-                            "audio": audio_path,
-                            "segments": segment_texts,
-                            "vector": texts_vector,
-                        }
-                    }
-                except Exception as e:
-                    print(f"{blog_url} の処理中にエラーが発生しました: {e}")
-                    raise
-                self.knowledge_data.update(dict_item)
-                time.sleep(1)
-                save_pkl(PKL_FILE, self.knowledge_data)
-                time.sleep(1)
-                self.knowledge_data = load_pkl(PKL_FILE)
-                self.reference_dict = self.get_reference_dict()
-                time.sleep(1)
-        print(f"PKLファイルの更新が完了しました。新規記事数: {len(new_blog_infos)}")
     def find_top_info(self, question_vector):
         results = []
@@ -351,7 +131,7 @@ class knowledge_class:
             s_text=full_prompt,
         )
         answer_text = generate_item(
-            user_prompt, SYS_Prompt, model="gpt-4o", temperature=temperature
         )
         md_text = user_prompt + "\n 応答: \n" + answer_text
         timestamp = int(time.time() * 1000)
@@ -414,7 +194,7 @@ class knowledge_class:
         #         "./resource/2025-03-03-機嫌良く働くと仕事は上手く進む.mp3",
         #     )
-        rewrite_question = generate_item(rw_prompt, REWRITE_SYS_Prompt, model="gpt-4o")
         print("rewrite_question:", rewrite_question)
         # prompt = DEFAULT_TEMPLATE.format(chat_history=chat_history, question=query)
         # get rewrite question
@@ -433,7 +213,7 @@ class knowledge_class:
         answer_text = generate_item(
             user_prompt,
             SYS_Prompt,
-            model="gpt-4o",
             temperature=temperature,
         )
         md_text = user_prompt + "\n 応答: \n" + answer_text

 import time
 import numpy as np
+from api import PKL_FILE, generate_item, get_embedding, load_pkl, save_pkl
+from get_blog import get_new_blog_content, get_old_blog_content
 from prompts import (
     Common_text,
     Creative_text,
     QA_Prompt_template,
     REWRITE_Prompt,
     REWRITE_SYS_Prompt,
     Short_text,
     SYS_Prompt,
 )
 def max_cosine_similarity(v1, v2_list):
     """ """
     return np.max(similarities)
 def save_feedback(value, liked):
     if liked:
         md_text = "text:\n" + value + "\n" + "liked"
 class knowledge_class:
     def __init__(self):
         self.knowledge_data = load_pkl(PKL_FILE)
+        # print(self.knowledge_data)
         self.reference_dict = self.get_reference_dict()
         # q_v = self.knowledge_data["2024-10-09-プロジェクト計画で重要視すること"][
         #     "vector"
         return reference_dict
     def get_new_knowledge(self):
+        self.knowledge_data = get_old_blog_content(self.knowledge_data)
+        self.knowledge_data = get_new_blog_content(self.knowledge_data)
+        save_pkl(PKL_FILE, self.knowledge_data)
+        self.knowledge_data = load_pkl(PKL_FILE)
+        self.reference_dict = self.get_reference_dict()
+        print("PKLファイルの更新が完了しました。")
     def find_top_info(self, question_vector):
         results = []
             s_text=full_prompt,
         )
         answer_text = generate_item(
+            user_prompt, SYS_Prompt, model="gpt-4.1", temperature=temperature
         )
         md_text = user_prompt + "\n 応答: \n" + answer_text
         timestamp = int(time.time() * 1000)
         #         "./resource/2025-03-03-機嫌良く働くと仕事は上手く進む.mp3",
         #     )
+        rewrite_question = generate_item(rw_prompt, REWRITE_SYS_Prompt, model="gpt-4.1")
         print("rewrite_question:", rewrite_question)
         # prompt = DEFAULT_TEMPLATE.format(chat_history=chat_history, question=query)
         # get rewrite question
         answer_text = generate_item(
             user_prompt,
             SYS_Prompt,
+            model="gpt-4.1",
             temperature=temperature,
         )
         md_text = user_prompt + "\n 応答: \n" + answer_text

design.py CHANGED Viewed

@@ -72,7 +72,7 @@ title_html = """
 <div style="display: flex; align-items: center; justify-content: center;">
     <img src="https://cdn.profile-image.st-hatena.com/users/saratoga623/profile.png?1728512391" style="width:50px; height:50px; margin-right:10px;">
     <h1 style="margin: 0;">
-        <a href="https://saratoga623.hatenablog.com/" target="_blank" style="text-decoration: none; color: inherit;">
             プロジェクトマネジメント勉強会 <i class='fa fa-lightbulb-o'></i>
         </a>
     </h1>

 <div style="display: flex; align-items: center; justify-content: center;">
     <img src="https://cdn.profile-image.st-hatena.com/users/saratoga623/profile.png?1728512391" style="width:50px; height:50px; margin-right:10px;">
     <h1 style="margin: 0;">
+        <a href="https://note.com/saratoga623" target="_blank" style="text-decoration: none; color: inherit;">
             プロジェクトマネジメント勉強会 <i class='fa fa-lightbulb-o'></i>
         </a>
     </h1>

get_blog.py ADDED Viewed

	@@ -0,0 +1,289 @@

+import re
+import time
+from datetime import datetime
+from urllib.parse import urljoin
+import requests
+from bs4 import BeautifulSoup
+from api import PKL_FILE, audio_text, generate_item, get_embedding, load_pkl, save_pkl
+from prompts import SEGMENT_Prompt, STYLE_Prompt, SUMMARY_Prompt
+OLD_BASE_URL = "https://saratoga623.hatenablog.com/"
+NEW_API_URL = "https://note.com/api/v2/creators/saratoga623/contents?kind=note&page="
+def get_page_content(page_url):
+    response = requests.get(page_url)
+    response.encoding = response.apparent_encoding
+    return response.text
+def parse_homepage(html):
+    soup = BeautifulSoup(html, "html.parser")
+    articles = soup.find_all("article", class_="entry")
+    blog_infos = []
+    for article in articles:
+        title_tag = article.find("h1", class_="entry-title")
+        if title_tag and title_tag.find("a"):
+            link = urljoin(OLD_BASE_URL, title_tag.find("a")["href"])
+        else:
+            continue
+        # <time> タグ内から公開日を取得する
+        time_tag = article.find("time")
+        if time_tag:
+            year_tag = time_tag.find("span", class_="date-year")
+            month_tag = time_tag.find("span", class_="date-month")
+            day_tag = time_tag.find("span", class_="date-day")
+            if year_tag and month_tag and day_tag:
+                pub_date = f"{year_tag.get_text(strip=True)}-{month_tag.get_text(strip=True)}-{day_tag.get_text(strip=True)}"
+            else:
+                pub_date = "unknown_date"
+        else:
+            pub_date = "unknown_date"
+        blog_infos.append({"date": pub_date, "link": link})
+    return blog_infos, soup
+def get_next_page_url(soup):
+    next_page_tag = soup.find("a", string="次のページ")
+    if next_page_tag and next_page_tag.has_attr("href"):
+        return urljoin(OLD_BASE_URL, next_page_tag["href"])
+    return None
+def sort_key(info):
+    try:
+        return datetime.strptime(info["date"], "%Y-%m-%d")
+    except Exception:
+        return datetime.max
+def fetch_blog_text(url):
+    response = requests.get(url)
+    response.encoding = response.apparent_encoding
+    soup = BeautifulSoup(response.text, "html.parser")
+    title_tag = soup.find("h1", class_="entry-title")
+    title = title_tag.get_text(strip=True) if title_tag else "no_title"
+    content_tag = soup.find("div", class_="entry-content")
+    if not content_tag:
+        content_tag = soup.find("div", class_="hatenablog-entry")
+    if content_tag:
+        for a in content_tag.find_all("a", class_="keyword"):
+            a.unwrap()
+        content = content_tag.get_text(strip=False).strip()
+    else:
+        content = ""
+    return title, content
+def sanitize_filename(filename):
+    return re.sub(r'[\\/*?:"<>|]', "", filename)
+def get_article_content(url):
+    """"""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        print(f"{url} error: {e}")
+        return None
+    soup = BeautifulSoup(response.text, "lxml")
+    content_div = soup.find(
+        "div", {"data-name": "body", "class": "note-common-styles__textnote-body"}
+    ) or soup.find("div", class_="o-noteContent__body")
+    if content_div:
+        content = content_div.get_text(separator="\n", strip=True)
+        return content
+    else:
+        print("{url} can not find content")
+        return None
+def get_old_blog_content(knowledge_data):
+    print(f"[{datetime.now()}]: {OLD_BASE_URL}中に、記事の検出を開始します...")
+    processed_urls = set()
+    for _, v in knowledge_data.items():
+        processed_urls.add(v.get("url", ""))
+    new_blog_infos = []
+    page_url = OLD_BASE_URL
+    while page_url:
+        try:
+            html = get_page_content(page_url)
+        except Exception as e:
+            print(f"{page_url} の取得時にエラーが発生しました: {e}")
+            break
+        infos, soup = parse_homepage(html)
+        new_infos = [info for info in infos if info["link"] not in processed_urls]
+        new_blog_infos.extend(new_infos)
+        page_url = get_next_page_url(soup)
+        time.sleep(1)
+    if not new_blog_infos:
+        print(f"{OLD_BASE_URL}中に、新しい記事は見つかりませんでした。")
+    else:
+        print(
+            f"{OLD_BASE_URL}中に、新規記事 {len(new_blog_infos)} 件を検出しました。処理を開始します..."
+        )
+        new_blog_infos.sort(key=sort_key)
+        for info in new_blog_infos:
+            blog_url = info["link"]
+            pub_date = info["date"]
+            print(f"記事を処理中: {blog_url}")
+            try:
+                title, content = fetch_blog_text(blog_url)
+                key_name = f"{pub_date}-{sanitize_filename(title)}"
+                print("記事:", key_name)
+                summary_text = generate_item(content, SUMMARY_Prompt)
+                style_text = generate_item(content, STYLE_Prompt)
+                segment_texts = [
+                    seg.strip()
+                    for seg in generate_item(content, SEGMENT_Prompt).split("\n\n")
+                    if seg.strip()
+                ]
+                texts_vector = get_embedding([title, summary_text] + segment_texts)
+                audio_path = f"./resource/{key_name}.mp3"
+                audio_path = audio_text(summary_text, audio_path)
+                dict_item = {
+                    key_name: {
+                        "title": title,
+                        "text": content,
+                        "url": blog_url,
+                        "style": style_text,
+                        "summary": summary_text,
+                        "audio": audio_path,
+                        "segments": segment_texts,
+                        "vector": texts_vector,
+                    }
+                }
+            except Exception as e:
+                print(f"{blog_url} の処理中にエラーが発生しました: {e}")
+                raise
+            knowledge_data.update(dict_item)
+            time.sleep(1)
+            save_pkl(PKL_FILE, knowledge_data)
+            time.sleep(1)
+            knowledge_data = load_pkl(PKL_FILE)
+            time.sleep(1)
+        print(
+            f"{OLD_BASE_URL}中に、新規記事の更新が完了しました。記事数: {len(new_blog_infos)}"
+        )
+        return knowledge_data
+def get_new_blog_content(knowledge_data):
+    print(f"[{datetime.now()}]: {NEW_API_URL}中に、記事の検出を開始します...")
+    all_articles = []
+    processed_title = set()
+    for _, v in knowledge_data.items():
+        processed_title.add(v.get("title", ""))
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    }
+    page = 1
+    while True:
+        api_url = f"{NEW_API_URL}{page}"
+        try:
+            response = requests.get(api_url, headers=headers, timeout=10)
+            response.raise_for_status()
+        except requests.RequestException as e:
+            print(f"{api_url} の取得時にエラーが発生しました: {e}")
+            break
+        json_data = response.json()
+        # check last
+        is_last_page = json_data.get("data", {}).get("isLastPage", True)
+        if is_last_page:
+            break
+        notes = json_data.get("data", {}).get("contents", [])
+        if not notes:
+            break
+        for item in notes:
+            if isinstance(item, dict):
+                note_url = item.get("noteUrl")
+                title = item.get("name")
+                publish_at = item.get("publishAt")
+                if title in processed_title:
+                    continue
+                if note_url and title:
+                    print(f"Note URL: {note_url}, Title: {title}, Time: {publish_at}")
+                    all_articles.append(
+                        {
+                            "title": title,
+                            "url": note_url,
+                            "timestamp": publish_at,  # 加入时间戳
+                        }
+                    )
+        page += 1
+        time.sleep(1)
+    if not all_articles:
+        print(f"{NEW_API_URL}中に、新しい記事は見つかりませんでした。")
+    else:
+        print(
+            f"{NEW_API_URL}中に、新規記事 {len(all_articles)} 件を検出しました。処理を開始します..."
+        )
+        for article in all_articles:
+            title = article["title"]
+            timestamp = article["timestamp"]
+            url = article["url"]
+            print(f"記事を処理中: {url}")
+            try:
+                content = get_article_content(url)
+                if not content:
+                    continue
+                key_name = f"{timestamp[:10]}-{sanitize_filename(title)}"  ##TODO
+                print("記事:", key_name)
+                summary_text = generate_item(content, SUMMARY_Prompt)
+                style_text = generate_item(content, STYLE_Prompt)
+                segment_texts = [
+                    seg.strip()
+                    for seg in generate_item(content, SEGMENT_Prompt).split("\n\n")
+                    if seg.strip()
+                ]
+                texts_vector = get_embedding([title, summary_text] + segment_texts)
+                audio_path = f"./resource/{key_name}.mp3"
+                audio_path = audio_text(summary_text, audio_path)
+                dict_item = {
+                    key_name: {
+                        "title": title,
+                        "text": content,
+                        "url": url,
+                        "style": style_text,
+                        "summary": summary_text,
+                        "audio": audio_path,
+                        "segments": segment_texts,
+                        "vector": texts_vector,
+                    }
+                }
+            except Exception as e:
+                print(f"{url} の処理中にエラーが発生しました: {e}")
+                raise
+            knowledge_data.update(dict_item)
+            time.sleep(1)
+            save_pkl(PKL_FILE, knowledge_data)
+            time.sleep(1)
+            knowledge_data = load_pkl(PKL_FILE)
+            time.sleep(1)
+        print(
+            f"{NEW_API_URL}中に、新規記事の更新が完了しました。記事数: {len(all_articles)}"
+        )
+        return knowledge_data

prompts.py CHANGED Viewed

@@ -1,38 +1,41 @@
 SYS_Prompt = """
-あなたは言語の専門家であり、さまざまな話し方や思考パターンを分析し、模倣し、適応することに長けています。
 """
 SUMMARY_Prompt = """
-あなたは文章の内容を分析する専門家です。
-以下の文章の要約を作成してください。要約は100字以内で簡潔にまとめてください。
 """
 STYLE_Prompt = """
-あなたは文章の内容を分析する専門家です。
-以下の文章内容を分析して、
-1. 原文の作者の文体を抽出してください。例：敬語、語気、構造、用語など代表的なもの
-2. 原文の作者の思考パターンを抽出してください。例：論理構成、推論、態度など代表的なもの
 """
 SEGMENT_Prompt = """
-あなたは文章の内容を分析する専門家です。
-以下の文章を内容に基づいて意味的に段落に分割してください。各段落は独立した文章で、段落ごとに改行を入れてください。
 """
 QA_Prompt_template = """
-ユーザーの質問に応じて表現を的確に調整し、自然でスムーズな対話を実現し、\
-検索された情報の内容、文脈、コミュニケーションスタイルに適した応答を提供してください。
-【ユーザーの質問】
 {q_text}
-【検索された情報】
 {r_text}
-【タスク要求】
 - {c_text}
-- 原文の作者の文体を模倣すること。
-- 原文の思考パターンを模倣すること。
 - {s_text}
-- 文章風ではなく、対話形式で答えること。
 """
 REWRITE_SYS_Prompt = """
 Given a conversation (between Human and Assistant) and a follow up message from Human, \
 rewrite the message to be a standalone question that captures all relevant context \
@@ -48,28 +51,32 @@ REWRITE_Prompt = """
 <Standalone question>
 """
 QA_chat_Prompt_template = """
-ユーザーの最新質問に応じて表現を的確に調整し、自然でスムーズな対話を実現し、\
-検索された情報の内容、文脈、コミュニケーションスタイルに適した応答を提供してください。
-【ユーザーの会話履歴】
 {h_text}
-【ユーザーの最新質問】
 {q_text}
-【検索された情報】
 {r_text}
-【タスク要求】
 - {c_text}
-- 原文の作者の文体を模倣すること。
-- 原文の作者の思考パターンを模倣すること。
 - {s_text}
-- 文章風ではなく、対話形式で応答内容だけ、答えること。
 """
-Creative_text = """検索された情報内容を参考に、質問に会話の形で回答すること。
-    必要に応じて検索された情報にとらわれず、自由な発想や独自の視点を取り入れて回答してください。
-    創造性を重視し、会話を豊かにすることを意識してください。
 """
-Common_text = """検索された情報内容を参考に、質問に会話の形で回答すること。
 """
-Short_text = """簡潔な回答を心がけること。3-5文程度で要点をまとめることを意識してください。
 """
-Full_text = """情報の抜けや曖昧な部分がないように、包括的かつ丁寧に回答すること。背景情報や前提条件を補足し、論理的な流れを意識して展開すること。回答は複数の段落で構成し、それぞれの段落が明確なテーマに基づいていること。読者が疑問を抱かず、安心して理解できるよう配慮すること。
 """

 SYS_Prompt = """
+You are a specialist in linguistic analysis, skilled at identifying, emulating, and adapting diverse communication styles and patterns of thought. Always respond in Japanese.
 """
 SUMMARY_Prompt = """
+You are an expert in text analysis. Summarize the following passage in Japanese, using no more than 100 characters.
 """
 STYLE_Prompt = """
+You are an expert in text analysis. For the text below, identify:
+1. Key features of the author’s writing style (e.g., honorific usage, tone, sentence structure, terminology).
+2. The author’s patterns of reasoning (e.g., logical organization, inference methods, attitude).
+Present your analysis in Japanese.
 """
 SEGMENT_Prompt = """
+You are an expert in text analysis. Divide the following text into semantically coherent paragraphs—each representing a distinct idea. Separate paragraphs with a blank line. Provide the output in Japanese.
 """
 QA_Prompt_template = """
+Fine-tune your wording to match the user’s question precisely, creating a natural and seamless conversational flow. Provide answers that are appropriate to the content, context, and communication style of the retrieved information.
+User Question:
 {q_text}
+Retrieved Information:
 {r_text}
+Task Requirements:
 - {c_text}
+- Emulate the original author’s writing style.
+- Emulate the original author’s thought process.
 - {s_text}
+- Respond in a dialogue format rather than as a narrative.
+- Preserve the output format: reply only with the response text in Japanese, without any labels such as “ユーザー：” or “AI：”.
 """
 REWRITE_SYS_Prompt = """
 Given a conversation (between Human and Assistant) and a follow up message from Human, \
 rewrite the message to be a standalone question that captures all relevant context \
 <Standalone question>
 """
 QA_chat_Prompt_template = """
+Based on the user’s latest query, dynamically adjust your phrasing to ensure natural, seamless conversation flow. Tailor your response to the content, context, and communication style of the retrieved information.
+<Conversation History>
 {h_text}
+<User’s Latest Question>
 {q_text}
+<Retrieved Information>
 {r_text}
+<Task Requirements>
 - {c_text}
+- Imitate the original author’s writing style.
+- Mirror the original author’s train of thought.
 - {s_text}
+- Respond purely in dialogue form—do not include labels such as “User:” or “AI:”; return only the reply.
+Ensure your answer itself is written in Japanese.
 """
+Creative_text = """
+Use the retrieved information as a reference for your conversational reply. Feel free to go beyond the source material by incorporating original ideas or perspectives where appropriate. Emphasize creativity to enrich the dialogue.
 """
+Common_text = """
+Use the retrieved information as a reference and answer in dialogue form.
 """
+Short_text = """
+Keep your response concise. Aim for 3–5 sentences that capture the key points.
 """
+Full_text = """
+Provide a comprehensive and thorough answer without gaps or ambiguity. Include any necessary background or assumptions, structure your response into clear thematic paragraphs, and ensure logical progression. Anticipate and address potential follow-up questions so the reader feels fully informed.
 """

requirements.txt CHANGED Viewed

@@ -3,4 +3,5 @@ openai
 # zyphra
 tenacity
 beautifulsoup4
-fish-audio-sdk

 # zyphra
 tenacity
 beautifulsoup4
+fish-audio-sdk
+lxml

resource/2025-04-08-変化を阻害するもの.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff86e41a2c419f8286f7a48736e4142d3bf5c0819ae834a1fe650e12a06b709a
+size 221100

resource/2025-04-09-目的を意識するのはスキルです.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2605e3afaa416572dc75364dc79e1c7c68cc86d5a8b44625cecd0ba19aac49bc
+size 258298

resource/2025-04-10-文章をダイエットさせる.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:586e275ea22a3a9eb4964945e94cb6ead76fbe5e1062d7354bf0e558ccf2c9c3
+size 190171

resource/2025-04-11-仕事は決めてなんぼ.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9948713ab28621a872e0b84dc1d4b33dd547a501cf3477ceeff215adf22d5973
+size 242834

resource/2025-04-12-精神論も大事です.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa8040cbf2b3c9ae7b482cb333e11cf3ef00974f64e073e35732f5ae6d980332
+size 247431

resource/2025-04-14-ヒューマンエラーはゼロになりません.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:569d89b9692bf1d2f3ad1b67d8afb3a996149fee1541e1066655db10d1b9e048
+size 224444

resource/2025-04-15-良い行動で良い結果を出す.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a476984f8f79892a7bc5cbd95a6bae7bf579a14b514d538cea108ed86b1a449
+size 266657

resource/2025-04-16-新幹線の運休と障害訓練.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcd9956b30632bdcc5590309d1bd50944ca53fc841439f2d6226addd08fb40ca
+size 183484

resource/2025-04-17-バッファと保険の違い.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eeb419576c21edaf02ebdd7bc58ff8dccd90f3e7ca808dbcb1cf431f328069d3
+size 228623

resource/2025-04-18-無理難題に対する対処.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a369971d3b6b626adbf37bdfcf836164266b6f48b05fb2b4926552100578bb2
+size 274599

resource/2025-04-19-2位じゃダメなんでしょうか？.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34765042bdef737a0d7274525ab70f70515cc7da0c20a57b2d932136de4a97b1
+size 224862

resource/2025-04-21-例え話でピンとくる説明をAIで実現.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2de3a8ba7824c9bfe1a6c931b64286593d6fa740d94541e7f35b73645925f185
+size 239908

resource/2025-04-22-人を変えるのではなく、関わり方を変える.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3acf7f665821a6b09634451971ad2be0f15e4ac235a9bb2d7464b0ac9673a47d
+size 191425

resource/2025-04-23-ひとつの経験から多くを学ぶ.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c58f535b88c8f9c9a48e206f4cc0c4a4f8b8c159536da741fbe8dba1c995b3e
+size 223608

resource/2025-04-24-失敗の目的.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:411825a6b0eb816f6b43ae7363d6c270a53b92acd57e24ddf307fd0cd215413a
+size 213159

resource/2025-04-25-計画書ではふわっとした表現は禁止です.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d5bd87b1653ed1ef852e9e016ba04df33a7e6dce7a5c96a551ffaea35488c8c4
+size 271255

resource/2025-04-28-シンプルな判断・決断が出来るように.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46ab982dafc173cdd9ca1e2e5b8a015921eeea701e2967309b7842da656c5fdc
+size 161750

resource/2025-04-28-使える情報と使えない情報.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6d0a0bd0c9e40dadf4ed2902fe85c91961246fe6faaf9fb815049a98271bca86
+size 215249

resource/2025-04-30-資料は紙芝居のように。ストーリーを作る重要性。.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d74ff1cfc9a013ad4149f0575001b73d8ac4a47ccdc853e7e3a21ac37da4e02
+size 264986

resource/2025-05-01-マネジメントが難しい理由.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22926c0aff32767900e6cb2678542bcb68478205ce8ed22afe45d20162dccaae
+size 199784

resource/2025-05-02-情報を使える情報にする方法.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b10762ddcc2b4af7133dad1eaeceb2ef0312d07056658c537bc771f5c1afd59
+size 230295

resource/2025-05-07-お知らせとお詫び.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1f96bbf44c58e4d092b9df39ad8000a150ae0c7dccff1db593a9fb83dfdc52d
+size 204800

resource/2025-05-07-深く考えることは脳にとって重労働。でも大事なこと。.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63c992a6f7c411a40c7fa079b7b14c64e65aa0c598e8be1def671482a93c141a
+size 243670

resource/2025-05-08-脱！上司と部下の板挟み.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f79f2e79020bb052bf6cfe445502341435cd51726fa0eac76e378c8ed9a61be
+size 241998

resource/2025-05-09-ゴールから逆引きする発想.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30fe1e280281396c6f2ec4fbf6e8b8814036d4248f94ac18c753859308e6b5a0
+size 280032

resource/2025-05-12-伝わらない前提で考える.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cd3e6df79252edd81d5e42465e4ab749c941da8bfea733a3a3e8ba6e3e648c4
+size 231549

resource/2025-05-13-イライラする時は自分の強みを見つけるチャンス.mp3 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6cabb8e05785f213d944aff385bea13e6158a5ef73b54c97d309c070b534da67
+size 209397

resource/knowledge_data.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c8e7fe703ddd333803071b013edeae1da7d0d39d5c9cbf66ca6ed2c7ff82112b
-size 18144304

 version https://git-lfs.github.com/spec/v1
+oid sha256:7a6b00127c041be6e19f00b16d16dd5236ed5d03ec65273cc5f6b13c40f013b3
+size 21481467