Spaces:
Sleeping
Sleeping
Commit ·
aa52b0f
0
Parent(s):
Duplicate from Yasu777/BabyWriter3
Browse files- .gitattributes +34 -0
- README.md +15 -0
- app.py +215 -0
- first.py +229 -0
- requirements.txt +17 -0
- run_third.py +9 -0
- second.py +45 -0
- third.py +99 -0
.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: BabyWriter
|
| 3 |
+
emoji: 📉
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: pink
|
| 6 |
+
python_version: 3.9.13
|
| 7 |
+
pip_version: 23.1.2
|
| 8 |
+
sdk: streamlit
|
| 9 |
+
sdk_version: 1.21.0
|
| 10 |
+
app_file: app.py
|
| 11 |
+
pinned: false
|
| 12 |
+
duplicated_from: Yasu777/BabyWriter3
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
import subprocess
|
| 4 |
+
from urllib.parse import urlparse
|
| 5 |
+
import warnings
|
| 6 |
+
from bs4 import BeautifulSoup
|
| 7 |
+
import re
|
| 8 |
+
from tinydb import TinyDB, Query
|
| 9 |
+
from googleapiclient.discovery import build
|
| 10 |
+
import urllib.request
|
| 11 |
+
import urllib.error
|
| 12 |
+
import datetime
|
| 13 |
+
|
| 14 |
+
@st.cache_resource
def get_top_urls_and_keyword(keyword):
    """Run a Google Custom Search for *keyword* and return the top result URLs.

    Results are cached by Streamlit (@st.cache_resource), so repeated calls
    with the same keyword do not consume extra API quota.

    Args:
        keyword: search query string (Japanese expected; `lr="lang_ja"`).

    Returns:
        tuple[list[str], str]: (urls, keyword). `urls` holds at most three
        links and may be shorter (or empty) when the search yields fewer
        results; the caller checks `len(urls) < 3` and reports the problem.
    """
    # Credentials are injected via environment variables (Space secrets).
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    CUSTOM_SEARCH_ENGINE_ID = os.getenv("CUSTOM_SEARCH_ENGINE_ID")

    # Query the Google Custom Search API, restricted to Japanese pages.
    service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
    response = service.cse().list(
        q=keyword,
        cx=CUSTOM_SEARCH_ENGINE_ID,
        lr="lang_ja",
        num=3,
        start=1
    ).execute()

    # BUG FIX: the API omits the "items" key entirely when there are no
    # results, so indexing response["items"] raised KeyError. Fall back to
    # an empty list so the caller's length check handles the situation.
    urls = [item['link'] for item in response.get("items", [])[:3]]

    return urls, keyword
|
| 35 |
+
def get_valid_url(urls):
    """Return the first URL in *urls* whose page downloads successfully and
    whose visible text looks Japanese; return None when none qualifies.

    Failures are logged to stdout and the next candidate is tried.
    """
    for url in urls:
        try:
            response = urllib.request.urlopen(url)
            charset = response.headers.get_content_charset()
            # BUG FIX: get_content_charset() returns None when the server
            # sends no charset; bytes.decode(None) raises TypeError, so
            # fall back to UTF-8.
            html = response.read().decode(charset or "utf-8")
            soup = BeautifulSoup(html, "html.parser")
            # Accept only pages whose extracted text contains Japanese.
            if is_japanese_text(soup.get_text()):
                return url
        # BUG FIX: HTTPError is a subclass of URLError, so it must be
        # caught FIRST — the original order made this branch unreachable.
        except urllib.error.HTTPError as e:
            print(f"HTTPエラー: {e.code}")
        except urllib.error.URLError as e:
            print(f"URLエラー: {e.reason}")
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit); still deliberately best-effort.
        except Exception:
            print("予期せぬエラーが発生しました。")

    return None
|
| 54 |
+
|
| 55 |
+
def is_japanese_text(text):
    """Return True when *text* contains at least one Japanese character.

    BUG FIX: the original pattern used \\p{Hiragana}-style Unicode property
    escapes, which Python's `re` module does not support — every call raised
    `re.error: bad escape \\p`. Equivalent explicit Unicode code-point
    ranges are used instead.
    """
    # Hiragana, Katakana, CJK ideographs (incl. Ext. A), plus common
    # Japanese punctuation marks.
    japanese_pattern = r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBFー〜、。「」【】]"
    return bool(re.search(japanese_pattern, text))
|
| 60 |
+
def is_valid_html(html):
    """Heuristically validate *html*: parse it with BeautifulSoup and
    report True only when the parser emitted no warnings."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        BeautifulSoup(html, 'html.parser')
        # An empty warning list means the markup parsed cleanly.
        return not caught
|
| 66 |
+
# データベースへの接続を確立
|
| 67 |
+
db = TinyDB("db.json")
|
| 68 |
+
|
| 69 |
+
# 1日前のタイムスタンプを取得
|
| 70 |
+
current_time = datetime.datetime.now()
|
| 71 |
+
one_day_ago = current_time - datetime.timedelta(days=1)
|
| 72 |
+
|
| 73 |
+
# データベースから1日前のタイムスタンプ以前のログを削除
|
| 74 |
+
db.remove(Query().timestamp.test(lambda x: datetime.datetime.fromisoformat(x) <= one_day_ago))
|
| 75 |
+
|
| 76 |
+
# Webアプリのタイトルと説明
|
| 77 |
+
st.title("記事生成ウェブアプリ")
|
| 78 |
+
st.write("このアプリは、与えられたキーワードを使用して記事を生成します。")
|
| 79 |
+
|
| 80 |
+
# キーワード入力
|
| 81 |
+
new_keyword = st.text_input("キーワード:")
|
| 82 |
+
|
| 83 |
+
# キーワードごとにデータを保存するための識別子
|
| 84 |
+
keyword_id = re.sub(r"\W+", "", new_keyword) if new_keyword else None
|
| 85 |
+
|
| 86 |
+
# データベースから前回のキーワードを取得
|
| 87 |
+
last_keyword = db.search(Query().keyword_id.exists())
|
| 88 |
+
|
| 89 |
+
if new_keyword and (not last_keyword or last_keyword[0]['keyword_id'] != keyword_id):
|
| 90 |
+
|
| 91 |
+
# キーワードが変更された場合は、データベースから前回の結果を削除
|
| 92 |
+
if last_keyword:
|
| 93 |
+
db.remove(Query().keyword_id == last_keyword[0]['keyword_id'])
|
| 94 |
+
# output1.txt、output2.txt、output3.txtの内容をクリアする
|
| 95 |
+
with open("output1.txt", "w") as f:
|
| 96 |
+
f.write("")
|
| 97 |
+
with open("output2.txt", "w") as f:
|
| 98 |
+
f.write("")
|
| 99 |
+
with open("output3.txt", "w") as f:
|
| 100 |
+
f.write("")
|
| 101 |
+
# output0-1.txt、output0-2.txt、output0-3.txtを削除する
|
| 102 |
+
for i in range(1, 4):
|
| 103 |
+
filename = f"output0-{i}.txt"
|
| 104 |
+
if os.path.exists(filename):
|
| 105 |
+
os.remove(filename)
|
| 106 |
+
|
| 107 |
+
# 新しいキーワードが入力されたときにGoogle検索を行う
|
| 108 |
+
if new_keyword:
|
| 109 |
+
urls, keyword = get_top_urls_and_keyword(new_keyword)
|
| 110 |
+
if len(urls) < 3: # Google検索の結果が3つ以上であることを確認
|
| 111 |
+
st.error("Google検索の結果が3つ未満です。別のキーワードを試してみてください。")
|
| 112 |
+
else:
|
| 113 |
+
url1, url2, url3 = urls
|
| 114 |
+
|
| 115 |
+
if keyword_id: # キーワードIDが存在することを確認
|
| 116 |
+
# 出力欄
|
| 117 |
+
output1 = st.empty()
|
| 118 |
+
output2 = st.empty()
|
| 119 |
+
output3 = st.empty()
|
| 120 |
+
|
| 121 |
+
# データベースから編集済みの "output2.txt" を読み込む処理を追加
|
| 122 |
+
result = db.search((Query().name == "output2.txt") & (Query().keyword_id == keyword_id))
|
| 123 |
+
if result:
|
| 124 |
+
editable_output2 = result[0]["content"]
|
| 125 |
+
else:
|
| 126 |
+
editable_output2 = ""
|
| 127 |
+
|
| 128 |
+
# runボタン
|
| 129 |
+
if st.button("記事構成作成", key=f"run_button_{keyword_id}"):
|
| 130 |
+
with st.spinner("タイトル・見出し作成中..."):
|
| 131 |
+
urls, keyword = get_top_urls_and_keyword(new_keyword)
|
| 132 |
+
url1, url2, url3 = urls
|
| 133 |
+
|
| 134 |
+
# 重複チェックと同じサイト内のページチェッ���
|
| 135 |
+
parsed_urls = [urlparse(url) for url in urls]
|
| 136 |
+
if len(urls) != len(set(urls)):
|
| 137 |
+
st.error("異なるURLを入力してください。")
|
| 138 |
+
st.stop()
|
| 139 |
+
elif len(set([url.netloc for url in parsed_urls])) != len(urls):
|
| 140 |
+
st.error("異なるサイトのURLを入力してください。")
|
| 141 |
+
st.stop()
|
| 142 |
+
|
| 143 |
+
subprocess.run(["python3", "first.py", url1, url2, url3])
|
| 144 |
+
|
| 145 |
+
with open("output1.txt", "r", encoding="utf-8") as f:
|
| 146 |
+
content = f.read()
|
| 147 |
+
# "関連するテキスト部分:"とそれ以降の部分を削除
|
| 148 |
+
content = re.sub(r"\n関連するテキスト部分:.*", "", content, flags=re.DOTALL)
|
| 149 |
+
output1.text(content)
|
| 150 |
+
db.upsert({"name": "output1.txt", "content": content, "keyword_id": keyword_id},
|
| 151 |
+
(Query().name == "output1.txt") & (Query().keyword_id == keyword_id)) # データベースに結果を保存
|
| 152 |
+
|
| 153 |
+
subprocess.run(["python3", "second.py", keyword])
|
| 154 |
+
with open("output2.txt", "r", encoding="utf-8") as f:
|
| 155 |
+
editable_output2 = f.read()
|
| 156 |
+
soup = BeautifulSoup(editable_output2, "html.parser")
|
| 157 |
+
h_tags = soup.find_all(re.compile("^h[1-3]$"))
|
| 158 |
+
output2.text(editable_output2)
|
| 159 |
+
existing_docs = db.search((Query().name == "output2.txt") & (Query().keyword_id == keyword_id))
|
| 160 |
+
if existing_docs:
|
| 161 |
+
db.update(
|
| 162 |
+
{"content": editable_output2, "tags": str(h_tags)},
|
| 163 |
+
doc_ids=[doc.doc_id for doc in existing_docs],
|
| 164 |
+
)
|
| 165 |
+
else:
|
| 166 |
+
db.insert(
|
| 167 |
+
{"name": "output2.txt", "content": editable_output2, "tags": str(h_tags),
|
| 168 |
+
"timestamp": current_time.isoformat(), "keyword_id": keyword_id}
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
st.success("処理が完了しました。")
|
| 172 |
+
|
| 173 |
+
# 編集欄を表示し、編集後の内容をeditable_output2に更新
|
| 174 |
+
editable_output2 = st.text_area("output2.txtを編集してください:", value=editable_output2)
|
| 175 |
+
|
| 176 |
+
# Plan-and-Execute Agentsのrun_third.py経由の処理
|
| 177 |
+
if st.button("記事作成"):
|
| 178 |
+
with st.spinner("記事作成中..."):
|
| 179 |
+
subprocess.run(["python3", "run_third.py", editable_output2, keyword_id])
|
| 180 |
+
|
| 181 |
+
# output3.txtの内容を読み込み、出力欄に表示
|
| 182 |
+
with open("output3.txt", "r", encoding="utf-8") as f:
|
| 183 |
+
content = f.read()
|
| 184 |
+
output3.text(content)
|
| 185 |
+
|
| 186 |
+
# 保存ボタン
|
| 187 |
+
if st.button("保存"):
|
| 188 |
+
# h2, h3タグのリミット
|
| 189 |
+
h2_limit = 5
|
| 190 |
+
h3_limit = 10
|
| 191 |
+
|
| 192 |
+
# 編集後のテキストからh2, h3タグの数をカウント
|
| 193 |
+
soup = BeautifulSoup(editable_output2, "html.parser")
|
| 194 |
+
h2_count = len(soup.find_all("h2"))
|
| 195 |
+
h3_count = len(soup.find_all("h3"))
|
| 196 |
+
|
| 197 |
+
# h2, h3タグの数がリミットを超えていないかを確認
|
| 198 |
+
if h2_count > h2_limit or h3_count > h3_limit:
|
| 199 |
+
st.error(f"h2タグの数が{h2_limit}を、h3タグの数が{h3_limit}を超えています。")
|
| 200 |
+
elif not is_valid_html(editable_output2):
|
| 201 |
+
st.error("入力されたテキストは正しいHTML形式ではありません。")
|
| 202 |
+
else:
|
| 203 |
+
content = editable_output2
|
| 204 |
+
with open("output2.txt", "w", encoding="utf-8") as f:
|
| 205 |
+
f.write(content)
|
| 206 |
+
db.upsert({"name": "output2.txt", "content": content, "timestamp": current_time.isoformat(),
|
| 207 |
+
"keyword_id": keyword_id}, (Query().name == "output2.txt") & (Query().keyword_id == keyword_id)) # データベースに変更を保存
|
| 208 |
+
st.write("output2.txt に変更が保存されました。")
|
| 209 |
+
|
| 210 |
+
# クリアボタン
|
| 211 |
+
if st.button("データクリア"):
|
| 212 |
+
db.remove(Query().keyword_id == keyword_id)
|
| 213 |
+
st.write("データベースがクリアされました。")
|
| 214 |
+
else:
|
| 215 |
+
st.warning("キーワードを入力してください。")
|
first.py
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import requests
|
| 5 |
+
import re
|
| 6 |
+
import jaconv
|
| 7 |
+
import sys
|
| 8 |
+
import openai
|
| 9 |
+
from janome.tokenizer import Tokenizer
|
| 10 |
+
from bs4 import BeautifulSoup
|
| 11 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 12 |
+
from sklearn.decomposition import LatentDirichletAllocation
|
| 13 |
+
import langchain
|
| 14 |
+
from langchain import OpenAI
|
| 15 |
+
from langchain.text_splitter import TokenTextSplitter
|
| 16 |
+
from langchain.prompts import PromptTemplate
|
| 17 |
+
from langchain.chains import LLMChain
|
| 18 |
+
from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain
|
| 19 |
+
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
|
| 20 |
+
from typing import Any, List, Mapping, Optional
|
| 21 |
+
from langchain.chat_models import ChatOpenAI
|
| 22 |
+
import cchardet
|
| 23 |
+
|
| 24 |
+
# Configure the OpenAI API key from the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")

# The three competitor-page URLs are passed on the command line by app.py.
url1 = sys.argv[1]
url2 = sys.argv[2]
url3 = sys.argv[3]

urls = [url1, url2, url3]

# File recording URLs that previously failed to download.
error_url_file = "error_urls.txt"

# Load previously failed URLs so they can be skipped below.
try:
    with open(error_url_file, "r") as f:
        error_urls = f.read().splitlines()
except FileNotFoundError:
    error_urls = []

texts = []          # extracted page texts, filled by the loop below
num_topics = 3      # number of LDA topics
tfidf_threshold = 0.1  # minimum TF-IDF score for a keyword to be kept
n_top_words = 10    # top-N keywords extracted per topic

# Japanese stop words (particles, auxiliaries, common function words).
stop_words = ["こちら","の", "に", "は", "を", "た", "が", "で", "て", "と", "し", "れ", "さ", "ある", "いる", "も", "する", "から", "な", "こと", "として", "い", "や", "れる", "など", "なっ", "ない", "この", "ため", "その", "あっ", "よう", "また", "もの", "という", "あり", "まで", "られ", "なる", "へ", "か", "だ", "これ", "によって", "により", "おり", "より", "による", "ず", "なり", "られる", "において", "ば", "なかっ", "なく", "しかし", "について", "せ", "だっ", "その後", "できる", "それ", "う", "ので", "なお", "のみ", "でき", "き", "つ", "における", "および", "いう", "さらに", "でも", "ら", "たり", "その他", "または", "ながら", "つつ", "とも", "これら", "ところ", "ここ", "です", "ます", "ましょ", "ください"]

# Initialize the janome morphological tokenizer.
t = Tokenizer()
|
| 53 |
+
def url_to_filepath(url):
    """Derive a filesystem-safe name from *url*.

    Strips the "https://" prefix and replaces path/query separators
    ("/", "?", "&") with underscores.
    """
    sanitized = url.replace("https://", "")
    for separator in ("/", "?", "&"):
        sanitized = sanitized.replace(separator, "_")
    return sanitized
|
| 56 |
+
def extract_text_from_url(url, output_file):
    """Download *url*, extract the text of its <p> tags, and save it.

    The raw HTML is cleaned (long digit runs and <table> blocks removed,
    width normalization via jaconv, no-break spaces replaced) before
    parsing. The extracted text is written to *output_file* using the
    detected encoding.

    Returns:
        The extracted text, or None when an unexpected error occurred
        (the failing URL is then appended to error_urls.txt).

    Side effects: mutates the module-level `error_urls` list and rewrites
    the error-URL file on failure.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises stored HTTPError, if one occurred.
        # Detect the page encoding from the raw bytes.
        encoding = cchardet.detect(response.content)['encoding']
        response.encoding = encoding
        text = response.text
        # Drop runs of 3+ digits (phone numbers, ids) and whole <table> blocks.
        text = re.sub(r"\d{3,}", "", text)
        text = re.sub(r"<table.*?/table>", "", text, flags=re.DOTALL)
        # Normalize character widths: digits/ASCII to full-width, kana to half...
        # then kana back to full-width and ASCII to half-width.
        text = jaconv.h2z(text, kana=False, digit=True, ascii=True)
        text = jaconv.z2h(text, kana=True, digit=False, ascii=True)

        # Replace no-break spaces with regular spaces.
        text = text.replace('\xa0', ' ')

        soup = BeautifulSoup(text, "html.parser")
        p_tags = soup.find_all("p")
        output_text = ""
        for p in p_tags:
            output_text += p.get_text()
        output_text = output_text.replace("\n", "")

        # Replace no-break spaces with regular spaces (again, post-parse).
        output_text = output_text.replace('\xa0', ' ')

        output_dir = os.path.dirname(os.path.abspath(output_file))
        os.makedirs(output_dir, exist_ok=True)  # ensure the directory exists

        with open(output_file, "w", encoding=encoding) as f:
            f.write(output_text)

        return output_text
    except requests.HTTPError as http_err:
        # NOTE(review): HTTP errors are only logged — the URL is NOT added
        # to error_urls here, and the function implicitly returns None.
        print(f'HTTP error occurred: {http_err}')  # Python 3.6
    except Exception as err:
        print(f'Other error occurred: {err}')  # Python 3.6
        # Record the failing URL so later runs can skip it.
        error_urls.append(url)
        with open(error_url_file, "w") as f:
            for error_url in error_urls:
                f.write(error_url + "\n")
        return None
|
| 99 |
+
def extract_text_from_urls(urls: List[str]) -> List[str]:
    """Extract (or re-use cached) page text for every URL in *urls*.

    Each URL i is cached in `output0-{i+1}.txt`; when the file already
    exists its contents are read back instead of re-downloading.

    Returns:
        List of successfully extracted texts (failed URLs are skipped).
    """
    extracted_texts = []
    for i, url in enumerate(urls):
        output_file = f"output0-{i+1}.txt"
        if os.path.exists(output_file):
            # Cache hit: re-use the previously extracted text.
            print(f"File already exists: {output_file}")
            with open(output_file, "r", encoding="utf-8") as f:
                text = f.read()
        else:
            print(f"Extracting text from: {url}")
            text = extract_text_from_url(url, output_file)
        # Skip failures (None). NOTE(review): extract_text_from_url never
        # returns the literal "エラー", so that comparison looks vestigial.
        if text and text != "エラー":
            extracted_texts.append(text)
    print("Extracted texts:", extracted_texts)  # debug output
    return extracted_texts
|
| 115 |
+
# Skip URLs that previously failed.
urls = [url for url in urls if url not in error_urls]

extracted_texts = extract_text_from_urls(urls)

combined_text = ""  # concatenation of all page texts
for i, url in enumerate(urls):
    output_file = f"output0-{i+1}.txt"
    # NOTE(review): this re-extracts every URL even though
    # extract_text_from_urls just did so, and a None return (download
    # failure) would make the += below raise TypeError — confirm intended.
    output_text = extract_text_from_url(url, output_file)
    texts.append(output_text)
    combined_text += output_text + " "  # append to the combined text

# LDA over the combined text.
combined_text = combined_text.lower()  # lowercase the text
tokens = [token.surface for token in t.tokenize(combined_text)]  # tokenize
words = [word for word in tokens if word not in stop_words]  # drop stop words

if words:
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform([' '.join([token.surface for token in t.tokenize(text)]) for text in texts])
    print("Number of texts:", len(texts))  # number of documents
    print("Shape of X:", X.shape)  # TF-IDF matrix shape
    feature_names = vectorizer.get_feature_names_out()

    # Fit LDA on the TF-IDF matrix.
    lda = LatentDirichletAllocation(n_components=num_topics)
    X_lda = lda.fit_transform(X)

    # Extract top keywords for each topic.
    topic_keywords = [[] for _ in range(num_topics)]  # Store topic keywords
    for topic_idx, topic in enumerate(lda.components_):
        top_keyword_indices = topic.argsort()[:-n_top_words - 1:-1]
        topic_keywords[topic_idx].extend([feature_names[i] for i in top_keyword_indices])

    # Write topic keywords to output1.txt.
    with open("output1.txt", "w", encoding="utf-8") as f:
        f.write("出現頻度の高いキーワードTOP{} :\n".format(n_top_words))
        f.write("\n".join([", ".join(topic) for topic in topic_keywords]))
        f.write("\n\n")
else:
    print("No words found for LDA processing.")

# Per-document TF-IDF vectorization.
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform([' '.join([token.surface for token in t.tokenize(text)]) for text in texts])
feature_names = vectorizer.get_feature_names_out()

# Collect features whose TF-IDF score meets the threshold, per document.
high_tfidf_features = []
for text_id in range(len(texts)):
    text = texts[text_id].lower()  # lowercase the text
    tokens = [token.surface for token in t.tokenize(text)]  # tokenize
    words = [word for word in tokens if word not in stop_words]  # drop stop words

    if not words:
        continue

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform([' '.join(words)])  # build vocabulary from this text
    feature_names = vectorizer.get_feature_names_out()

    feature_index = X.nonzero()[1]
    top_keywords = [feature_names[i] for i in feature_index if X[0, i] >= tfidf_threshold][:n_top_words]
    high_tfidf_features.append(top_keywords)

# Write high TF-IDF features to output1.txt.
# NOTE(review): only `top_keywords` from the LAST loop iteration is written;
# `high_tfidf_features` (all documents) is collected but never used — this
# looks like a bug to confirm with the author.
with open("output1.txt", "a", encoding="utf-8") as f:
    f.write("重要なキーワード:\n")
    f.write(", ".join(top_keywords))
    f.write("\n\n")

# Extract text subjects via a LangChain map-reduce summarization chain.
model_name = "gpt-3.5-turbo-0613"
llm = ChatOpenAI(model_name=model_name, temperature=0.7)
text_splitter = TokenTextSplitter(chunk_size=5000, chunk_overlap=500)
document_splits = []

for file_path in ["output0-1.txt", "output0-2.txt", "output0-3.txt"]:
    with open(file_path, "rb") as file:
        content = file.read()
        # Detect each cached file's encoding before decoding.
        encoding = cchardet.detect(content)['encoding']
        if encoding is None:
            print(f"Warning: Could not determine encoding for {file_path}. File might contain binary data. Skipping this file.")
            continue
        try:
            text = content.decode(encoding)
            document_splits.extend(text_splitter.create_documents([text]))
        except UnicodeDecodeError:
            print(f"Error: Failed to decode {file_path} using {encoding}. Skipping this file.")
            continue

# Prompt asking the model to extract each document's subject (in Japanese).
prompt_subject = PromptTemplate(
    input_variables=["text"],
    template="""Text: {text}
Textの主題を抽出し、主題:〇〇という形で教えてください。Please tell me in Japanese.:
*主題:
*"""
)

chain_subject = LLMChain(llm=llm, prompt=prompt_subject, verbose=True)
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=chain_subject,
    combine_document_chain=StuffDocumentsChain(llm_chain=chain_subject, verbose=True),
    verbose=True
)

subjects = map_reduce_chain.run(input_documents=document_splits, token_max=50000)
print(subjects)

# Append the extracted subjects to output1.txt.
# NOTE(review): `subjects` is a string here, so this loop writes it
# character by character (each followed by a space) — confirm intended.
with open("output1.txt", "a", encoding="utf-8") as f:
    f.write("主題:\n")
    for subject in subjects:
        f.write(subject + " ")
    f.write("\n")
|
requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.23.1
|
| 2 |
+
openai==0.27.2
|
| 3 |
+
beautifulsoup4==4.11.2
|
| 4 |
+
lxml==4.9.2
|
| 5 |
+
jaconv==0.3.4
|
| 6 |
+
requests==2.28.2
|
| 7 |
+
tinydb==4.7.1
|
| 8 |
+
scikit-learn==1.2.2
|
| 9 |
+
janome==0.4.2
|
| 10 |
+
langchain==0.0.228
|
| 11 |
+
tiktoken==0.4.0
|
| 12 |
+
google-api-python-client==2.87.0
|
| 13 |
+
oauth2client<4.0.0
|
| 14 |
+
cchardet==2.1.7
|
| 15 |
+
wikipedia==1.4.0
|
| 16 |
+
asyncio==3.4.3
|
| 17 |
+
momento==1.6.1
|
run_third.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
import third

# Command-line arguments passed by app.py: the edited article outline
# (HTML h1/h2/h3 headings) and the keyword identifier.
editable_output2 = sys.argv[1]
keyword_id = sys.argv[2]

if __name__ == "__main__":
    import asyncio
    # Drive third.main(), which researches the outline with an agent and
    # writes the finished article to output3.txt.
    asyncio.run(third.main(editable_output2, keyword_id))
|
second.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-

import openai
import os
import sys

# Configure the OpenAI API key from the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Search keyword, passed on the command line by app.py.
keyword = sys.argv[1]

# Read the keyword/topic summary produced by first.py.
with open("output1.txt", "r", encoding="utf-8") as f:
    output1_text = f.read()

# Ask the chat model to generate a PREP-structured article outline
# (title / headings / sub-headings as <h1>/<h2>/<h3> HTML) for the keyword.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-0613",
    messages=[
        {"role": "system", "content": "あなたはライティングの専門家です。PREP法(結論・理由・具体例・結論の流れ)に則って見出しを構成してください。テキストは必ず日本語で生成してください。テキスト内に似た文章を作らないでください。同じ単語は3回まで使っていいものとします。"},
        {"role": "user", "content": f'(検索キーワード)の感情分析を行った結果に基づいて、興味や関心を引きつける「(検索キーワード)を含めたタイトル」、興味や関心を引きつける「見出し」を生成してください。そしてそれぞれの見出しに沿って「小見出し」を適宜生成してください。タイトル、見出し、小見出しはPREP法に基づいて一貫性のあるものとしてください。タイトル、見出しおよび小見出しは順に「<h1>(検索キーワード)を含めたタイトル</h1>」「<h2>見出し</h2>」「<h3>1-1 小見出し</h3>、<h3>1-2 小見出し</h3>」という形にしてください。タイトル、見出し、そして小見出しは検索者が求める情報を整理して記載するようにしてください。トピッククラスターモデルを意識し、(テキスト)内の異なるトピックに明確な関連性がある場合は異なるトピックをまとめ「記事を一つ作成=タイトルを一つ作成」します。異なるトピックに明確な関連性がない場合は「記事を適宜複数作成=タイトルを適宜複数作成」してください。(テキスト)および(検索キーワード)から検索ニーズを考慮してください。なお、似たような見出しは避けてください。テキスト:\"{output1_text}\" 検索キーワード:\"{keyword}\"'
        },
    ],
    temperature=0.7,
    max_tokens=2000,

)
result = response.choices[0]["message"]["content"].strip()

# Write the generated outline to output2.txt.
with open("output2.txt", "w", encoding="utf-8") as f:
    f.write(result)

# Sanity check: confirm output2.txt now exists.
if os.path.exists("output2.txt"):
    print("output2.txt exists.")
else:
    print("output2.txt does not exist.")

# Echo the generated outline to stdout for logging.
with open("output2.txt", "r", encoding="utf-8") as f:
    content = f.read()
print("Content of output2.txt:")
print(content)
|
third.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-

import os
import openai
import json
from langchain.chat_models import ChatOpenAI
from langchain.experimental.plan_and_execute import PlanAndExecute, load_agent_executor, load_chat_planner
from langchain.llms import OpenAI
from langchain.utilities import GoogleSearchAPIWrapper
from langchain.agents.tools import Tool
from bs4 import BeautifulSoup
import asyncio
from datetime import timedelta

# API keys and the custom search engine id come from the environment.
openai.api_key = os.getenv("OPENAI_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
CUSTOM_SEARCH_ENGINE_ID = os.getenv("CUSTOM_SEARCH_ENGINE_ID")

async def main(editable_output2, keyword_id):
    """Research the outline's topics with a Plan-and-Execute agent, then
    write a finished article to output3.txt.

    Args:
        editable_output2: HTML outline text (h1/h2/h3 headings) edited in app.py.
        keyword_id: keyword identifier passed through from run_third.py
            (accepted but not otherwise used in this function).
    """
    search = GoogleSearchAPIWrapper(google_cse_id=CUSTOM_SEARCH_ENGINE_ID, google_api_key=GOOGLE_API_KEY)

    # Single web-search tool available to the agent.
    tools = [
        Tool(
            name = "Search",
            func=search.run,
            description="useful for when you need to answer questions about current events"
        ),
    ]

    # Planner, Executor, and Agent definitions.
    model_name = "gpt-3.5-turbo-16k"
    llm = ChatOpenAI(model_name=model_name, temperature=0, max_tokens=1000)
    planner = load_chat_planner(llm)
    executor = load_agent_executor(llm, tools, verbose=True)
    agent = PlanAndExecute(planner=planner, executor=executor, verbose=True,
                           suffix='Answer should be in Japanese.')

    # editable_output2 is the text of the article structure (h1, h2, h3...).
    soup = BeautifulSoup(editable_output2, 'html.parser')

    # Use the text of the h1 tag as the purpose.
    # NOTE(review): soup.find('h1') returns None when the outline has no
    # <h1>, which would raise AttributeError here — confirm the upstream
    # validation always guarantees one.
    h1_text = soup.find('h1').get_text()

    # Use the text of the h2 tags as part of the purpose.
    h2_texts = [h2.get_text() for h2 in soup.find_all('h2')]

    # Use the text of the h3 tags as part of the purpose.
    h3_texts = [h3.get_text() for h3 in soup.find_all('h3')]

    # Generate the research purpose from the headings.
    purpose = f"about {h1_text}, focusing particularly on {' and '.join(h2_texts)}, to investigate the latest information and details"

    # Specialize the purpose by topic category detected in the headings.
    if "人物" in h1_text or any("人物" in h2_text for h2_text in h2_texts):
        # Person topic: research the name and career.
        purpose += " including the person's name and career"
    elif "商品" in h1_text or any("商品" in h2_text for h2_text in h2_texts):
        # Product topic: research brand, product name, and price.
        purpose += " including the brand name, product name, and price"
    elif "イベント" in h1_text or any("イベント" in h2_text for h2_text in h2_texts):
        # Event topic: research content, schedule, and venue.
        purpose += " including the event's content, schedule, and venue"

    # Convert the purpose into an instruction in the form of a question.
    instruction = f"Can you research {purpose} and include specific details such as names, ages, careers, product names, service names, store names, locations, and times in your response?"

    # Run the agent to gather the research material.
    output_text = agent.run(instruction)

    # System message: writer persona.
    system_message = {
        "role": "system",
        "content": "あなたはプロのライターです。"
    }

    # User message: write intro + body per heading, close with a summary,
    # using the researched material.
    user_message = {
        "role": "user",
        "content": f'"{h1_text}"という記事タイトルに沿った導入文を日本語で作成してください。その後、各見出し"{", ".join(h2_texts)}"についての導入文を作成してください。導入文はそれぞれの見出しの直下にある"{", ".join(h3_texts)}"の内容を考慮に入れて作成してください。その後、"{", ".join(h3_texts)}"についての詳細な本文を提供してください。各セクションは、読者の興味を引き、記事の主題を明確に示すものであるべきです。最後に、記事全体のまとめ、<h2>まとめ</h2>としてクローズしてください。以下に取得した情報を示します:{output_text}'
    }

    # Generate the article text with the ChatCompletion API.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=[system_message, user_message],
        temperature=0.7,
    )
    result = response.choices[0]["message"]["content"]

    # Save the generated article to output3.txt.
    with open('output3.txt', 'w', encoding='utf-8') as f:
        if isinstance(result, dict):
            f.write(json.dumps(result, ensure_ascii=False, indent=4))
        else:
            f.write(result)

    # Print the generated article for logging.
    print(result)
|