# BabyWriterPRO.v-llama3 / article_generator.py
# (Hugging Face page header residue: uploaded by Yasu777, "Update article_generator.py",
#  commit 5cf4648, verified — kept as a comment so the module stays importable.)
import os
import json
import requests
from bs4 import BeautifulSoup
import gradio as gr
from concurrent.futures import ThreadPoolExecutor, as_completed
from groq import Groq
# Configure the Groq API client (API key is read from the GROQ_API_KEY env var;
# Groq() receives api_key=None when the variable is unset)
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Custom tool wrapping the Tavily search API
class EnhancedTavilySearchTool:
    """Thin wrapper around the Tavily REST search endpoint.

    All queries are joined with " | " into a single request to keep the
    number of API calls down.
    """

    # Endpoint hoisted to a class constant so it is defined in one place.
    SEARCH_URL = 'https://api.tavily.com/search'

    def search(self, queries):
        """Run one combined search for *queries*.

        Parameters
        ----------
        queries : list[str]
            Query strings; joined into a single combined query.

        Returns
        -------
        list
            The API's ``results`` list, or ``[]`` when the response body is
            malformed or lacks a ``results`` key.

        Raises
        ------
        Exception
            If the HTTP status code is not 200.
        """
        combined_query = " | ".join(queries)
        # Very short queries are padded out (presumably the API rejects or
        # mishandles them — TODO confirm against Tavily docs).
        if len(combined_query) < 5:
            combined_query += " details"
        params = {
            'api_key': os.getenv('TAVILY_API_KEY'),
            'query': combined_query,
            'max_results': 20,
            'detail_level': 'high',
            'search_depth': 'advanced'
        }
        # Fix: a timeout is supplied so a hung connection cannot block the
        # caller forever (the original call had no timeout at all).
        response = requests.post(self.SEARCH_URL, json=params, timeout=30)
        if response.status_code == 200:
            try:
                data = response.json()
            except ValueError:
                print("JSON レスポンスのデコードエラー")
                return []
            if 'results' in data:
                return data['results']
            print("警告: レスポンスに 'results' キーが存在しません")
            return []
        raise Exception(
            f"Tavily APIからのデータ取得に失敗しました: {response.status_code}, {response.text}"
        )
# Helper that removes duplicate entries
def remove_duplicates(text_list):
    """Return *text_list* with duplicates dropped, keeping first occurrences."""
    # dict preserves insertion order (Python 3.7+), so building a dict keyed
    # by the entries keeps the first occurrence of each and discards repeats.
    return list(dict.fromkeys(text_list))
# Collect the initial data with a Tavily search
def perform_initial_tavily_search(h2_texts, h3_texts):
    """Build one search query per <h2> section and run a Tavily search.

    Parameters
    ----------
    h2_texts : list[str]
        Section headings.
    h3_texts : list[str]
        Sub-headings; expected to be prefixed "<section-number>-" so they can
        be matched to their parent <h2> (presumably an upstream naming
        convention — TODO confirm).

    Returns
    -------
    dict
        Mapping of query string -> search response (or a fallback string).
    """
    tavily_search_tool = EnhancedTavilySearchTool()
    queries = []
    for idx, h2_text in enumerate(h2_texts):
        # Sub-headings belonging to this section share the "N-" prefix.
        h3_for_this_h2 = [h3 for h3 in h3_texts if h3.startswith(f"{idx+1}-")]
        if not h3_for_this_h2 and h2_text.strip() != "まとめ":  # sections without h3s are skipped, except "まとめ"
            print(f"No matching h3 elements found for h2: {h2_text} at index {idx+1}")
            continue
        query = f"{h2_text} {' '.join(h3_for_this_h2)}"
        queries.append(query)
    print("Performing Tavily search with queries:", queries)
    responses = tavily_search_tool.search(queries)
    response_dict = {}
    # NOTE(review): search() issues ONE combined request, so `responses` is a
    # flat result list for the whole batch — pairing responses[i] with
    # queries[i] presumably misattributes results to queries; verify against
    # the Tavily API response shape.
    for i, query in enumerate(queries):
        if i < len(responses):  # stay within the bounds of the response list
            response_dict[query] = responses[i]
        else:
            response_dict[query] = "No response received"
    return response_dict
def save_preloaded_tavily_data(data):
    """Write *data* to preloaded_tavily_data.json as pretty-printed UTF-8 JSON."""
    target = "preloaded_tavily_data.json"
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, ensure_ascii=False, indent=4))
    print("Preloaded Tavily data saved.")
def load_preloaded_tavily_data():
    """Read and return the cached Tavily data from preloaded_tavily_data.json."""
    with open("preloaded_tavily_data.json", "r", encoding="utf-8") as handle:
        # Log before parsing, matching the original ordering of side effects.
        print("Preloaded Tavily data loaded.")
        return json.load(handle)
# Helper that generates text through the Groq API
def generate_text_with_groq(prompt):
    """Send *prompt* to the llama3-70b-8192 model and return the stripped reply."""
    system_message = {
        "role": "system",
        "content": "You are a professional web researcher and writer. You provide detailed information on the requested content, using place notes or tables appropriately. You avoid direct copying or similar expressions from the data and create your own content. You excel at crafting witty sentences, always starting each sentence with a different word and consistently using the polite 'desu/masu' style in Japanese. You do not preface your responses with 'The following is an original text continuing from XX.' You always respond in Japanese without adding unnecessary explanations or supplementary information.",
    }
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        messages=[system_message, user_message],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=3000
    )
    return completion.choices[0].message.content.strip()
# Collect the text sitting beneath each <h3> tag
def find_texts_under_h3(soup):
    """Map each <h3> tag's id to the list of <p> texts before the next <h3>."""
    collected = {}
    for heading in soup.find_all('h3'):
        paragraphs = []
        sibling = heading.find_next_sibling()
        # Walk forward through siblings until the next <h3> (or the end).
        # Truthiness (not `is None`) is kept deliberately: an empty bs4 Tag
        # is falsy, and the original stopped on it too.
        while sibling and sibling.name != 'h3':
            if sibling.name == 'p':
                paragraphs.append(sibling.get_text(strip=True))
            sibling = sibling.find_next_sibling()
        collected[heading.get('id')] = paragraphs
    return collected
# Expand each article section with the Groq API
def expand_h3_sections(soup, h3_texts, preloaded_data):
    """Generate an extra paragraph under every <h3> in *soup* via the Groq API.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed article; mutated in place and also returned.
    h3_texts : dict
        Mapping of h3 id -> list of existing paragraph texts
        (as produced by find_texts_under_h3).
    preloaded_data : dict
        Cached Tavily search results keyed by query string.
    """
    h3_elements = soup.find_all('h3')
    def process_h3_element(h3):
        h3_text = h3.get_text(strip=True)
        section_id = h3.get('id', None)
        if section_id is None:
            print(f"Warning: h3 element '{h3_text}' has no ID.")
            return None
        # Fetch any text already sitting under this heading
        existing_texts = h3_texts.get(section_id, [])
        last_sentence = existing_texts[-1] if existing_texts else ""
        # NOTE(review): preloaded_data keys are the "<h2> <h3 list>" query
        # strings built in perform_initial_tavily_search; the key
        # "<h3 text> <section id>" presumably never matches, leaving context
        # empty — verify.
        context = preloaded_data.get(f"{h3_text} {section_id}", "")
        # Prompt includes the existing text and its last sentence so the
        # generated continuation flows on from what is already there.
        prompt = f"{h3_text}の内容を踏まえて、'{last_sentence} {' '.join(existing_texts)}'に続く内容のオリジナルの文章をステップバイステップで生成し、状況に応じて適宜、箇所書きや表を使って、直接的なコピーまたは近いフレーズを避けて日本語に翻訳しなさい。あなたの返答や見出しは必要なく、そのまま文章やテキストのみを固定観念や偏見を排して生成しなさい。文体は「ですます調」に統一しなさい。参考リンクを[数字]形式で文中に挿入し、リンクは文章の最後にまとめて「References:」として列挙し、各リンクを[数字] https://link.example.com の形式で提示しなさい。こちらが背景情報です:\n{context}"
        expanded_text = generate_text_with_groq(prompt)
        new_paragraph = soup.new_tag('p')
        new_paragraph.string = expanded_text
        # Find the element directly after the h3 tag
        next_sibling = h3.find_next_sibling()
        if next_sibling:
            next_sibling.insert_after(new_paragraph)  # insert only when a following element exists
        else:
            h3.parent.append(new_paragraph)  # otherwise append directly to the h3's parent
    # Up to 5 sections are expanded concurrently; each worker mutates a
    # different part of the tree. NOTE(review): BeautifulSoup is not
    # documented as thread-safe — confirm concurrent insert_after is sound.
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(process_h3_element, h3) for h3 in h3_elements]
        for future in as_completed(futures):
            future.result()
    return soup
# Expand <h2> sections that have no <h3> children (e.g. the summary)
def process_standalone_h2(soup):
    """Append a generated paragraph after each standalone <h2> in *soup*.

    A <h2> counts as "standalone" when no later sibling is an <h3>.
    NOTE(review): find_next_sibling matches ANY later <h3> sibling, not just
    ones belonging to this section, so in practice only trailing sections
    (like 'まとめ') qualify — confirm this is the intended behaviour.
    """
    # Fix: the original re-read preloaded_tavily_data.json from disk inside
    # the loop for every matching <h2>; load it lazily, at most once.
    preloaded_data = None
    for h2 in soup.find_all('h2'):
        if not h2.find_next_sibling(lambda tag: tag.name == 'h3'):
            # Handle sections (such as 'まとめ') that carry no <h3> tags
            if preloaded_data is None:
                preloaded_data = load_preloaded_tavily_data()
            key = f"{h2.get_text()}"
            context = preloaded_data.get(key, "このセクションに関する具体的な情報はありません。")
            prompt = f"「{h2.get_text()}」について詳しく説明してください。こちらが背景情報です:\n{context}"
            expanded_text = generate_text_with_groq(prompt)
            new_paragraph = soup.new_tag('p')
            new_paragraph.string = expanded_text
            h2.insert_after(new_paragraph)
def process_summary_section(soup, cached_responses):
    """Insert cached summary content after the 'まとめ' <h2>, if one exists.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed article; mutated in place.
    cached_responses : dict
        Cached search / generation results keyed by heading text.
    """
    # Fix: `string=` replaces the `text=` keyword, which is a deprecated
    # alias in Beautiful Soup (same matching behaviour).
    summary_section = soup.find('h2', string='まとめ')
    if summary_section:
        # Pull the summary body from the search-result / AI-output cache
        summary_key = "まとめ"
        summary_data = cached_responses.get(summary_key, "まとめの具体的な内容は現在利用可能ではありません。")
        # NOTE(review): cached values may be result dicts rather than strings;
        # assigning a non-string to .string would fail — verify upstream.
        new_paragraph = soup.new_tag('p')
        new_paragraph.string = summary_data
        summary_section.insert_after(new_paragraph)
def generate_article(editable_output2):
    """Generate a full article from an HTML outline.

    Pipeline: parse the outline -> run a Tavily search for background data ->
    build per-section HTML prompts -> generate content in chunks via Groq ->
    expand each <h3> section -> append a reference list -> save and return.

    Parameters
    ----------
    editable_output2 : str
        HTML outline containing one <h1>, plus <h2>/<h3> headings.

    Returns
    -------
    tuple[str, str]
        (markdown text, final HTML string).
    """
    print("Starting article generation...")
    # Parse the outline HTML
    soup = BeautifulSoup(editable_output2, 'html.parser')
    h1_text = soup.find('h1').get_text()
    h2_texts = [h2.get_text() for h2 in soup.find_all('h2')]
    h3_texts = [h3.get_text() for h3 in soup.find_all('h3')]
    # Assign sequential ids ("h3-1", "h3-2", ...) to the <h3> tags
    def add_ids_to_h3(soup):
        h3_elements = soup.find_all('h3')
        for idx, h3 in enumerate(h3_elements, start=1):
            h3['id'] = f"h3-{idx}"
        return soup
    soup_with_ids = add_ids_to_h3(soup)
    # Initial Tavily search for background material
    print("Performing initial Tavily search...")
    cached_responses = perform_initial_tavily_search(h2_texts, h3_texts)
    save_preloaded_tavily_data(cached_responses)
    # Only dict-shaped responses carry a 'url' field; fallback strings are skipped
    reference_urls = [response.get('url', 'URL not found') for response in cached_responses.values() if isinstance(response, dict)]
    system_message = {
        "role": "system",
        "content": "You are a professional writer and programmer. You will follow the instructions below to create HTML content, ensuring that all sections retain the correct HTML tags and attributes, and that the id attribute is set correctly. You avoid direct copying or similar expressions from the data and create your own content. You excel at crafting witty sentences, always starting each sentence with a different word and consistently using the polite 'desu/masu' style in Japanese. You will always respond in Japanese."
    }
    research_summary = "\n".join([json.dumps(result) for result in cached_responses.values()])
    instructions = []
    # Build the HTML prompt, embedding the ids
    instructions.append(f"""
<h1 id="title">{h1_text}</h1>
<p>「{h1_text}」に関する導入文を日本語で作成し、固定観念や偏見を排して生成しなさい。文体は「ですます調」に統一しなさい。直接的なコピーまたは近いフレーズを避けて、オリジナルな独自のコンテンツにしなさい。</p>""")
    # Split the research dump on the Japanese full stop to get "sentences"
    sentences = research_summary.split('。')
    max_questions_per_h3 = 2
    for idx, h2_text in enumerate(h2_texts):
        # Sub-headings of section N are expected to start with "N-"
        h3_for_this_h2 = [h3 for h3 in h3_texts if h3.startswith(f"{idx+1}-")]
        instructions.append(f"""
<div id="section-{idx+1}">
<h2 id="h2-{idx+1}">{h2_text}</h2>
<p>「{h2_text}」に関する導入文を日本語で作成し、固定観念や偏見を排して生成しなさい。文体は「ですます調」に統一しなさい。この導入文は、以下の小見出しの内容を考慮し、オリジナルな独自のコンテンツにしなさい:{"、".join(h3_for_this_h2)}。</p>""")
        for h3_idx, h3 in enumerate(h3_for_this_h2):
            # Keep at most max_questions_per_h3 research sentences that mention this heading
            related_sentences = [sentence for sentence in sentences if h3 in sentence][:max_questions_per_h3]
            if related_sentences:
                content_for_h3 = "。".join(related_sentences) + "。"
                instructions.append(f"""
<h3 id="h3-{idx+1}-{h3_idx+1}">{h3}</h3>
<p>「{h3}」に関する詳細な内容として、文体は「ですます調」に統一し、以下の情報を固定観念や偏見を排し、オリジナルで独自のコンテンツを日本語で記述しなさい。:{content_for_h3}</p>""")
            else:
                instructions.append(f"""
<h3 id="h3-{idx+1}-{h3_idx+1}">{h3}</h3>
<p>「{h3}」に関する詳細な内容を日本語で記述してください。オリジナルな内容を心がけてください。</p>""")
        instructions.append("</div>")  # close the <div> at the end of each section
    # Split the prompt into chunks to respect the token limit
    split_instructions = []
    current_chunk = ""
    max_tokens_per_chunk = 8000  # chunk cap; measured in characters, used as a token proxy
    for instruction in instructions:
        if len(current_chunk + instruction) > max_tokens_per_chunk:
            split_instructions.append(current_chunk)
            current_chunk = instruction
        else:
            current_chunk += instruction
    if current_chunk:
        split_instructions.append(current_chunk)
    results = []
    for i, split_instruction in enumerate(split_instructions):
        user_message = {
            "role": "user",
            "content": f"{i+1}/{len(split_instructions)}: {split_instruction}"
        }
        try:
            print(f"Sending instruction chunk {i+1} of {len(split_instructions)} to Groq...")
            response = client.chat.completions.create(
                messages=[system_message, user_message],
                model="llama3-70b-8192",
                temperature=0.6,
                max_tokens=5000,
            )
            generated_text = response.choices[0].message.content
            print(f"Generated content for section {i+1}:")  # log each generated section
            print(generated_text)
            results.append(generated_text)
        except Exception as e:
            # Best-effort: the error text is appended in place of the section
            error_message = f"Error occurred during ChatCompletion: {str(e)}"
            print(error_message)  # log the error message
            results.append(error_message)
    final_result = "\n".join(results)
    print("Final generated article content:")  # log the complete article
    print(final_result)
    # Parse the freshly generated HTML
    updated_soup = BeautifulSoup(final_result, 'html.parser')
    # Collect the existing text under each <h3> of the ORIGINAL outline
    # (note: the ids there are "h3-N", while the generated HTML uses
    # "h3-N-M" — presumably a mismatch; verify the id conventions)
    h3_texts = find_texts_under_h3(soup_with_ids)
    # Expand every <h3> section of the generated article
    expanded_soup = expand_h3_sections(updated_soup, h3_texts, cached_responses)
    # Append the reference URLs to the body
    reference_section = expanded_soup.new_tag('div')
    reference_section['id'] = 'references'
    reference_section.append(expanded_soup.new_tag('h2'))
    reference_section.h2.string = "参考文献"
    reference_list = expanded_soup.new_tag('ul')
    for i, url in enumerate(reference_urls, 1):
        reference_item = expanded_soup.new_tag('li')
        reference_item.string = f"{i}. {url}"
        reference_list.append(reference_item)
    reference_section.append(reference_list)
    expanded_soup.append(reference_section)
    final_html = str(expanded_soup)
    final_markdown = custom_html_to_markdown(final_html)
    with open("output3.txt", "w", encoding="utf-8") as f:
        f.write(final_html)
    print("Article generation complete. Output saved to output3.txt.")
    return final_markdown, final_html
# Convert HTML to Markdown
def custom_html_to_markdown(html):
    """Convert article HTML into plain Markdown text.

    Parameters
    ----------
    html : str
        HTML fragment or document.

    Returns
    -------
    str
        Markdown-formatted text.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Drop <head> entirely: its contents are metadata, not article text.
    for tag in soup(['head']):
        tag.decompose()
    # Fix: the original called decompose() on <div> (and html/body) wrappers,
    # which DELETES their children — wiping out every "<div id='section-N'>"
    # section and the references <div> built by generate_article. unwrap()
    # removes only the wrapper tag and keeps its content in place.
    for tag in soup(['html', 'body', 'div']):
        tag.unwrap()
    # Per-tag conversion to Markdown syntax
    for h in soup.find_all('h1'):
        h.replace_with(f"# {h.get_text().strip()}\n\n")
    for h in soup.find_all('h2'):
        h.replace_with(f"## {h.get_text().strip()}\n\n")
    for h in soup.find_all('h3'):
        h.replace_with(f"### {h.get_text().strip()}\n\n")
    for p in soup.find_all('p'):
        p.replace_with(f"{p.get_text().strip()}\n\n")
    for li in soup.find_all('li'):
        li.replace_with(f"* {li.get_text().strip()}\n")
    # Collapse what remains into the final Markdown text
    return soup.get_text()
# Render content in the requested display format
def display_content(content, format):
    """Return *content* converted to Markdown when requested, else as-is HTML."""
    return custom_html_to_markdown(content) if format == "Markdown" else content
# Gradioアプリの設定
def setup_gradio_interface():
with gr.Blocks(css='''
.gr-markdown h1, .gr-markdown h2, .gr-markdown h3 {
word-wrap: break-word;
overflow-wrap: break-word;
}
''') as app:
with gr.Row():
format_selector = gr.Radio(choices=["Markdown", "HTML"], label="Display Format", value="Markdown")
content_display = gr.Markdown(label="Content", value="")
# 初期のHTMLコンテンツを想定
initial_html_content = "<h1>Welcome</h1><p>This is a sample paragraph in HTML.</p>"
initial_markdown_content = display_content(initial_html_content, "Markdown")
# 最新のコンテンツを保持するための状態
latest_content = gr.Variable(value=initial_html_content)
# 最初のコンテンツを設定
content_display.value = initial_markdown_content
# フォーマット選択器の変更イベントを設定
def update_content(format_choice):
# 最新のコンテンツに基づいて表示を更新
updated_content = display_content(latest_content.get(), format_choice)
return updated_content
format_selector.change(update_content, inputs=[format_selector], outputs=[content_display])