# fudii0921's picture
# Create app.py
# 1f4fa3b verified
import io
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
import gradio as gr
import re
import groq
import os
from dotenv import load_dotenv
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
# Load settings from a .env file (if any) before reading the environment below.
load_dotenv(verbose=True)
# Module-level Groq API client; the key is read from the GROQ_API_KEY
# environment variable (os.environ.get yields None when it is not set).
# NOTE(review): in the visible part of this file, `client` is referenced only
# inside a disabled, string-literal code section — confirm it is still needed.
client = groq.Client(api_key=os.environ.get("GROQ_API_KEY"))
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file using pdfminer.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text written by ``TextConverter`` for all pages, concatenated in
        page order, or ``None`` when no text at all was produced.

    Raises:
        OSError: If ``pdf_path`` cannot be opened (e.g. ``FileNotFoundError``).
        Any exception raised by pdfminer while parsing propagates unchanged.
    """
    # Open the file first so that a bad path fails before any pdfminer state
    # is created; both the file and the in-memory buffer are closed by the
    # ``with`` statement even if page processing raises.
    with open(pdf_path, "rb") as fh, io.StringIO() as text_buffer:
        resource_manager = PDFResourceManager()
        converter = TextConverter(resource_manager, text_buffer)
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
                page_interpreter.process_page(page)
            # Read the buffer before it is closed on leaving the ``with``.
            text = text_buffer.getvalue()
        finally:
            # Release the converter even when a page fails to parse.
            converter.close()
    # Keep the original contract: an empty extraction is reported as None.
    return text or None
def split_text(pdf_text, chunk_size=5000, sentence_endings=(".", "。")):
    """Split text into consecutive chunks that end on a sentence boundary.

    The text is stripped of surrounding whitespace, then consumed window by
    window. Each window of at most ``chunk_size`` characters is cut back to
    just after the last sentence-ending mark it contains; a window without
    any mark is kept whole. Concatenating the chunks reproduces the stripped
    text exactly.

    Args:
        pdf_text: Text to split. ``None`` or an empty string yields ``[]``.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        sentence_endings: Marks after which a chunk may end. Includes the
            Japanese full stop so Japanese text is not cut mid-sentence.

    Returns:
        A list of non-empty chunk strings in original order.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the loop could never
            make progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    text = (pdf_text or "").strip()
    chunks = []
    start = 0
    # Walk an offset through the text instead of re-slicing the remainder,
    # which would copy the whole tail on every iteration.
    while start < len(text):
        piece = text[start:start + chunk_size]
        cut = 0
        for mark in sentence_endings:
            pos = piece.rfind(mark)
            if pos != -1:
                cut = max(cut, pos + len(mark))
        if cut:
            piece = piece[:cut]
        chunks.append(piece)
        start += len(piece)
    return chunks
def summarize_pdf(pdf_path, sentence_count=10000):
    """Gradio callback: summarize an uploaded PDF with sumy's LSA summarizer.

    Args:
        pdf_path: Value delivered by the file input. Accepted forms are an
            object exposing the path as ``.name``, a plain path string, or
            ``None`` when no file is selected (presumably what Gradio sends
            when the upload is cleared — verify against the Gradio version
            in use).
        sentence_count: Upper bound on the number of sentences returned.
            The default of 10000 is the value the code has always passed;
            in practice it is large enough that most documents come back
            nearly in full.

    Returns:
        The selected sentences joined by newlines, or an empty string when
        there is no file or no text could be extracted from it.
    """
    if pdf_path is None:
        return ""

    # Support both file-like upload objects (``.name``) and plain path strings.
    path = getattr(pdf_path, "name", pdf_path)
    pdf_text = extract_text_from_pdf(path)
    print("pdftexts:", pdf_text)
    if not pdf_text:
        # The extractor reports "no text" as None; nothing to summarize.
        return ""

    parser = PlaintextParser.from_string(pdf_text, Tokenizer("japanese"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return "\n".join(str(sentence) for sentence in summary)
# NOTE(review): everything between the triple quotes below is a string
# literal, not executable code, so it never runs. It holds a disabled variant
# that sends each entry of `chunks` to the Groq chat-completions API
# (model "llama3-70b-8192") and concatenates the replies into `summary`.
# Consider deleting it or turning it into a real function if still needed.
'''
# 分割されたテキストデータを要約して出力する
summary = ""
for i in range(len(chunks)):
tmp = chunks[i]
print("tmp:",tmp)
completion = client.chat.completions.create(
model="llama3-70b-8192",
temperature=0,
messages=[
{"role": "system", "content": "貴方は優秀なアナリストです。"},
{"role": "user", "content": tmp+"を日本語に要約してください。要約したものは\n・を必ず先頭につけて箇条書きにしてください。 必ず、日本語で答えてください。"}
],
)
print("trans:",completion.choices[0].message.content)
# 要約結果をsummaryに追加する
summary += re.sub('[\n.]', '', completion.choices[0].message.content+ '\n')
summary += "\n"
return summary'''
def summarize_html(html_url, sentence_count=10000):
    """Gradio callback: fetch a web page and summarize it with sumy's LSA summarizer.

    Args:
        html_url: URL typed into the UI. Surrounding whitespace is ignored;
            ``None`` or a blank value produces an empty result instead of a
            failed fetch.
        sentence_count: Upper bound on the number of sentences returned.
            The default of 10000 is the value the code has always passed;
            in practice it is large enough that most pages come back nearly
            in full.

    Returns:
        The selected sentences joined by newlines, or an empty string when
        no URL was supplied.

    Raises:
        Any error raised while downloading or parsing the page propagates
        unchanged.
    """
    url = (html_url or "").strip()
    if not url:
        return ""

    print("myurl:", url)
    # NOTE(review): the URL is user-supplied and fetched from the server side;
    # if this app is exposed publicly, consider restricting allowed hosts.
    parser = HtmlParser.from_url(url, Tokenizer("japanese"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)
    print("summary:", summary)
    return "\n".join(str(sentence) for sentence in summary)
# Configure the Gradio Blocks UI: a PDF-upload section and a URL section,
# each with its own result text box.
# NOTE(review): indentation appears to have been stripped from this file; the
# nesting of the statements under the `with` blocks must be restored before
# it can run.
# The css argument hides the page footer and styles the element whose id is
# "custom_button" (the HTML summary button created further down).
with gr.Blocks(css="footer {visibility: hidden;} #custom_button {width: 400px; margin: 0 auto; background-color: #E0E7FF;}", theme=gr.themes.Soft(), title="ハイブリッド・サマリー・エージェント") as smry:
# Centered, bold page heading rendered as raw HTML.
gr.HTML('''<div style="display: flex; justify-content: center; align-items: center; font-size: 20px; font-weight: bold; font-family: 'Noto Sans JP', 'Yu Gothic', 'ヒラギノ角ゴシック', 'メイリオ', sans-serif;">サマリー・エージェント</div>''')
gr.Markdown("PDF SUMMARY")
with gr.Row():
pdf_input = gr.File(label="PDFファイルをアップロード")
with gr.Row():
pdf_output = gr.Textbox(label="要約結果", lines=20)
# Run summarize_pdf whenever the file input changes; its return value is
# shown in pdf_output.
pdf_input.change(fn=summarize_pdf, inputs=[pdf_input], outputs=[pdf_output])
gr.Markdown("HTML SUMMARY")
with gr.Row():
# The URL box is pre-filled with a default article address.
html_input = gr.Textbox(label="URLを入力",value="https://jp.reuters.com/world/us/VTXXEXTCYRIJFM73HAWI6ZBFKM-2025-05-09/")
with gr.Row():
html_output = gr.Textbox(label="要約結果", lines=20)
html_btn = gr.Button("HTML要約",elem_id="custom_button")
# URL summarization runs on button click; the change-triggered variant on the
# next line is commented out.
html_btn.click(summarize_html, inputs=[html_input], outputs=[html_output])
#html_input.change(fn=summarize_html, inputs=[html_input], outputs=[html_output])
# NOTE(review): launch() is called unconditionally at module level; there is
# no `if __name__ == "__main__":` guard, so importing this module starts the app.
smry.launch()