Spaces:
Sleeping
Sleeping
| import io | |
| from pdfminer.converter import TextConverter | |
| from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager | |
| from pdfminer.pdfpage import PDFPage | |
| import gradio as gr | |
| import re | |
| import groq | |
| import os | |
| from dotenv import load_dotenv | |
| from sumy.parsers.html import HtmlParser | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.lsa import LsaSummarizer | |
# Load variables from a local .env file (if present) into the environment.
load_dotenv(verbose=True)

# Build the Groq client only when an API key is configured.
# NOTE(review): the Groq SDK is documented to raise at construction time when
# no API key is available; creating the client unconditionally would then
# abort start-up of the whole app. Confirm against the installed SDK version.
_groq_api_key = os.environ.get("GROQ_API_KEY")
client = groq.Client(api_key=_groq_api_key) if _groq_api_key else None
# Extract text from a PDF.
def extract_text_from_pdf(pdf_path):
    """Read a PDF file with pdfminer and return its text.

    Args:
        pdf_path: Filesystem path of the PDF; it is opened in binary mode.

    Returns:
        The text collected from all processed pages, or ``None`` when the
        collected text is empty.

    Raises:
        OSError: If ``pdf_path`` cannot be opened.
    """
    resource_manager = PDFResourceManager()
    # In-memory sink the converter writes page text into. The ``with`` block
    # and the ``finally`` below release both objects even when opening or
    # parsing the PDF raises.
    with io.StringIO() as text_buffer:
        converter = TextConverter(resource_manager, text_buffer)
        try:
            page_interpreter = PDFPageInterpreter(resource_manager, converter)
            with open(pdf_path, "rb") as fh:
                pages = PDFPage.get_pages(
                    fh, caching=True, check_extractable=True
                )
                for page in pages:
                    page_interpreter.process_page(page)
            # Read the buffer before the converter and buffer are closed.
            text = text_buffer.getvalue()
        finally:
            converter.close()
    # Callers receive None (not "") when nothing was extracted.
    return text if text else None
def split_text(pdf_text, chunk_size=5000):
    """Split text into consecutive chunks of at most ``chunk_size`` characters.

    The text is stripped of leading/trailing whitespace first. Each chunk is
    cut just after the last ASCII period (``"."``) found inside its window;
    when the window holds no period the full window is used. Only ``"."`` is
    recognised as a cut point. Concatenating the chunks reproduces the
    stripped text exactly.

    Args:
        pdf_text: Text to split. ``None`` or an empty string yields ``[]``.
        chunk_size: Maximum number of characters per chunk (default 5000).

    Returns:
        A list of non-empty string chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the scan could never
            advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not pdf_text:
        return []

    full_text = pdf_text.strip()
    total = len(full_text)
    chunks = []
    start = 0
    # Walk the text by index instead of re-slicing the remainder each round,
    # which would copy the whole tail on every iteration.
    while start < total:
        end = min(start + chunk_size, total)
        last_period_index = full_text.rfind(".", start, end)
        if last_period_index != -1:
            # Keep the period with the chunk it terminates.
            end = last_period_index + 1
        chunks.append(full_text[start:end])
        start = end
    return chunks
# Gradio callback for the PDF section.
def summarize_pdf(pdf_path, sentence_count=10000):
    """Summarize an uploaded PDF with sumy's LSA summarizer.

    Args:
        pdf_path: Value delivered by the ``gr.File`` input. Either an object
            exposing the file path as ``.name``, a plain path string, or
            ``None``.
        sentence_count: Sentence budget passed to the summarizer. The default
            keeps the value that was previously hard-coded.

    Returns:
        The selected sentences joined by newlines, or ``""`` when there is no
        file or no text could be extracted from it.
    """
    # NOTE(review): Gradio is expected to fire ``change`` with None when the
    # upload is cleared — confirm for the deployed Gradio version.
    if pdf_path is None:
        return ""

    # Accept both a file-like wrapper (path in ``.name``) and a bare path.
    file_path = getattr(pdf_path, "name", pdf_path)

    pdf_text = extract_text_from_pdf(file_path)
    if not pdf_text:
        # Nothing to tokenize; avoid handing None to the parser.
        return ""

    parser = PlaintextParser.from_string(pdf_text, Tokenizer("japanese"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return "\n".join(str(sentence) for sentence in summary)
def summarize_html(html_url, sentence_count=10000):
    """Fetch a web page and summarize it with sumy's LSA summarizer.

    Args:
        html_url: URL typed into the Gradio textbox. Surrounding whitespace is
            ignored; ``None`` or a blank value short-circuits to ``""``.
        sentence_count: Sentence budget passed to the summarizer. The default
            keeps the value that was previously hard-coded.

    Returns:
        The selected sentences joined by newlines, or ``""`` when no URL was
        supplied.
    """
    url = (html_url or "").strip()
    if not url:
        # Do not attempt a network fetch for an empty textbox.
        return ""

    parser = HtmlParser.from_url(url, Tokenizer("japanese"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)
    return "\n".join(str(sentence) for sentence in summary)
# Assemble the Gradio Blocks UI and start the app.
# NOTE(review): the original indentation was lost; the nesting below (each
# input/output in its own Row, event wiring and the button directly inside the
# Blocks context) is reconstructed from the statement order — confirm.
_PAGE_CSS = "footer {visibility: hidden;} #custom_button {width: 400px; margin: 0 auto; background-color: #E0E7FF;}"
_HEADER_HTML = '''<div style="display: flex; justify-content: center; align-items: center; font-size: 20px; font-weight: bold; font-family: 'Noto Sans JP', 'Yu Gothic', 'ヒラギノ角ゴシック', 'メイリオ', sans-serif;">サマリー・エージェント</div>'''
_DEFAULT_URL = "https://jp.reuters.com/world/us/VTXXEXTCYRIJFM73HAWI6ZBFKM-2025-05-09/"

with gr.Blocks(
    css=_PAGE_CSS,
    theme=gr.themes.Soft(),
    title="ハイブリッド・サマリー・エージェント",
) as smry:
    gr.HTML(_HEADER_HTML)

    # PDF section: uploading (or changing) a file triggers summarization.
    gr.Markdown("PDF SUMMARY")
    with gr.Row():
        pdf_input = gr.File(label="PDFファイルをアップロード")
    with gr.Row():
        pdf_output = gr.Textbox(label="要約結果", lines=20)
    pdf_input.change(
        fn=summarize_pdf,
        inputs=[pdf_input],
        outputs=[pdf_output],
    )

    # URL section: summarization runs only on an explicit button click.
    gr.Markdown("HTML SUMMARY")
    with gr.Row():
        html_input = gr.Textbox(label="URLを入力", value=_DEFAULT_URL)
    with gr.Row():
        html_output = gr.Textbox(label="要約結果", lines=20)
    html_btn = gr.Button("HTML要約", elem_id="custom_button")
    html_btn.click(
        summarize_html,
        inputs=[html_input],
        outputs=[html_output],
    )

smry.launch()