fudii0921 committed on
Commit
1f4fa3b
·
verified ·
1 Parent(s): 3b75bf8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -0
app.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ from pdfminer.converter import TextConverter
3
+ from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
4
+ from pdfminer.pdfpage import PDFPage
5
+ import gradio as gr
6
+ import re
7
+ import groq
8
+ import os
9
+ from dotenv import load_dotenv
10
+ from sumy.parsers.html import HtmlParser
11
+ from sumy.parsers.plaintext import PlaintextParser
12
+ from sumy.nlp.tokenizers import Tokenizer
13
+ from sumy.summarizers.lsa import LsaSummarizer
14
+
15
# Read a local .env file (if present) into the process environment, then
# create the shared Groq API client from the GROQ_API_KEY variable.
load_dotenv(verbose=True)
client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))
17
+
18
# Extract plain text from a PDF file.
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Each page is fed through a pdfminer ``PDFPageInterpreter`` whose
    ``TextConverter`` writes the decoded text into an in-memory buffer.

    Args:
        pdf_path: Filesystem path of the PDF file; it is opened in binary mode.

    Returns:
        The concatenated text of all pages. An empty string is returned when
        nothing could be extracted, so callers can always treat the result as
        a ``str`` (the previous version implicitly returned ``None`` here).

    Raises:
        OSError: If the file cannot be opened.
    """
    resource_manager = PDFResourceManager()
    text_buffer = io.StringIO()
    converter = TextConverter(resource_manager, text_buffer)
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(pdf_path, "rb") as pdf_file:
            pages = PDFPage.get_pages(
                pdf_file, caching=True, check_extractable=True
            )
            for page in pages:
                page_interpreter.process_page(page)
        # Read the buffer before it is closed in the ``finally`` clause.
        return text_buffer.getvalue()
    finally:
        # Release the converter and the buffer even when parsing fails.
        converter.close()
        text_buffer.close()
36
+
37
def split_text(pdf_text, chunk_size=5000, sentence_endings=(".", "。")):
    """Split text into consecutive chunks of at most *chunk_size* characters.

    Each chunk is cut after the last sentence-ending mark found inside its
    window, so chunks end on a sentence boundary whenever one exists. When a
    window contains no mark, the whole window is used. Joining the returned
    chunks reproduces the stripped input exactly.

    Args:
        pdf_text: Text to split. ``None`` is treated as an empty string.
        chunk_size: Maximum number of characters per chunk (default 5000).
        sentence_endings: Strings that terminate a sentence. Besides the ASCII
            period, the Japanese full stop is recognised by default because
            this application summarises Japanese documents.

    Returns:
        A list of non-empty string chunks; empty for empty or ``None`` input.

    Raises:
        ValueError: If *chunk_size* is smaller than 1 (the loop could never
            make progress).
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    remaining = (pdf_text or "").strip()
    chunks = []
    while remaining:
        window = remaining[:chunk_size]

        # Find the end position of the right-most sentence terminator.
        cut = 0
        for mark in sentence_endings:
            position = window.rfind(mark)
            if position != -1:
                cut = max(cut, position + len(mark))
        if cut:
            window = window[:cut]

        chunks.append(window)
        remaining = remaining[len(window):]

    return chunks
50
+
51
# Gradio event handlers.
def summarize_pdf(pdf_path, sentence_count=10000):
    """Summarise an uploaded PDF with sumy's LSA summariser.

    Args:
        pdf_path: The uploaded file as delivered by the ``gr.File`` input:
            either an object exposing the temporary file path as ``.name`` or
            a plain path (``str`` / ``os.PathLike``). ``None`` is accepted
            (presumably sent when the upload is cleared -- confirm against the
            installed Gradio version).
        sentence_count: Maximum number of sentences requested from the
            summariser. The default of 10000 keeps the value the code has
            always used; the old inline comment claimed "5 sentences", which
            never matched the code.

    Returns:
        The selected sentences joined by newlines, or an empty string when no
        file was supplied or no text could be extracted from it.
    """
    if pdf_path is None:
        return ""

    # Accept both file-like upload objects and plain paths.
    if isinstance(pdf_path, (str, os.PathLike)):
        source_path = pdf_path
    else:
        source_path = pdf_path.name

    pdf_text = extract_text_from_pdf(source_path)
    print("pdftexts:", pdf_text)
    if not pdf_text:
        # Nothing to summarise (e.g. no extractable text in the document).
        return ""

    # NOTE: a chunk-wise, LLM-based summary (split_text + the Groq client) was
    # previously sketched here as disabled code; it has been removed because
    # it never executed.
    parser = PlaintextParser.from_string(pdf_text, Tokenizer("japanese"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)

    return "\n".join(str(sentence) for sentence in summary)
93
+
94
+
95
def summarize_html(html_url, sentence_count=10000):
    """Summarise the web page at *html_url* with sumy's LSA summariser.

    Args:
        html_url: URL of the page to summarise, as typed into the UI textbox.
            Surrounding whitespace is ignored; ``None`` or a blank value
            yields an empty result instead of an error.
        sentence_count: Maximum number of sentences requested from the
            summariser. The default of 10000 keeps the value the code has
            always used; the old inline comment claimed "5 sentences", which
            never matched the code.

    Returns:
        The selected sentences joined by newlines, or an empty string when no
        URL was given.
    """
    print("myurl:", html_url)

    url = (html_url or "").strip()
    if not url:
        return ""

    # NOTE(review): the URL comes straight from the user and is fetched by
    # this server process; consider restricting the allowed schemes/hosts if
    # the app is exposed publicly.
    parser = HtmlParser.from_url(url, Tokenizer("japanese"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, sentence_count)
    print("summary:", summary)

    return "\n".join(str(sentence) for sentence in summary)
110
+
111
# Assemble the Gradio Blocks UI: a PDF-upload section followed by a URL
# section, each with its own result textbox, then start the app.
_PAGE_CSS = "footer {visibility: hidden;} #custom_button {width: 400px; margin: 0 auto; background-color: #E0E7FF;}"
_HEADER_HTML = '''<div style="display: flex; justify-content: center; align-items: center; font-size: 20px; font-weight: bold; font-family: 'Noto Sans JP', 'Yu Gothic', 'ヒラギノ角ゴシック', 'メイリオ', sans-serif;">サマリー・エージェント</div>'''

with gr.Blocks(
    css=_PAGE_CSS,
    theme=gr.themes.Soft(),
    title="ハイブリッド・サマリー・エージェント",
) as smry:
    gr.HTML(_HEADER_HTML)

    # --- PDF section: summarise as soon as the uploaded file changes. ---
    gr.Markdown("PDF SUMMARY")
    with gr.Row():
        pdf_input = gr.File(label="PDFファイルをアップロード")
    with gr.Row():
        pdf_output = gr.Textbox(label="要約結果", lines=20)

    pdf_input.change(
        fn=summarize_pdf,
        inputs=[pdf_input],
        outputs=[pdf_output],
    )

    # --- HTML section: summarise only when the button is clicked. ---
    gr.Markdown("HTML SUMMARY")
    with gr.Row():
        html_input = gr.Textbox(
            label="URLを入力",
            value="https://jp.reuters.com/world/us/VTXXEXTCYRIJFM73HAWI6ZBFKM-2025-05-09/",
        )
    with gr.Row():
        html_output = gr.Textbox(label="要約結果", lines=20)

    html_btn = gr.Button("HTML要約", elem_id="custom_button")
    html_btn.click(
        fn=summarize_html,
        inputs=[html_input],
        outputs=[html_output],
    )

smry.launch()