import io from pdfminer.converter import TextConverter from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage import gradio as gr import re import groq import os from dotenv import load_dotenv from sumy.parsers.html import HtmlParser from sumy.parsers.plaintext import PlaintextParser from sumy.nlp.tokenizers import Tokenizer from sumy.summarizers.lsa import LsaSummarizer load_dotenv(verbose=True) client = groq.Client(api_key=os.environ.get("GROQ_API_KEY")) # PDFからテキストを抽出する関数 def extract_text_from_pdf(pdf_path): resource_manager = PDFResourceManager() fake_file_handle = io.StringIO() converter = TextConverter(resource_manager, fake_file_handle) page_interpreter = PDFPageInterpreter(resource_manager, converter) with open(pdf_path, 'rb') as fh: for page in PDFPage.get_pages(fh, caching=True, check_extractable=True): page_interpreter.process_page(page) text = fake_file_handle.getvalue() converter.close() fake_file_handle.close() if text: return text def split_text(pdf_text): full_text = pdf_text.strip() chunks = [] while len(full_text) > 0: first_part = full_text[:5000] last_period_index = first_part.rfind(".") if last_period_index != -1: first_part = first_part[:last_period_index+1] chunks.append(first_part) full_text = full_text[len(first_part):] return chunks # Gradioのインターフェース def summarize_pdf(pdf_path): pdf_text = extract_text_from_pdf(pdf_path.name) print("pdftexts:",pdf_text) chunks = split_text(pdf_text) '''summaries = [] for chunk in chunks: summary = f"日本語で要約してもらいます。\n{chunk}\n要約したものは\n・を必ず先頭につけて箇条書きにしてください。\n" summaries.append(summary) return "\n\n".join(summaries)''' parser = PlaintextParser.from_string(pdf_text, Tokenizer("japanese")) #parser = PlaintextParser.from_file(pdf_path, Tokenizer("japanese")) #parser = HtmlParser.from_url(url, Tokenizer("japanese")) summarizer = LsaSummarizer() summary = summarizer(parser.document, 10000) # 5文で要約 return "\n".join(str(sentence) for sentence in summary) ''' # 分割されたテキストデータを要約して出力する summary = "" for i in range(len(chunks)): tmp = chunks[i] print("tmp:",tmp) completion = client.chat.completions.create( model="llama3-70b-8192", temperature=0, messages=[ {"role": "system", "content": "貴方は優秀なアナリストです。"}, {"role": "user", "content": tmp+"を日本語に要約してください。要約したものは\n・を必ず先頭につけて箇条書きにしてください。 必ず、日本語で答えてください。"} ], ) print("trans:",completion.choices[0].message.content) # 要約結果をsummaryに追加する summary += re.sub('[\n.]', '', completion.choices[0].message.content+ '\n') summary += "\n" return summary''' def summarize_html(html_url): #html_text = extract_text_from_pdf(pdf_path.name) #print("pdftexts:",pdf_text) #chunks = split_text(pdf_text) #parser = PlaintextParser.from_string(pdf_text, Tokenizer("japanese")) #parser = PlaintextParser.from_file(pdf_path, Tokenizer("japanese")) print("myurl:",html_url) parser = HtmlParser.from_url(html_url, Tokenizer("japanese")) summarizer = LsaSummarizer() summary = summarizer(parser.document, 10000) # 5文で要約 print("summary:",summary) return "\n".join(str(sentence) for sentence in summary) # GradioのBlockを設定 with gr.Blocks(css="footer {visibility: hidden;} #custom_button {width: 400px; margin: 0 auto; background-color: #E0E7FF;}", theme=gr.themes.Soft(), title="ハイブリッド・サマリー・エージェント") as smry: gr.HTML('''