Spaces:
Sleeping
Sleeping
| # This project uses the BART model from Facebook AI Research (FAIR) available at https://huggingface.co/facebook/bart-large-cnn under the Apache License 2.0. | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import fitz # PyMuPDF | |
| import gradio as gr | |
| from transformers import pipeline | |
| import re | |
| # Load the model used for summarization | |
# Shared summarization pipeline, created once at import time and reused
# for every request.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page in a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        One string holding the text of each page followed by a newline.
        A document without pages yields an empty string.

    Raises:
        Whatever ``fitz.open`` or ``page.get_text`` raise for unreadable or
        malformed files; nothing is caught here.
    """
    # Open the document as a context manager so the underlying file handle
    # is released even when extracting a page fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with repeated `+=`.
        return "".join(page.get_text("text") + "\n" for page in doc)
def find_section(text, section_title):
    """Locate the first block of text whose heading line mentions a title.

    The search is case-insensitive and matches the first line that contains
    ``section_title`` anywhere in it. The returned block starts at the
    beginning of that line and runs up to the next blank line (``"\\n\\n"``)
    or, if there is none, to the end of ``text``.

    Args:
        text: Full document text to search.
        section_title: Literal title to look for; it is not treated as a
            regular expression.

    Returns:
        The stripped section text (heading line included), or ``None`` when
        no line contains the title.
    """
    # Escape the title so characters such as "(", "+" or "?" are matched
    # literally instead of being interpreted as regex syntax (which either
    # prevented a match or raised re.error).
    pattern = re.compile(
        r"^.*{}.*$".format(re.escape(section_title)),
        re.IGNORECASE | re.MULTILINE,
    )

    # Only the first occurrence is used, so there is no need to collect
    # every match.
    first_match = pattern.search(text)
    if first_match is None:
        return None

    start_idx = first_match.start()
    end_idx = text.find("\n\n", start_idx)
    if end_idx == -1:
        # No blank line after the heading: the section runs to the end.
        end_idx = len(text)
    return text[start_idx:end_idx].strip()
def summarize_section(text, section_title, max_length=150):
    """Summarize the section of ``text`` identified by ``section_title``.

    Args:
        text: Full document text.
        section_title: Title used by ``find_section`` to locate the section.
        max_length: Upper bound on the summary length passed to the
            summarization pipeline.

    Returns:
        The generated summary, or a human-readable message when the section
        is missing or when processing fails. This function never raises:
        its result is shown directly in the UI.
    """
    try:
        section_text = find_section(text, section_title)
        if not section_text:
            return f"Section '{section_title}' not found."

        summary = summarizer(
            section_text,
            max_length=max_length,
            min_length=30,
            do_sample=False,
            # Truncate sections longer than the model's input limit instead
            # of letting the pipeline fail on them.
            truncation=True,
        )
        return summary[0]['summary_text']
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the
        # request.
        return f"Error processing section '{section_title}': {str(e)}"
def process_pdf(file):
    """Summarize the abstract, research question and results of a PDF.

    Args:
        file: The upload handed over by the Gradio ``File`` input. It may be
            an object exposing the path as ``.name`` or a plain path string
            (NOTE(review): which one arrives depends on the installed Gradio
            version; confirm). ``None`` means nothing was uploaded.

    Returns:
        A list of three strings, in the order abstract, research question,
        results. When no file is given or text extraction fails, the same
        message is repeated three times so every output box is filled.
    """
    if file is None:
        # Nothing was uploaded: say so plainly instead of surfacing an
        # AttributeError message about NoneType.
        return ["No PDF file was uploaded."] * 3

    # Accept both upload objects (path in `.name`) and plain path strings.
    pdf_path = getattr(file, "name", file)

    try:
        text = extract_text_from_pdf(pdf_path)
    except Exception as e:
        return [f"Error extracting text from PDF: {str(e)}"] * 3

    # One summary per output box, in the order the interface declares them.
    section_titles = ("abstract", "research question", "results")
    return [summarize_section(text, title) for title in section_titles]
| # Gradio interface setup | |
# Web UI: a single PDF upload feeds process_pdf, whose three results are
# shown in three labelled text boxes (same order as the returned list).
interface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=[
        gr.Textbox(label=caption)
        for caption in (
            "Abstract Summary",
            "Research Question Summary",
            "Results Summary",
        )
    ],
)

# Start serving the interface.
interface.launch()