| Hugging Face's logo |
| Hugging Face |
| Search models, datasets, users... |
| Models |
| Datasets |
| Spaces |
| Posts |
| Docs |
| Solutions |
| Pricing |
| |
| |
| |
| Spaces: |
| |
| andreeabodea |
| / |
| Extract_Project_Report_Section_1 |
| |
| |
| like |
| 0 |
| |
| Logs |
| App |
| Files |
| Community |
| Settings |
| Extract_Project_Report_Section_1 |
| / |
| app.py |
| |
| andreeabodea's picture |
| andreeabodea |
| Update app.py |
| 536f374 |
| VERIFIED |
| about 2 hours ago |
| raw |
| history |
| blame |
| edit |
| delete |
| No virus |
| 5.51 kB |
| import os |
| import pdfplumber |
| import re |
| import gradio as gr |
| from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer |
| from io import BytesIO |
| import torch |
|
|
| """ |
| Extract the text from a section of a PDF file between 'wanted_section' and 'next_section'. |
| Parameters: |
| - path (str): The file path to the PDF file. |
| - wanted_section (str): The section to start extracting text from. |
| - next_section (str): The section to stop extracting text at. |
| Returns: |
| - text (str): The extracted text from the specified section range. |
| """ |
|
|
|
|
def get_section(path, wanted_section, next_section):
    """Extract the text of the pages spanning a PDF section.

    Parameters:
    - path (bytes): The raw binary contents of the PDF file
      (as supplied by ``gr.File(type="binary")``).
    - wanted_section (str): Heading that starts the section.
    - next_section (str): Heading that ends the section.

    Returns:
    - str: Concatenated page text from the last page containing
      ``wanted_section`` through the last page containing
      ``next_section``, with newlines collapsed to spaces.

    Raises:
    - ValueError: If either heading cannot be found in the document
      (bug fix: the original crashed with an opaque ``max([])`` error).
    """
    start_pages = []
    end_pages = []
    # Bug fix: use a context manager so the underlying PDF stream is
    # always closed (the original never closed the document).
    with pdfplumber.open(BytesIO(path)) as doc:
        for page_num, page in enumerate(doc.pages):
            # `search` returns a list of hits; a non-empty list means the
            # heading appears somewhere on this page.
            if page.search(wanted_section, return_chars=False, case=False):
                start_pages.append(page_num)
            if page.search(next_section, return_chars=False, case=False):
                end_pages.append(page_num)

        if not start_pages or not end_pages:
            raise ValueError(
                f"Could not locate both sections: {wanted_section!r} and "
                f"{next_section!r}"
            )

        # Take the LAST occurrence of each heading: tables of contents at
        # the front of the report often produce an earlier, spurious hit.
        page_texts = [
            # Bug fix: extract_text() may return None on an empty page,
            # which would break str.join.
            doc.pages[n].extract_text() or ""
            for n in range(max(start_pages), max(end_pages) + 1)
        ]

    return " ".join(page_texts).replace("\n", " ")
|
|
|
|
def extract_between(big_string, start_string, end_string):
    """Return the text strictly between the first *start_string* and the
    following *end_string* inside *big_string*.

    Both delimiters are treated literally (regex-escaped), and the match
    may span newlines. Returns None when the delimiters do not occur in
    order.
    """
    delimited = re.escape(start_string) + "(.*?)" + re.escape(end_string)
    found = re.search(delimited, big_string, re.DOTALL)
    return found.group(1) if found else None
|
|
def format_section1(section1_text):
    """Slice the German report's section-1 text into labelled fields.

    Each field is the substring between a fixed pair of German heading
    markers; a field is None when its markers are absent.
    NOTE(review): TOPIC and PROGRAM deliberately reuse the same marker
    pair in the original — preserved as-is.
    """
    # (output key, start marker, end marker) for every field we pull out.
    field_markers = [
        ('TOPIC', "Sektor", "EZ-Programm"),
        ('PROGRAM', "Sektor", "EZ-Programm"),
        ('PROJECT DESCRIPTION', "EZ-Programmziel", "Datum der letzten BE"),
        ('PROJECT NAME', "Modul", "Modulziel"),
        ('OBJECTIVE', "Modulziel", "Berichtszeitraum"),
        ('PROGRESS', "Zielerreichung des Moduls", "Massnahme im Zeitplan"),
        ('STATUS', "Massnahme im Zeitplan", "Risikoeinschätzung"),
        ('RECOMMENDATIONS', "Vorschläge zur Modulanpas-", "Voraussichtliche"),
    ]
    return {
        key: extract_between(section1_text, start, end)
        for key, start, end in field_markers
    }
|
|
def answer_questions(text, language="de"):
    """Run a German extractive question-answering model over *text*.

    Parameters:
    - text (str): Context passage the model answers from.
    - language (str): Unused placeholder kept for interface
      compatibility (the model is German-only).

    Returns:
    - dict: Maps each fixed German question to the model's answer span.
    """
    checkpoint = "deepset/gelectra-large-germanquad"
    model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    # Fixed question set about the project module (title, sector, country,
    # parent programme).
    questions = [
        "Welches ist das Titel des Moduls?",
        "Welches ist das Sektor oder das Kernthema?",
        "Welches ist das Land?",
        "Zu welchem Program oder EZ-Programm gehort das Projekt?"
    ]

    return {
        question: qa_pipeline(question=question, context=text)['answer']
        for question in questions
    }
|
|
|
|
def process_pdf(path):
    """End-to-end pipeline: pull section 1 from the PDF and answer the
    fixed question set against it.

    Parameters:
    - path (bytes): Raw PDF contents from the Gradio file upload.

    Returns:
    - dict: Question -> answer mapping produced by answer_questions.
    """
    section_key = "1. Kurzbeschreibung"
    results_dict = {
        section_key: get_section(path, section_key, "2. Einordnung des Moduls")
    }
    return answer_questions(results_dict[section_key])
|
|
def get_first_page_text(file_data):
    """Return the extracted text of the PDF's first page, or None when
    the document has no pages.

    Parameters:
    - file_data (bytes): Raw binary contents of the PDF file.
    """
    pdf = pdfplumber.open(BytesIO(file_data))
    if not pdf.pages:
        return None
    return pdf.pages[0].extract_text()
|
if __name__ == "__main__":
    # Wire the PDF question-answering pipeline into a minimal Gradio UI:
    # one file upload in, one text box out.
    demo = gr.Interface(
        fn=process_pdf,
        inputs=gr.File(type="binary", label="Upload PDF"),
        outputs=gr.Textbox(label="Extracted Text"),
        title="PDF Text Extractor",
        description="Upload a PDF file to extract.",
    )
    demo.launch()