Upload code that extracts the text of each section from a project's PDF monitoring report

Browse files

Files changed (1) hide show

extraction_project_report.py +340 -0

extraction_project_report.py ADDED Viewed

	@@ -0,0 +1,340 @@

+import os
+import pandas as pd
+import pdfplumber
+import re
+import fitz  # PyMuPDF
+import json
+files = [f for f in os.listdir("/Users/andreeabodea/") if f.endswith(".pdf")]
+print(files)
+"""
+Extract the text from a section of a PDF file  between 'wanted_section' and 'next_section'.
+Parameters:
+- path (str): The file path to the PDF file.
+- wanted_section (str): The section to start extracting text from.
+- next_section (str): The section to stop extracting text at.
+Returns:
+- text (str): The extracted text from the specified section range.
+"""
+def get_section(path, wanted_section, next_section):
+    print(wanted_section)
+    # Open the PDF file
+    doc = pdfplumber.open(path)
+    start_page = []
+    end_page = []
+    # Find the all the pages for the specified sections
+    for page in range(len(doc.pages)):
+        if len(doc.pages[page].search(wanted_section, return_chars = False, case = False)) > 0:
+            start_page.append(page)
+        if len(doc.pages[page].search(next_section, return_chars = False, case = False)) > 0:
+            end_page.append(page)
+    print(max(start_page))
+    print(max(end_page))
+    # Extract the text between the start and end page of the wanted section
+    text = []
+    for page_num in range(max(start_page), max(end_page)):
+        page = doc.pages[page_num]
+        text.append(page.extract_text())
+    text = " ".join(text)
+    new_text = text.replace("\n", " ")
+    special_char_unicode_list = ["\u00e4", "\u00f6", "\u00fc", "\u00df"]
+    special_char_replacement_list = ["ae", "oe", "ue", "ss"]
+    for index, special_char in enumerate(special_char_unicode_list):
+        final_text = new_text.replace(special_char, special_char_replacement_list[index])
+    return final_text
+for file in files:
+    print("for each pdf file...")
+    path = "/Users/andreeabodea/" + file
+    pdf = pdfplumber.open(path)
+    print(path)
+    results_dict = {}
+    results_dict["2.1 Aktualisierte Einordnung des Moduls in das EZ-Programm"] = \
+        get_section(path, "2.1 Aktualisierte Einordnung des Moduls in das EZ-Programm", "2.2 Andere Entwicklungsmaßnahmen im konkreten Interventionsbereich des Moduls")
+    results_dict["2.1 Aktualisierte Einordnung des Moduls in das EZ-Programm"] = \
+        get_section(path,"2.1 Aktualisierte Einordnung des Moduls in das EZ-Programm", "2.2 Andere Entwicklungsmaßnahmen im konkreten Interventionsbereich des Moduls")
+    results_dict["2.2 Andere Entwicklungsmaßnahmen im konkreten Interventionsbereich des Moduls"] = \
+        get_section(path, "2.2 Andere Entwicklungsmaßnahmen im konkreten Interventionsbereich des Moduls", "3. Entwicklungen im Interventionsbereich")
+    results_dict["3. Entwicklungen im Interventionsbereich"] = \
+        get_section(path, "3. Entwicklungen im Interventionsbereich", "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren")
+    results_dict["4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren"] = \
+        get_section(path, "4.1 Bewertungen von Zielen, Zielgruppen, Wirkungshypothesen und Indikatoren", "4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums")
+    results_dict["4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums"] = \
+        get_section(path, "4.2 Umgesetzte Maßnahmen / Aktivitäten während des Berichtszeitraums", "4.3 Umsetzung von Maßnahmen zur Sicherstellung der nachhaltigen Wirksamkeit")
+    results_dict["4.3 Umsetzung von Maßnahmen zur Sicherstellung der nachhaltigen Wirksamkeit des Vorhabens"] = \
+        get_section(path, "4.3 Umsetzung von Maßnahmen zur Sicherstellung der nachhaltigen Wirksamkeit", "4.4 Laufzeit und Zeitplan")
+    results_dict["4.4 Laufzeit und Zeitplan"] = \
+        get_section(path, "4.4 Laufzeit und Zeitplan", "4.5 Entstandene Kosten und Kostenverschiebungen")
+    results_dict["4.5 Entstandene Kosten und Kostenverschiebungen"] = \
+        get_section(path, "4.5 Entstandene Kosten und Kostenverschiebungen", "4.6 Bewertung der Wirkungen und Risiken")
+    results_dict["4.6 Bewertung der Wirkungen und Risiken"] = \
+        get_section(path, "4.6 Bewertung der Wirkungen und Risiken", "5. Übergeordnete Empfehlungen")
+    results_dict["5.1 Empfehlungen und Merkposten für den Politik- und Schwerpunktdialog"] = \
+        get_section(path, "5.1 Empfehlungen und Merkposten für den Politik- und Schwerpunktdialog", "5.2 Lernerfahrungen, die für die Länderstrategie und zukünftige EZ-Programme")
+    results_dict["5.2 Lernerfahrungen, die für die Länderstrategie und zukünftige EZ-Programme interessant sein könnten"] = \
+        get_section(path, "5.2 Lernerfahrungen", "6. Testat")
+    results_dict["6. Testat (TZ)"] = \
+        get_section(path, "6. Testat", "Anlage 1: Wirkungsmatrix des Moduls")
+    print(results_dict)
+    json_string = json.dumps(results_dict, indent=4)
+    print(json_string)
+"""
+def extract_section_text(pdf_path, start_section, end_section=None):
+    Extract text from a specific section of a PDF.
+    :param pdf_path: Path to the PDF file.
+    :param start_section: The title of the section to start extracting text.
+    :param end_section: The title of the section to stop extracting text (optional).
+    :return: Extracted text from the specified section.
+    text = ""
+    section_started = False
+    with fitz.open(pdf_path) as doc:  # Open the PDF
+        for page in doc:  # Iterate through each page
+            page_text = page.get_text("text")  # Extract text from the current page
+            if start_section in page_text and not section_started:
+                # Start section found
+                section_started = True
+                text += page_text
+            elif section_started:
+                if end_section and end_section in page_text:
+                    # End section found, stop reading further
+                    break
+                else:
+                    # Continue adding text from the section
+                    text += page_text
+    # Optional: refine text extraction, if necessary
+    if section_started:
+        # If the start section is in the middle of the page, trim the text before it
+        start_index = text.find(start_section)
+        text = text[start_index:]
+        if end_section:
+            # If an end section is specified, trim the text after it
+            end_index = text.find(end_section)
+            if end_index != -1:
+                text = text[:end_index]
+    return text
+# create function to read pdf and extract appendix 1 with results matrix
+def get_appendix(pdf):
+    #for each page, check whether it contains Anlage 1 and Anlage 2 to get relevant pages
+    start_page = []
+    end_page = []
+    for page in range(len(pdf.pages)):
+        if len(pdf.pages[page].search("Anlage 1: Wirkungsmatrix", return_chars=False, case = False)) > 0: # FOR PROJECTS
+        # if len(pdf.pages[page].search("A1 - Wirkungsmatrix", return_chars=False, case=False)) > 0: # FOR PROGRAMS
+            start_page.append(page)
+        if len(pdf.pages[page].search("Anlage 2: Wirkungslogik", return_chars=False, case = False)) > 0: # FOR PROJECTS
+        # if len(pdf.pages[page].search("A2 - Daten", return_chars=False, case = False)) > 0: # FOR PROGRAMS
+            end_page.append(page)
+    # return results
+    return start_page, end_page
+# create function to parse table from results_matrix and transform to dataframe
+def extract_tables_from_pdf(start_page, end_page):
+    # for each page in appendix
+    for page in range(max(start_page), max(end_page)):
+        try:
+            # extract table(s)
+            table = pdf.pages[page].extract_tables()[0]
+        except IndexError:
+            break
+        print(table)
+        # for each row of the table...
+        for row_num in range(len(table)):
+            row = table[row_num]
+            # ...remove the line breakers from the wrapped texts
+            cleaned_row = [item.replace("-\n", "") if item is not None and "-\n" in item
+                           else "None" if item is None
+                           else item for item in row]
+            cleaned_row = [item.replace("\n", " ") if item is not None and "\n" in item
+                           else "None" if item is None
+                           else item for item in cleaned_row]
+            # append row to results_matrix_list
+            results_matrix_list.append(cleaned_row)
+    return results_matrix_list
+# define function to extract programm-infos
+def extract_programm(table_rows_list, file_name):
+    # define empty lists to save results
+    programmziel = []
+    pz_indikator = []
+    basiswert = []
+    zielwert = []
+    istwert = []
+    # for each row in results matrix (list), extract elements
+    for row in table_rows_list:
+        for i in row:
+            if "Programmziel " in i:
+                programmziel.append(i)
+            else:
+                pass
+            if "Programmzielindikator" in i:
+                pz_indikator.append(i)
+            else:
+                pass
+    # extract values from impact indicators
+    for indikator in pz_indikator:
+        if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
+            index1 = indikator.index("Basiswert:")
+            index2 = indikator.index("Zielwert:")
+            basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
+        elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
+            basiswert.append(indikator.split("Basiswert:")[1])
+        else:
+            basiswert.append("")
+        if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
+            index1 = indikator.index("Zielwert:")
+            index2 = indikator.index("Istwert:")
+            zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
+        elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
+            zielwert.append(indikator.split("Zielwert:")[1])
+        else:
+            zielwert.append("")
+        if "Istwert:" in indikator:
+            istwert.append(indikator.split("Istwert:")[1])
+        else:
+            istwert.append("")
+    # create dataframes for each tier (programm, modul, output)
+    programm = p
+    # extract values from outcome indicators
+    for indikator in mz_indikator:
+        if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
+            index1 = indikator.index("Basiswert:")
+            index2 = indikator.index("Zielwert:")
+            basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
+        elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
+            basiswert.append(indikator.split("Basiswert:")[1])
+        else:
+            basiswert.append("")
+        if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
+            index1 = indikator.index("Zielwert:")
+            index2 = indikator.index("Istwert:")
+            zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
+        elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
+            zielwert.append(indikator.split("Zielwert:")[1])
+        else:
+            zielwert.append("")
+        if "Istwert:" in indikator:
+            istwert.append(indikator.split("Istwert:")[1])
+        else:
+            istwert.append("")
+    # create dataframes for each tier (programm, modul, output)
+    outcome = pd.DataFrame.from_dict({"ziel":modulziel, "indikator":mz_indikator,"basiswert": basiswert,
+                                     "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(mz_indikator)},
+                                      orient="index")
+    outcome = outcome.transpose()
+    return outcome
+# define function for outputs
+def extract_outputs(table_rows_list,file_name):
+    # define empty lists to save results
+    output = []
+    output_indikator = []
+    basiswert = []
+    zielwert = []
+    istwert = []
+    # for each row in results matrix (list), extract elements
+    for row in table_rows_list:
+        for i in row:
+            if "Output " in i:
+                output.append(i)
+            else:
+                pass
+            if "Outputindikator" in i:
+                output_indikator.append(i)
+            else:
+                pass
+    # extract values from output indicators
+    for indikator in output_indikator:
+        if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
+            index1 = indikator.index("Basiswert:")
+            index2 = indikator.index("Zielwert:")
+            basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
+        elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
+            basiswert.append(indikator.split("Basiswert:")[1])
+        else:
+            basiswert.append("")
+        if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
+            index1 = indikator.index("Zielwert:")
+            index2 = indikator.index("Istwert:")
+            zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
+        elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
+            zielwert.append(indikator.split("Zielwert:")[1])
+        else:
+            zielwert.append("")
+        if "Istwert:" in indikator:
+            istwert.append(indikator.split("Istwert:")[1])
+        else:
+            istwert.append("")
+    # create dataframes for each tier (programm, modul, output)
+    output = pd.DataFrame.from_dict({"output":output, "indikator":output_indikator, "basiswert": basiswert,
+                                     "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(output_indikator)},
+                                      orient = "index")
+    output = output.transpose()
+    return output
+# apply functions to files
+#Define global dataframes to store results from all files
+programme = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
+outcomes = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
+outputs = pd.DataFrame(columns = ["output", "indikator", "basiswert", "zielwert", "istwert", "datei"])
+    print("...and extract table and store as list")
+    results_matrix_list = extract_tables_from_pdf(start_page, end_page)
+    print("...extract programm information")
+    programm = extract_programm(results_matrix_list, file)
+    print("...extract modul information")
+    outcome = extract_modul(results_matrix_list, file)
+    print("...extract outputs")
+    output = extract_outputs(results_matrix_list, file)
+    print("...add results from extract functions to global dataframe")
+    programme = pd.concat([programme, programm], ignore_index=True)
+    outcomes = pd.concat([outcomes, outcome], ignore_index=True)
+    outputs = pd.concat([outputs, output], ignore_index=True)
+# write results to csv file
+programme.to_csv("/Users/andreeabodea/programme.csv", sep="|", index=False, decimal=",")
+outcomes.to_csv("/Users/andreeabodea/module_outcomes.csv", sep="|", index=False, decimal=",")
+outputs.to_csv("/Users/andreeabodea/module_outputs.csv", sep="|", index=False, decimal=",")
+print(programme)
+print(outcomes)
+print(outputs)
+        """