andreeabodea
/

extraction_report_project

Model card Files and versions

xet

Community

andreeabodea committed on Mar 27, 2024

Commit

b7c096f

verified ·

1 Parent(s): bc1bd61

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -253

app.py CHANGED Viewed

@@ -85,256 +85,4 @@ for file in files:
     print(results_dict)
     json_string = json.dumps(results_dict, indent=4)
-    print(json_string)
-"""
-def extract_section_text(pdf_path, start_section, end_section=None):
-    Extract text from a specific section of a PDF.
-    :param pdf_path: Path to the PDF file.
-    :param start_section: The title of the section to start extracting text.
-    :param end_section: The title of the section to stop extracting text (optional).
-    :return: Extracted text from the specified section.
-    text = ""
-    section_started = False
-    with fitz.open(pdf_path) as doc:  # Open the PDF
-        for page in doc:  # Iterate through each page
-            page_text = page.get_text("text")  # Extract text from the current page
-            if start_section in page_text and not section_started:
-                # Start section found
-                section_started = True
-                text += page_text
-            elif section_started:
-                if end_section and end_section in page_text:
-                    # End section found, stop reading further
-                    break
-                else:
-                    # Continue adding text from the section
-                    text += page_text
-    # Optional: refine text extraction, if necessary
-    if section_started:
-        # If the start section is in the middle of the page, trim the text before it
-        start_index = text.find(start_section)
-        text = text[start_index:]
-        if end_section:
-            # If an end section is specified, trim the text after it
-            end_index = text.find(end_section)
-            if end_index != -1:
-                text = text[:end_index]
-    return text
-# create function to read pdf and extract appendix 1 with results matrix
-def get_appendix(pdf):
-    #for each page, check whether it contains Anlage 1 and Anlage 2 to get relevant pages
-    start_page = []
-    end_page = []
-    for page in range(len(pdf.pages)):
-        if len(pdf.pages[page].search("Anlage 1: Wirkungsmatrix", return_chars=False, case = False)) > 0: # FOR PROJECTS
-        # if len(pdf.pages[page].search("A1 - Wirkungsmatrix", return_chars=False, case=False)) > 0: # FOR PROGRAMS
-            start_page.append(page)
-        if len(pdf.pages[page].search("Anlage 2: Wirkungslogik", return_chars=False, case = False)) > 0: # FOR PROJECTS
-        # if len(pdf.pages[page].search("A2 - Daten", return_chars=False, case = False)) > 0: # FOR PROGRAMS
-            end_page.append(page)
-    # return results
-    return start_page, end_page
-# create function to parse table from results_matrix and transform to dataframe
-def extract_tables_from_pdf(start_page, end_page):
-    # for each page in appendix
-    for page in range(max(start_page), max(end_page)):
-        try:
-            # extract table(s)
-            table = pdf.pages[page].extract_tables()[0]
-        except IndexError:
-            break
-        print(table)
-        # for each row of the table...
-        for row_num in range(len(table)):
-            row = table[row_num]
-            # ...remove the line breakers from the wrapped texts
-            cleaned_row = [item.replace("-\n", "") if item is not None and "-\n" in item
-                           else "None" if item is None
-                           else item for item in row]
-            cleaned_row = [item.replace("\n", " ") if item is not None and "\n" in item
-                           else "None" if item is None
-                           else item for item in cleaned_row]
-            # append row to results_matrix_list
-            results_matrix_list.append(cleaned_row)
-    return results_matrix_list
-# define function to extract programm-infos
-def extract_programm(table_rows_list, file_name):
-    # define empty lists to save results
-    programmziel = []
-    pz_indikator = []
-    basiswert = []
-    zielwert = []
-    istwert = []
-    # for each row in results matrix (list), extract elements
-    for row in table_rows_list:
-        for i in row:
-            if "Programmziel " in i:
-                programmziel.append(i)
-            else:
-                pass
-            if "Programmzielindikator" in i:
-                pz_indikator.append(i)
-            else:
-                pass
-    # extract values from impact indicators
-    for indikator in pz_indikator:
-        if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
-            index1 = indikator.index("Basiswert:")
-            index2 = indikator.index("Zielwert:")
-            basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
-        elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
-            basiswert.append(indikator.split("Basiswert:")[1])
-        else:
-            basiswert.append("")
-        if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
-            index1 = indikator.index("Zielwert:")
-            index2 = indikator.index("Istwert:")
-            zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
-        elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
-            zielwert.append(indikator.split("Zielwert:")[1])
-        else:
-            zielwert.append("")
-        if "Istwert:" in indikator:
-            istwert.append(indikator.split("Istwert:")[1])
-        else:
-            istwert.append("")
-    # create dataframes for each tier (programm, modul, output)
-    programm = p
-    # extract values from outcome indicators
-    for indikator in mz_indikator:
-        if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
-            index1 = indikator.index("Basiswert:")
-            index2 = indikator.index("Zielwert:")
-            basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
-        elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
-            basiswert.append(indikator.split("Basiswert:")[1])
-        else:
-            basiswert.append("")
-        if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
-            index1 = indikator.index("Zielwert:")
-            index2 = indikator.index("Istwert:")
-            zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
-        elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
-            zielwert.append(indikator.split("Zielwert:")[1])
-        else:
-            zielwert.append("")
-        if "Istwert:" in indikator:
-            istwert.append(indikator.split("Istwert:")[1])
-        else:
-            istwert.append("")
-    # create dataframes for each tier (programm, modul, output)
-    outcome = pd.DataFrame.from_dict({"ziel":modulziel, "indikator":mz_indikator,"basiswert": basiswert,
-                                     "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(mz_indikator)},
-                                      orient="index")
-    outcome = outcome.transpose()
-    return outcome
-# define function for outputs
-def extract_outputs(table_rows_list,file_name):
-    # define empty lists to save results
-    output = []
-    output_indikator = []
-    basiswert = []
-    zielwert = []
-    istwert = []
-    # for each row in results matrix (list), extract elements
-    for row in table_rows_list:
-        for i in row:
-            if "Output " in i:
-                output.append(i)
-            else:
-                pass
-            if "Outputindikator" in i:
-                output_indikator.append(i)
-            else:
-                pass
-    # extract values from output indicators
-    for indikator in output_indikator:
-        if (("Basiswert:" in indikator) and ("Zielwert:" in indikator)):
-            index1 = indikator.index("Basiswert:")
-            index2 = indikator.index("Zielwert:")
-            basiswert.append(indikator[index1 + len("Basiswert:") + 1: index2])
-        elif (("Basiswert:" in indikator) and ("Zielwert:" not in indikator)):
-            basiswert.append(indikator.split("Basiswert:")[1])
-        else:
-            basiswert.append("")
-        if (("Zielwert:" in indikator) and ("Istwert:" in indikator)):
-            index1 = indikator.index("Zielwert:")
-            index2 = indikator.index("Istwert:")
-            zielwert.append(indikator[index1 + len("Zielwert:") + 1: index2])
-        elif (("Zielwert:" in indikator) and ("Istwert:" not in indikator)):
-            zielwert.append(indikator.split("Zielwert:")[1])
-        else:
-            zielwert.append("")
-        if "Istwert:" in indikator:
-            istwert.append(indikator.split("Istwert:")[1])
-        else:
-            istwert.append("")
-    # create dataframes for each tier (programm, modul, output)
-    output = pd.DataFrame.from_dict({"output":output, "indikator":output_indikator, "basiswert": basiswert,
-                                     "zielwert": zielwert, "istwert": istwert,"datei":[file_name]*len(output_indikator)},
-                                      orient = "index")
-    output = output.transpose()
-    return output
-# apply functions to files
-#Define global dataframes to store results from all files
-programme = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
-outcomes = pd.DataFrame(columns = ["ziel", "indikator", "basiswert", "zielwert", "istwert", "datei"])
-outputs = pd.DataFrame(columns = ["output", "indikator", "basiswert", "zielwert", "istwert", "datei"])
-    print("...and extract table and store as list")
-    results_matrix_list = extract_tables_from_pdf(start_page, end_page)
-    print("...extract programm information")
-    programm = extract_programm(results_matrix_list, file)
-    print("...extract modul information")
-    outcome = extract_modul(results_matrix_list, file)
-    print("...extract outputs")
-    output = extract_outputs(results_matrix_list, file)
-    print("...add results from extract functions to global dataframe")
-    programme = pd.concat([programme, programm], ignore_index=True)
-    outcomes = pd.concat([outcomes, outcome], ignore_index=True)
-    outputs = pd.concat([outputs, output], ignore_index=True)
-# write results to csv file
-programme.to_csv("/Users/andreeabodea/programme.csv", sep="|", index=False, decimal=",")
-outcomes.to_csv("/Users/andreeabodea/module_outcomes.csv", sep="|", index=False, decimal=",")
-outputs.to_csv("/Users/andreeabodea/module_outputs.csv", sep="|", index=False, decimal=",")
-print(programme)
-print(outcomes)
-print(outputs)
-        """

     print(results_dict)
     json_string = json.dumps(results_dict, indent=4)
+    print(json_string)