Spaces:

sadickam
/

document-SDG-App-cpu

Runtime error

App Files Files Community

sadickam committed on Oct 16, 2024

Commit

d57c3ca

verified ·

1 Parent(s): 63d91e0

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -56

app.py CHANGED Viewed

@@ -227,11 +227,11 @@ def save_figure_as_jpeg(fig, filename):
     pio.write_image(fig, filename, format='jpeg', width=1000, height=600, scale=5)
 # Generate reports (page and sentence levels)
-def generate_page_report(df_pages):
     doc = Document()
     doc.add_heading("Page-Level SDG Analysis Report", 0)
-    doc.add_heading("General Notes", level=2)
     doc.add_paragraph(
         'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
         'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
@@ -242,26 +242,29 @@ def generate_page_report(df_pages):
         '(Primary and Secondary) for each page with a probability score greater than zero.'
     )
-    doc.add_heading("Primary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph displays the most essential SDG the AI model associates with pages. The bars '
         'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
         'sustainable development theme within the document.'
     )
-    doc.add_heading("Secondary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
         'not the primary focus, the text has some relevance to these goals.'
     )
     for doc_name in df_pages['Document'].unique():
-        doc.add_heading(f"Document: {doc_name}", level=2)
         df_doc = df_pages[df_pages['Document'] == doc_name]
         # Generate and save graphs
-        first_sdg_plot_path = f"{doc_name}_first_sdg_page.jpeg"
-        second_sdg_plot_path = f"{doc_name}_second_sdg_page.jpeg"
         plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
             first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
@@ -272,14 +275,14 @@ def generate_page_report(df_pages):
         doc.add_picture(first_sdg_plot_path, width=Inches(6))
         doc.add_picture(second_sdg_plot_path, width=Inches(6))
-    doc.save("page_report.docx")
-    return "page_report.docx"
-def generate_sentence_report(df_sentences):
     doc = Document()
     doc.add_heading("Sentence-Level SDG Analysis Report", 0)
-    doc.add_heading("General Notes", level=2)
     doc.add_paragraph(
         'This app splits documents into sentences using a natural language processing algorithm. '
         'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
@@ -291,26 +294,29 @@ def generate_sentence_report(df_sentences):
         '(Primary and Secondary) for each sentence with a probability score greater than zero.'
     )
-    doc.add_heading("Primary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph displays the most essential SDG the AI model associates with sentences. The bars '
         'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
         'into the dominant sustainable development theme within the document.'
     )
-    doc.add_heading("Secondary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
         'the primary focus, the text has some relevance to these goals.'
     )
     for doc_name in df_sentences['Document'].unique():
-        doc.add_heading(f"Document: {doc_name}", level=2)
         df_doc = df_sentences[df_sentences['Document'] == doc_name]
         # Generate and save graphs
-        first_sdg_plot_path = f"{doc_name}_first_sdg_sentence.jpeg"
-        second_sdg_plot_path = f"{doc_name}_second_sdg_sentence.jpeg"
         plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
             first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
@@ -321,8 +327,8 @@ def generate_sentence_report(df_sentences):
         doc.add_picture(first_sdg_plot_path, width=Inches(6))
         doc.add_picture(second_sdg_plot_path, width=Inches(6))
-    doc.save("sentence_report.docx")
-    return "sentence_report.docx"
 # New text extraction functions with text cleaning and line joining
 def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
@@ -439,13 +445,13 @@ def launch_interface():
         # Shared PDF file input for both analyses
         with gr.Row():
             file_input = gr.File(
-                label="Upload PDF File for Analysis", file_types=[".pdf"]
             )
         # Extraction mode selection with explanatory text
         gr.Markdown(
             """
-            ### PDF Text Extraction Mode
             Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
             """
         )
@@ -457,8 +463,8 @@ def launch_interface():
             )
         with gr.Row():
-            start_page = gr.Number(value=1, label="Start Page", visible=False)
-            end_page = gr.Number(value=1, label="End Page", visible=False)
         # Function to update visibility of start_page and end_page
         def update_page_inputs(extraction_mode):
@@ -474,7 +480,7 @@ def launch_interface():
         )
         # Tabs for page-level and sentence-level analysis
-        with gr.Tab("Page-Level Analysis"):
             gr.Markdown(
                 """
                 ### 📄 Page-Level SDG Analysis
@@ -485,20 +491,20 @@ def launch_interface():
             )
             with gr.Row():
                 with gr.Column():
-                    primary_page_plot = gr.Plot(label="Primary SDGs [Page-Level]")
                 with gr.Column():
-                    secondary_page_plot = gr.Plot(label="Secondary SDGs [Page-Level]")
             with gr.Row():
-                page_csv = gr.File(label="Download Page Predictions CSV")
-                page_docx = gr.File(label="Download Page Report DOCX")
-                page_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
-                page_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")
-            page_button = gr.Button("Run Page-Level Analysis")
-            reset_page_button = gr.Button("Reset Page-Level Analysis")
-        with gr.Tab("Sentence-Level Analysis"):
             gr.Markdown(
                 """
                 ### ✍️ Sentence-Level SDG Analysis
@@ -509,18 +515,18 @@ def launch_interface():
             )
             with gr.Row():
                 with gr.Column():
-                    primary_sentence_plot = gr.Plot(label="Primary SDGs [Sentence-Level]")
                 with gr.Column():
-                    secondary_sentence_plot = gr.Plot(label="Secondary SDGs [Sentence-Level]")
             with gr.Row():
-                sentence_csv = gr.File(label="Download Sentence Predictions CSV")
-                sentence_docx = gr.File(label="Download Sentence Report DOCX")
-                sentence_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
-                sentence_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")
-            sentence_button = gr.Button("Run Sentence-Level Analysis")
-            reset_sentence_button = gr.Button("Reset Sentence-Level Analysis")
         # Function to process page-level analysis
         @spaces.GPU
@@ -531,11 +537,17 @@ def launch_interface():
             try:
                 if hasattr(file, 'name'):
                     pdf_file_path = file.name
                 else:
                     # Save the file to a temporary location
                     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                         temp_pdf.write(file.read())
                         pdf_file_path = temp_pdf.name
                 # Determine page range based on extraction_mode
                 if extraction_mode == "All Pages":
@@ -556,22 +568,28 @@ def launch_interface():
                 df_page_predictions = predict_pages(page_df)
                 first_plot = plot_sdg(
-                    df_page_predictions, "", 'pred1'
                 )
                 second_plot = plot_sdg(
-                    df_page_predictions, "", 'pred2'
                 )
-                df_page_predictions.to_csv('page_predictions.csv', index=False)
-                page_report = generate_page_report(df_page_predictions)
                 # Save figures as JPEG
-                save_figure_as_jpeg(first_plot, "primary_page.jpeg")
-                save_figure_as_jpeg(second_plot, "secondary_page.jpeg")
                 return (
-                    first_plot, second_plot, 'page_predictions.csv', page_report,
-                    'primary_page.jpeg', 'secondary_page.jpeg')
             except Exception as e:
                 print(f"Error: {e}")
@@ -586,11 +604,17 @@ def launch_interface():
             try:
                 if hasattr(file, 'name'):
                     pdf_file_path = file.name
                 else:
                     # Save the file to a temporary location
                     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                         temp_pdf.write(file.read())
                         pdf_file_path = temp_pdf.name
                 # Determine page range based on extraction_mode
                 if extraction_mode == "All Pages":
@@ -611,22 +635,28 @@ def launch_interface():
                 df_sentence_predictions = predict_sentences(sentence_df)
                 first_plot = plot_sdg(
-                    df_sentence_predictions, "", 'pred1'
                 )
                 second_plot = plot_sdg(
-                    df_sentence_predictions, "", 'pred2'
                 )
-                df_sentence_predictions.to_csv('sentence_predictions.csv', index=False)
-                sentence_report = generate_sentence_report(df_sentence_predictions)
                 # Save figures as JPEG
-                save_figure_as_jpeg(first_plot, "primary_sentence.jpeg")
-                save_figure_as_jpeg(second_plot, "secondary_sentence.jpeg")
                 return (
-                    first_plot, second_plot, 'sentence_predictions.csv', sentence_report,
-                    'primary_sentence.jpeg', 'secondary_sentence.jpeg')
             except Exception as e:
                 print(f"Error: {e}")

     pio.write_image(fig, filename, format='jpeg', width=1000, height=600, scale=5)
 # Generate reports (page and sentence levels)
+def generate_page_report(df_pages, report_file_name):
     doc = Document()
     doc.add_heading("Page-Level SDG Analysis Report", 0)
+    doc.add_heading("📋 General Notes", level=2)
     doc.add_paragraph(
         'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
         'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
         '(Primary and Secondary) for each page with a probability score greater than zero.'
     )
+    doc.add_heading("📊 Primary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph displays the most essential SDG the AI model associates with pages. The bars '
         'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
         'sustainable development theme within the document.'
     )
+    doc.add_heading("📈 Secondary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
         'not the primary focus, the text has some relevance to these goals.'
     )
     for doc_name in df_pages['Document'].unique():
+        # Sanitize doc_name to use in file names
+        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
+        doc.add_heading(f"📄 Document: {doc_name}", level=2)
         df_doc = df_pages[df_pages['Document'] == doc_name]
         # Generate and save graphs
+        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
+        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"
         plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
             first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
         doc.add_picture(first_sdg_plot_path, width=Inches(6))
         doc.add_picture(second_sdg_plot_path, width=Inches(6))
+    doc.save(report_file_name)
+    return report_file_name
+def generate_sentence_report(df_sentences, report_file_name):
     doc = Document()
     doc.add_heading("Sentence-Level SDG Analysis Report", 0)
+    doc.add_heading("📋 General Notes", level=2)
     doc.add_paragraph(
         'This app splits documents into sentences using a natural language processing algorithm. '
         'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
         '(Primary and Secondary) for each sentence with a probability score greater than zero.'
     )
+    doc.add_heading("📊 Primary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph displays the most essential SDG the AI model associates with sentences. The bars '
         'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
         'into the dominant sustainable development theme within the document.'
     )
+    doc.add_heading("📈 Secondary SDGs Bar Graph", level=3)
     doc.add_paragraph(
         'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
         'the primary focus, the text has some relevance to these goals.'
     )
     for doc_name in df_sentences['Document'].unique():
+        # Sanitize doc_name to use in file names
+        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
+        doc.add_heading(f"📄 Document: {doc_name}", level=2)
         df_doc = df_sentences[df_sentences['Document'] == doc_name]
         # Generate and save graphs
+        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
+        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"
         plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
             first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
         doc.add_picture(first_sdg_plot_path, width=Inches(6))
         doc.add_picture(second_sdg_plot_path, width=Inches(6))
+    doc.save(report_file_name)
+    return report_file_name
 # New text extraction functions with text cleaning and line joining
 def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
         # Shared PDF file input for both analyses
         with gr.Row():
             file_input = gr.File(
+                label="📁 Upload PDF File for Analysis", file_types=[".pdf"]
             )
         # Extraction mode selection with explanatory text
         gr.Markdown(
             """
+            ### 📋 PDF Text Extraction Mode
             Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
             """
         )
             )
         with gr.Row():
+            start_page = gr.Number(value=1, label="🔢 Start Page", visible=False)
+            end_page = gr.Number(value=1, label="🔢 End Page", visible=False)
         # Function to update visibility of start_page and end_page
         def update_page_inputs(extraction_mode):
         )
         # Tabs for page-level and sentence-level analysis
+        with gr.Tab("📄 Page-Level Analysis"):
             gr.Markdown(
                 """
                 ### 📄 Page-Level SDG Analysis
             )
             with gr.Row():
                 with gr.Column():
+                    primary_page_plot = gr.Plot(label="📊 Primary SDGs [Page-Level]")
                 with gr.Column():
+                    secondary_page_plot = gr.Plot(label="📈 Secondary SDGs [Page-Level]")
             with gr.Row():
+                page_csv = gr.File(label="📊 Download Page Predictions CSV")
+                page_docx = gr.File(label="📄 Download Page Report DOCX")
+                page_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")
+                page_jpeg2 = gr.File(label="🖼️ Download Secondary SDGs JPEG")
+            page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
+            reset_page_button = gr.Button("🔄 Reset Page-Level Analysis")
+        with gr.Tab("✍️ Sentence-Level Analysis"):
             gr.Markdown(
                 """
                 ### ✍️ Sentence-Level SDG Analysis
             )
             with gr.Row():
                 with gr.Column():
+                    primary_sentence_plot = gr.Plot(label="📊 Primary SDGs [Sentence-Level]")
                 with gr.Column():
+                    secondary_sentence_plot = gr.Plot(label="📈 Secondary SDGs [Sentence-Level]")
             with gr.Row():
+                sentence_csv = gr.File(label="📊 Download Sentence Predictions CSV")
+                sentence_docx = gr.File(label="📄 Download Sentence Report DOCX")
+                sentence_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")
+                sentence_jpeg2 = gr.File(label="🖼️ Download Secondary SDGs JPEG")
+            sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
+            reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis")
         # Function to process page-level analysis
         @spaces.GPU
             try:
                 if hasattr(file, 'name'):
                     pdf_file_path = file.name
+                    original_file_name = os.path.basename(file.name)
                 else:
                     # Save the file to a temporary location
                     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                         temp_pdf.write(file.read())
                         pdf_file_path = temp_pdf.name
+                    original_file_name = 'uploaded_document'
+                # Sanitize the file name to use in output file names
+                sanitized_file_name = os.path.splitext(original_file_name)[0]
+                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
                 # Determine page range based on extraction_mode
                 if extraction_mode == "All Pages":
                 df_page_predictions = predict_pages(page_df)
                 first_plot = plot_sdg(
+                    df_page_predictions, "📊 Primary SDGs", 'pred1'
                 )
                 second_plot = plot_sdg(
+                    df_page_predictions, "📈 Secondary SDGs", 'pred2'
                 )
+                # Define output file names
+                page_csv_file = f"{sanitized_file_name}_page_predictions.csv"
+                page_report_file = f"{sanitized_file_name}_page_report.docx"
+                primary_page_jpeg = f"{sanitized_file_name}_primary_page.jpeg"
+                secondary_page_jpeg = f"{sanitized_file_name}_secondary_page.jpeg"
+                df_page_predictions.to_csv(page_csv_file, index=False)
+                page_report = generate_page_report(df_page_predictions, page_report_file)
                 # Save figures as JPEG
+                save_figure_as_jpeg(first_plot, primary_page_jpeg)
+                save_figure_as_jpeg(second_plot, secondary_page_jpeg)
                 return (
+                    first_plot, second_plot, page_csv_file, page_report_file,
+                    primary_page_jpeg, secondary_page_jpeg)
             except Exception as e:
                 print(f"Error: {e}")
             try:
                 if hasattr(file, 'name'):
                     pdf_file_path = file.name
+                    original_file_name = os.path.basename(file.name)
                 else:
                     # Save the file to a temporary location
                     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                         temp_pdf.write(file.read())
                         pdf_file_path = temp_pdf.name
+                    original_file_name = 'uploaded_document'
+                # Sanitize the file name to use in output file names
+                sanitized_file_name = os.path.splitext(original_file_name)[0]
+                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
                 # Determine page range based on extraction_mode
                 if extraction_mode == "All Pages":
                 df_sentence_predictions = predict_sentences(sentence_df)
                 first_plot = plot_sdg(
+                    df_sentence_predictions, "📊 Primary SDGs", 'pred1'
                 )
                 second_plot = plot_sdg(
+                    df_sentence_predictions, "📈 Secondary SDGs", 'pred2'
                 )
+                # Define output file names
+                sentence_csv_file = f"{sanitized_file_name}_sentence_predictions.csv"
+                sentence_report_file = f"{sanitized_file_name}_sentence_report.docx"
+                primary_sentence_jpeg = f"{sanitized_file_name}_primary_sentence.jpeg"
+                secondary_sentence_jpeg = f"{sanitized_file_name}_secondary_sentence.jpeg"
+                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
+                sentence_report = generate_sentence_report(df_sentence_predictions, sentence_report_file)
                 # Save figures as JPEG
+                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
+                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)
                 return (
+                    first_plot, second_plot, sentence_csv_file, sentence_report_file,
+                    primary_sentence_jpeg, secondary_sentence_jpeg)
             except Exception as e:
                 print(f"Error: {e}")