Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -227,11 +227,11 @@ def save_figure_as_jpeg(fig, filename):
|
|
| 227 |
pio.write_image(fig, filename, format='jpeg', width=1000, height=600, scale=5)
|
| 228 |
|
| 229 |
# Generate reports (page and sentence levels)
|
| 230 |
-
def generate_page_report(df_pages):
|
| 231 |
doc = Document()
|
| 232 |
doc.add_heading("Page-Level SDG Analysis Report", 0)
|
| 233 |
|
| 234 |
-
doc.add_heading("General Notes", level=2)
|
| 235 |
doc.add_paragraph(
|
| 236 |
'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
|
| 237 |
'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
|
|
@@ -242,26 +242,29 @@ def generate_page_report(df_pages):
|
|
| 242 |
'(Primary and Secondary) for each page with a probability score greater than zero.'
|
| 243 |
)
|
| 244 |
|
| 245 |
-
doc.add_heading("Primary SDGs Bar Graph", level=3)
|
| 246 |
doc.add_paragraph(
|
| 247 |
'This graph displays the most essential SDG the AI model associates with pages. The bars '
|
| 248 |
'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
|
| 249 |
'sustainable development theme within the document.'
|
| 250 |
)
|
| 251 |
|
| 252 |
-
doc.add_heading("Secondary SDGs Bar Graph", level=3)
|
| 253 |
doc.add_paragraph(
|
| 254 |
'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
|
| 255 |
'not the primary focus, the text has some relevance to these goals.'
|
| 256 |
)
|
| 257 |
|
| 258 |
for doc_name in df_pages['Document'].unique():
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
| 260 |
df_doc = df_pages[df_pages['Document'] == doc_name]
|
| 261 |
|
| 262 |
# Generate and save graphs
|
| 263 |
-
first_sdg_plot_path = f"{
|
| 264 |
-
second_sdg_plot_path = f"{
|
| 265 |
|
| 266 |
plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
|
| 267 |
first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
|
|
@@ -272,14 +275,14 @@ def generate_page_report(df_pages):
|
|
| 272 |
doc.add_picture(first_sdg_plot_path, width=Inches(6))
|
| 273 |
doc.add_picture(second_sdg_plot_path, width=Inches(6))
|
| 274 |
|
| 275 |
-
doc.save(
|
| 276 |
-
return
|
| 277 |
|
| 278 |
-
def generate_sentence_report(df_sentences):
|
| 279 |
doc = Document()
|
| 280 |
doc.add_heading("Sentence-Level SDG Analysis Report", 0)
|
| 281 |
|
| 282 |
-
doc.add_heading("General Notes", level=2)
|
| 283 |
doc.add_paragraph(
|
| 284 |
'This app splits documents into sentences using a natural language processing algorithm. '
|
| 285 |
'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
|
|
@@ -291,26 +294,29 @@ def generate_sentence_report(df_sentences):
|
|
| 291 |
'(Primary and Secondary) for each sentence with a probability score greater than zero.'
|
| 292 |
)
|
| 293 |
|
| 294 |
-
doc.add_heading("Primary SDGs Bar Graph", level=3)
|
| 295 |
doc.add_paragraph(
|
| 296 |
'This graph displays the most essential SDG the AI model associates with sentences. The bars '
|
| 297 |
'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
|
| 298 |
'into the dominant sustainable development theme within the document.'
|
| 299 |
)
|
| 300 |
|
| 301 |
-
doc.add_heading("Secondary SDGs Bar Graph", level=3)
|
| 302 |
doc.add_paragraph(
|
| 303 |
'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
|
| 304 |
'the primary focus, the text has some relevance to these goals.'
|
| 305 |
)
|
| 306 |
|
| 307 |
for doc_name in df_sentences['Document'].unique():
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
| 309 |
df_doc = df_sentences[df_sentences['Document'] == doc_name]
|
| 310 |
|
| 311 |
# Generate and save graphs
|
| 312 |
-
first_sdg_plot_path = f"{
|
| 313 |
-
second_sdg_plot_path = f"{
|
| 314 |
|
| 315 |
plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
|
| 316 |
first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
|
|
@@ -321,8 +327,8 @@ def generate_sentence_report(df_sentences):
|
|
| 321 |
doc.add_picture(first_sdg_plot_path, width=Inches(6))
|
| 322 |
doc.add_picture(second_sdg_plot_path, width=Inches(6))
|
| 323 |
|
| 324 |
-
doc.save(
|
| 325 |
-
return
|
| 326 |
|
| 327 |
# New text extraction functions with text cleaning and line joining
|
| 328 |
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
|
|
@@ -439,13 +445,13 @@ def launch_interface():
|
|
| 439 |
# Shared PDF file input for both analyses
|
| 440 |
with gr.Row():
|
| 441 |
file_input = gr.File(
|
| 442 |
-
label="Upload PDF File for Analysis", file_types=[".pdf"]
|
| 443 |
)
|
| 444 |
|
| 445 |
# Extraction mode selection with explanatory text
|
| 446 |
gr.Markdown(
|
| 447 |
"""
|
| 448 |
-
###
|
| 449 |
Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
|
| 450 |
"""
|
| 451 |
)
|
|
@@ -457,8 +463,8 @@ def launch_interface():
|
|
| 457 |
)
|
| 458 |
|
| 459 |
with gr.Row():
|
| 460 |
-
start_page = gr.Number(value=1, label="Start Page", visible=False)
|
| 461 |
-
end_page = gr.Number(value=1, label="End Page", visible=False)
|
| 462 |
|
| 463 |
# Function to update visibility of start_page and end_page
|
| 464 |
def update_page_inputs(extraction_mode):
|
|
@@ -474,7 +480,7 @@ def launch_interface():
|
|
| 474 |
)
|
| 475 |
|
| 476 |
# Tabs for page-level and sentence-level analysis
|
| 477 |
-
with gr.Tab("Page-Level Analysis"):
|
| 478 |
gr.Markdown(
|
| 479 |
"""
|
| 480 |
### π Page-Level SDG Analysis
|
|
@@ -485,20 +491,20 @@ def launch_interface():
|
|
| 485 |
)
|
| 486 |
with gr.Row():
|
| 487 |
with gr.Column():
|
| 488 |
-
primary_page_plot = gr.Plot(label="Primary SDGs [Page-Level]")
|
| 489 |
with gr.Column():
|
| 490 |
-
secondary_page_plot = gr.Plot(label="Secondary SDGs [Page-Level]")
|
| 491 |
|
| 492 |
with gr.Row():
|
| 493 |
-
page_csv = gr.File(label="Download Page Predictions CSV")
|
| 494 |
-
page_docx = gr.File(label="Download Page Report DOCX")
|
| 495 |
-
page_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
|
| 496 |
-
page_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")
|
| 497 |
|
| 498 |
-
page_button = gr.Button("Run Page-Level Analysis")
|
| 499 |
-
reset_page_button = gr.Button("Reset Page-Level Analysis")
|
| 500 |
|
| 501 |
-
with gr.Tab("Sentence-Level Analysis"):
|
| 502 |
gr.Markdown(
|
| 503 |
"""
|
| 504 |
### βοΈ Sentence-Level SDG Analysis
|
|
@@ -509,18 +515,18 @@ def launch_interface():
|
|
| 509 |
)
|
| 510 |
with gr.Row():
|
| 511 |
with gr.Column():
|
| 512 |
-
primary_sentence_plot = gr.Plot(label="Primary SDGs [Sentence-Level]")
|
| 513 |
with gr.Column():
|
| 514 |
-
secondary_sentence_plot = gr.Plot(label="Secondary SDGs [Sentence-Level]")
|
| 515 |
|
| 516 |
with gr.Row():
|
| 517 |
-
sentence_csv = gr.File(label="Download Sentence Predictions CSV")
|
| 518 |
-
sentence_docx = gr.File(label="Download Sentence Report DOCX")
|
| 519 |
-
sentence_jpeg1 = gr.File(label="Download Primary SDGs JPEG")
|
| 520 |
-
sentence_jpeg2 = gr.File(label="Download Secondary SDGs JPEG")
|
| 521 |
|
| 522 |
-
sentence_button = gr.Button("Run Sentence-Level Analysis")
|
| 523 |
-
reset_sentence_button = gr.Button("Reset Sentence-Level Analysis")
|
| 524 |
|
| 525 |
# Function to process page-level analysis
|
| 526 |
@spaces.GPU
|
|
@@ -531,11 +537,17 @@ def launch_interface():
|
|
| 531 |
try:
|
| 532 |
if hasattr(file, 'name'):
|
| 533 |
pdf_file_path = file.name
|
|
|
|
| 534 |
else:
|
| 535 |
# Save the file to a temporary location
|
| 536 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
|
| 537 |
temp_pdf.write(file.read())
|
| 538 |
pdf_file_path = temp_pdf.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
|
| 540 |
# Determine page range based on extraction_mode
|
| 541 |
if extraction_mode == "All Pages":
|
|
@@ -556,22 +568,28 @@ def launch_interface():
|
|
| 556 |
df_page_predictions = predict_pages(page_df)
|
| 557 |
|
| 558 |
first_plot = plot_sdg(
|
| 559 |
-
df_page_predictions, "", 'pred1'
|
| 560 |
)
|
| 561 |
second_plot = plot_sdg(
|
| 562 |
-
df_page_predictions, "", 'pred2'
|
| 563 |
)
|
| 564 |
|
| 565 |
-
|
| 566 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
|
| 568 |
# Save figures as JPEG
|
| 569 |
-
save_figure_as_jpeg(first_plot,
|
| 570 |
-
save_figure_as_jpeg(second_plot,
|
| 571 |
|
| 572 |
return (
|
| 573 |
-
first_plot, second_plot,
|
| 574 |
-
|
| 575 |
|
| 576 |
except Exception as e:
|
| 577 |
print(f"Error: {e}")
|
|
@@ -586,11 +604,17 @@ def launch_interface():
|
|
| 586 |
try:
|
| 587 |
if hasattr(file, 'name'):
|
| 588 |
pdf_file_path = file.name
|
|
|
|
| 589 |
else:
|
| 590 |
# Save the file to a temporary location
|
| 591 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
|
| 592 |
temp_pdf.write(file.read())
|
| 593 |
pdf_file_path = temp_pdf.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
|
| 595 |
# Determine page range based on extraction_mode
|
| 596 |
if extraction_mode == "All Pages":
|
|
@@ -611,22 +635,28 @@ def launch_interface():
|
|
| 611 |
df_sentence_predictions = predict_sentences(sentence_df)
|
| 612 |
|
| 613 |
first_plot = plot_sdg(
|
| 614 |
-
df_sentence_predictions, "", 'pred1'
|
| 615 |
)
|
| 616 |
second_plot = plot_sdg(
|
| 617 |
-
df_sentence_predictions, "", 'pred2'
|
| 618 |
)
|
| 619 |
|
| 620 |
-
|
| 621 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
|
| 623 |
# Save figures as JPEG
|
| 624 |
-
save_figure_as_jpeg(first_plot,
|
| 625 |
-
save_figure_as_jpeg(second_plot,
|
| 626 |
|
| 627 |
return (
|
| 628 |
-
first_plot, second_plot,
|
| 629 |
-
|
| 630 |
|
| 631 |
except Exception as e:
|
| 632 |
print(f"Error: {e}")
|
|
|
|
| 227 |
pio.write_image(fig, filename, format='jpeg', width=1000, height=600, scale=5)
|
| 228 |
|
| 229 |
# Generate reports (page and sentence levels)
|
| 230 |
+
def generate_page_report(df_pages, report_file_name):
|
| 231 |
doc = Document()
|
| 232 |
doc.add_heading("Page-Level SDG Analysis Report", 0)
|
| 233 |
|
| 234 |
+
doc.add_heading("π General Notes", level=2)
|
| 235 |
doc.add_paragraph(
|
| 236 |
'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
|
| 237 |
'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
|
|
|
|
| 242 |
'(Primary and Secondary) for each page with a probability score greater than zero.'
|
| 243 |
)
|
| 244 |
|
| 245 |
+
doc.add_heading("π Primary SDGs Bar Graph", level=3)
|
| 246 |
doc.add_paragraph(
|
| 247 |
'This graph displays the most essential SDG the AI model associates with pages. The bars '
|
| 248 |
'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
|
| 249 |
'sustainable development theme within the document.'
|
| 250 |
)
|
| 251 |
|
| 252 |
+
doc.add_heading("π Secondary SDGs Bar Graph", level=3)
|
| 253 |
doc.add_paragraph(
|
| 254 |
'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
|
| 255 |
'not the primary focus, the text has some relevance to these goals.'
|
| 256 |
)
|
| 257 |
|
| 258 |
for doc_name in df_pages['Document'].unique():
|
| 259 |
+
# Sanitize doc_name to use in file names
|
| 260 |
+
sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
|
| 261 |
+
|
| 262 |
+
doc.add_heading(f"π Document: {doc_name}", level=2)
|
| 263 |
df_doc = df_pages[df_pages['Document'] == doc_name]
|
| 264 |
|
| 265 |
# Generate and save graphs
|
| 266 |
+
first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
|
| 267 |
+
second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"
|
| 268 |
|
| 269 |
plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
|
| 270 |
first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
|
|
|
|
| 275 |
doc.add_picture(first_sdg_plot_path, width=Inches(6))
|
| 276 |
doc.add_picture(second_sdg_plot_path, width=Inches(6))
|
| 277 |
|
| 278 |
+
doc.save(report_file_name)
|
| 279 |
+
return report_file_name
|
| 280 |
|
| 281 |
+
def generate_sentence_report(df_sentences, report_file_name):
|
| 282 |
doc = Document()
|
| 283 |
doc.add_heading("Sentence-Level SDG Analysis Report", 0)
|
| 284 |
|
| 285 |
+
doc.add_heading("π General Notes", level=2)
|
| 286 |
doc.add_paragraph(
|
| 287 |
'This app splits documents into sentences using a natural language processing algorithm. '
|
| 288 |
'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
|
|
|
|
| 294 |
'(Primary and Secondary) for each sentence with a probability score greater than zero.'
|
| 295 |
)
|
| 296 |
|
| 297 |
+
doc.add_heading("π Primary SDGs Bar Graph", level=3)
|
| 298 |
doc.add_paragraph(
|
| 299 |
'This graph displays the most essential SDG the AI model associates with sentences. The bars '
|
| 300 |
'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
|
| 301 |
'into the dominant sustainable development theme within the document.'
|
| 302 |
)
|
| 303 |
|
| 304 |
+
doc.add_heading("π Secondary SDGs Bar Graph", level=3)
|
| 305 |
doc.add_paragraph(
|
| 306 |
'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
|
| 307 |
'the primary focus, the text has some relevance to these goals.'
|
| 308 |
)
|
| 309 |
|
| 310 |
for doc_name in df_sentences['Document'].unique():
|
| 311 |
+
# Sanitize doc_name to use in file names
|
| 312 |
+
sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])
|
| 313 |
+
|
| 314 |
+
doc.add_heading(f"π Document: {doc_name}", level=2)
|
| 315 |
df_doc = df_sentences[df_sentences['Document'] == doc_name]
|
| 316 |
|
| 317 |
# Generate and save graphs
|
| 318 |
+
first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
|
| 319 |
+
second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"
|
| 320 |
|
| 321 |
plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
|
| 322 |
first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
|
|
|
|
| 327 |
doc.add_picture(first_sdg_plot_path, width=Inches(6))
|
| 328 |
doc.add_picture(second_sdg_plot_path, width=Inches(6))
|
| 329 |
|
| 330 |
+
doc.save(report_file_name)
|
| 331 |
+
return report_file_name
|
| 332 |
|
| 333 |
# New text extraction functions with text cleaning and line joining
|
| 334 |
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
|
|
|
|
| 445 |
# Shared PDF file input for both analyses
|
| 446 |
with gr.Row():
|
| 447 |
file_input = gr.File(
|
| 448 |
+
label="π Upload PDF File for Analysis", file_types=[".pdf"]
|
| 449 |
)
|
| 450 |
|
| 451 |
# Extraction mode selection with explanatory text
|
| 452 |
gr.Markdown(
|
| 453 |
"""
|
| 454 |
+
### π PDFText Extraction Mode
|
| 455 |
Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
|
| 456 |
"""
|
| 457 |
)
|
|
|
|
| 463 |
)
|
| 464 |
|
| 465 |
with gr.Row():
|
| 466 |
+
start_page = gr.Number(value=1, label="π’ Start Page", visible=False)
|
| 467 |
+
end_page = gr.Number(value=1, label="π’ End Page", visible=False)
|
| 468 |
|
| 469 |
# Function to update visibility of start_page and end_page
|
| 470 |
def update_page_inputs(extraction_mode):
|
|
|
|
| 480 |
)
|
| 481 |
|
| 482 |
# Tabs for page-level and sentence-level analysis
|
| 483 |
+
with gr.Tab("π Page-Level Analysis"):
|
| 484 |
gr.Markdown(
|
| 485 |
"""
|
| 486 |
### π Page-Level SDG Analysis
|
|
|
|
| 491 |
)
|
| 492 |
with gr.Row():
|
| 493 |
with gr.Column():
|
| 494 |
+
primary_page_plot = gr.Plot(label="π Primary SDGs [Page-Level]")
|
| 495 |
with gr.Column():
|
| 496 |
+
secondary_page_plot = gr.Plot(label="π Secondary SDGs [Page-Level]")
|
| 497 |
|
| 498 |
with gr.Row():
|
| 499 |
+
page_csv = gr.File(label="π Download Page Predictions CSV")
|
| 500 |
+
page_docx = gr.File(label="π Download Page Report DOCX")
|
| 501 |
+
page_jpeg1 = gr.File(label="πΌοΈ Download Primary SDGs JPEG")
|
| 502 |
+
page_jpeg2 = gr.File(label="πΌοΈ Download Secondary SDGs JPEG")
|
| 503 |
|
| 504 |
+
page_button = gr.Button("πββοΈ Run Page-Level Analysis")
|
| 505 |
+
reset_page_button = gr.Button("π Reset Page-Level Analysis")
|
| 506 |
|
| 507 |
+
with gr.Tab("βοΈ Sentence-Level Analysis"):
|
| 508 |
gr.Markdown(
|
| 509 |
"""
|
| 510 |
### βοΈ Sentence-Level SDG Analysis
|
|
|
|
| 515 |
)
|
| 516 |
with gr.Row():
|
| 517 |
with gr.Column():
|
| 518 |
+
primary_sentence_plot = gr.Plot(label="π Primary SDGs [Sentence-Level]")
|
| 519 |
with gr.Column():
|
| 520 |
+
secondary_sentence_plot = gr.Plot(label="π Secondary SDGs [Sentence-Level]")
|
| 521 |
|
| 522 |
with gr.Row():
|
| 523 |
+
sentence_csv = gr.File(label="π Download Sentence Predictions CSV")
|
| 524 |
+
sentence_docx = gr.File(label="π Download Sentence Report DOCX")
|
| 525 |
+
sentence_jpeg1 = gr.File(label="πΌοΈ Download Primary SDGs JPEG")
|
| 526 |
+
sentence_jpeg2 = gr.File(label="πΌοΈ Download Secondary SDGs JPEG")
|
| 527 |
|
| 528 |
+
sentence_button = gr.Button("πββοΈ Run Sentence-Level Analysis")
|
| 529 |
+
reset_sentence_button = gr.Button("π Reset Sentence-Level Analysis")
|
| 530 |
|
| 531 |
# Function to process page-level analysis
|
| 532 |
@spaces.GPU
|
|
|
|
| 537 |
try:
|
| 538 |
if hasattr(file, 'name'):
|
| 539 |
pdf_file_path = file.name
|
| 540 |
+
original_file_name = os.path.basename(file.name)
|
| 541 |
else:
|
| 542 |
# Save the file to a temporary location
|
| 543 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
|
| 544 |
temp_pdf.write(file.read())
|
| 545 |
pdf_file_path = temp_pdf.name
|
| 546 |
+
original_file_name = 'uploaded_document'
|
| 547 |
+
|
| 548 |
+
# Sanitize the file name to use in output file names
|
| 549 |
+
sanitized_file_name = os.path.splitext(original_file_name)[0]
|
| 550 |
+
sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
|
| 551 |
|
| 552 |
# Determine page range based on extraction_mode
|
| 553 |
if extraction_mode == "All Pages":
|
|
|
|
| 568 |
df_page_predictions = predict_pages(page_df)
|
| 569 |
|
| 570 |
first_plot = plot_sdg(
|
| 571 |
+
df_page_predictions, "π Primary SDGs", 'pred1'
|
| 572 |
)
|
| 573 |
second_plot = plot_sdg(
|
| 574 |
+
df_page_predictions, "π Secondary SDGs", 'pred2'
|
| 575 |
)
|
| 576 |
|
| 577 |
+
# Define output file names
|
| 578 |
+
page_csv_file = f"{sanitized_file_name}_page_predictions.csv"
|
| 579 |
+
page_report_file = f"{sanitized_file_name}_page_report.docx"
|
| 580 |
+
primary_page_jpeg = f"{sanitized_file_name}_primary_page.jpeg"
|
| 581 |
+
secondary_page_jpeg = f"{sanitized_file_name}_secondary_page.jpeg"
|
| 582 |
+
|
| 583 |
+
df_page_predictions.to_csv(page_csv_file, index=False)
|
| 584 |
+
page_report = generate_page_report(df_page_predictions, page_report_file)
|
| 585 |
|
| 586 |
# Save figures as JPEG
|
| 587 |
+
save_figure_as_jpeg(first_plot, primary_page_jpeg)
|
| 588 |
+
save_figure_as_jpeg(second_plot, secondary_page_jpeg)
|
| 589 |
|
| 590 |
return (
|
| 591 |
+
first_plot, second_plot, page_csv_file, page_report_file,
|
| 592 |
+
primary_page_jpeg, secondary_page_jpeg)
|
| 593 |
|
| 594 |
except Exception as e:
|
| 595 |
print(f"Error: {e}")
|
|
|
|
| 604 |
try:
|
| 605 |
if hasattr(file, 'name'):
|
| 606 |
pdf_file_path = file.name
|
| 607 |
+
original_file_name = os.path.basename(file.name)
|
| 608 |
else:
|
| 609 |
# Save the file to a temporary location
|
| 610 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
|
| 611 |
temp_pdf.write(file.read())
|
| 612 |
pdf_file_path = temp_pdf.name
|
| 613 |
+
original_file_name = 'uploaded_document'
|
| 614 |
+
|
| 615 |
+
# Sanitize the file name to use in output file names
|
| 616 |
+
sanitized_file_name = os.path.splitext(original_file_name)[0]
|
| 617 |
+
sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)
|
| 618 |
|
| 619 |
# Determine page range based on extraction_mode
|
| 620 |
if extraction_mode == "All Pages":
|
|
|
|
| 635 |
df_sentence_predictions = predict_sentences(sentence_df)
|
| 636 |
|
| 637 |
first_plot = plot_sdg(
|
| 638 |
+
df_sentence_predictions, "π Primary SDGs", 'pred1'
|
| 639 |
)
|
| 640 |
second_plot = plot_sdg(
|
| 641 |
+
df_sentence_predictions, "π Secondary SDGs", 'pred2'
|
| 642 |
)
|
| 643 |
|
| 644 |
+
# Define output file names
|
| 645 |
+
sentence_csv_file = f"{sanitized_file_name}_sentence_predictions.csv"
|
| 646 |
+
sentence_report_file = f"{sanitized_file_name}_sentence_report.docx"
|
| 647 |
+
primary_sentence_jpeg = f"{sanitized_file_name}_primary_sentence.jpeg"
|
| 648 |
+
secondary_sentence_jpeg = f"{sanitized_file_name}_secondary_sentence.jpeg"
|
| 649 |
+
|
| 650 |
+
df_sentence_predictions.to_csv(sentence_csv_file, index=False)
|
| 651 |
+
sentence_report = generate_sentence_report(df_sentence_predictions, sentence_report_file)
|
| 652 |
|
| 653 |
# Save figures as JPEG
|
| 654 |
+
save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
|
| 655 |
+
save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)
|
| 656 |
|
| 657 |
return (
|
| 658 |
+
first_plot, second_plot, sentence_csv_file, sentence_report_file,
|
| 659 |
+
primary_sentence_jpeg, secondary_sentence_jpeg)
|
| 660 |
|
| 661 |
except Exception as e:
|
| 662 |
print(f"Error: {e}")
|