sadickam commited on
Commit
e230b99
·
verified ·
1 Parent(s): 6f5e36a

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -807
app.py DELETED
@@ -1,807 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import re
4
- import torch
5
- import pandas as pd
6
- import plotly.express as px
7
- import plotly.io as pio
8
- import nltk
9
- import tempfile
10
- from io import BytesIO
11
- import base64
12
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
- from nltk.tokenize import sent_tokenize
14
- from docx.shared import Inches
15
- from docx import Document
16
- import numpy as np
17
-
18
- from styles import custom_css # Importing custom CSS
19
-
20
# Fetch the Punkt tokenizer models required by nltk.tokenize.sent_tokenize.
nltk.download('punkt')

# LangChain's PyPDFLoader handles page-wise PDF text extraction.
from langchain_community.document_loaders import PyPDFLoader

# Hugging Face checkpoint of the SDG classification model.
checkpoint = "sadickam/sdgBERT"
27
-
28
# Text cleaning function
def clean_text(text):
    """
    Clean extracted text while retaining currency symbols.

    Removes every character outside an allow-list (letters, digits,
    whitespace, basic punctuation and common currency symbols), then
    collapses runs of whitespace into single spaces.
    """
    # Anything NOT in this set is deleted.
    disallowed = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    filtered = re.sub(disallowed, '', text.strip())
    # Normalise all whitespace runs to a single space.
    return re.sub(r'\s+', ' ', filtered)
39
-
40
# Preprocessing function for text
def prep_text(text):
    """
    Normalise text for the classifier: lowercase every token, collapse
    whitespace sentence-by-sentence, and strip backticks / double quotes.
    """
    normalised_sentences = []
    for sentence in sent_tokenize(str(text)):
        tokens = (str(tok).strip().lower() for tok in sentence.split())
        normalised_sentences.append(' '.join(tokens))
    joined = ' '.join(normalised_sentences).strip()
    return re.sub(r'`|"', "", joined)
49
-
50
# Load the tokenizer and model (model is moved to the global `device`)
def load_model_and_tokenizer():
    """Load sdgBERT and its tokenizer from the `checkpoint` constant."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    return model, tokenizer
55
-
56
# Computation device: prefer CUDA when a GPU is available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Human-readable labels for the 16 SDG classes predicted by sdgBERT,
# in model output-index order.
label_list = [
    'SDG1_No Poverty',
    'SDG2_Zero Hunger',
    'SDG3_Good Health and Well-being',
    'SDG4_Quality Education',
    'SDG5_Gender Equality',
    'SDG6_Clean Water and Sanitation',
    'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth',
    'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality',
    'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production',
    'SDG13_Climate Action',
    'SDG14_Life Below Water',
    'SDG15_Life on Land',
    'SDG16_Peace, Justice and Strong Institutions',
]
68
-
69
# Classify a batch of strings; returns one softmax distribution per input
def predict_sdg_labels_batch(texts, model, tokenizer):
    """Tokenize `texts`, run the model once, and return per-class probabilities."""
    encoded = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
    return torch.softmax(logits, dim=1).tolist()
77
-
78
# Page-level predictions with batch processing
def predict_pages(page_df, batch_size=32):
    """
    Attach 16 (pred, score) column pairs to a page-level DataFrame.

    Each row's 'Text' is cleaned, classified in batches of `batch_size`,
    and the 16 SDG labels are stored per row in descending-probability
    order as pred1/score1 ... pred16/score16 (interleaved at the end of
    the frame, after the original columns).
    """
    model, tokenizer = load_model_and_tokenizer()
    out_df = page_df.copy()
    total_rows = len(page_df)
    labels_by_rank = [[] for _ in range(16)]
    scores_by_rank = [[] for _ in range(16)]

    for offset in range(0, total_rows, batch_size):
        batch = page_df.iloc[offset:min(offset + batch_size, total_rows)]
        cleaned = batch['Text'].apply(clean_text).apply(prep_text).tolist()
        for probs in predict_sdg_labels_batch(cleaned, model, tokenizer):
            # Rank all 16 SDGs for this page, most probable first.
            ranked = sorted(zip(label_list, probs), key=lambda pair: pair[1], reverse=True)
            for rank, (label, score) in enumerate(ranked):
                labels_by_rank[rank].append(label)
                scores_by_rank[rank].append(score)

    # Interleave predN/scoreN columns and move them after the originals.
    interleaved = []
    for rank in range(16):
        out_df[f'pred{rank + 1}'] = labels_by_rank[rank]
        out_df[f'score{rank + 1}'] = scores_by_rank[rank]
        interleaved.extend((f'pred{rank + 1}', f'score{rank + 1}'))

    leading = [col for col in out_df.columns if col not in interleaved]
    return out_df[leading + interleaved]
112
-
113
# Sentence-level predictions with batch processing
def predict_sentences(sentence_df, batch_size=32):
    """
    Attach 16 (pred, score) column pairs to a sentence-level DataFrame.

    Same scheme as predict_pages, but reads the 'Sentence' column and
    rounds every probability to 3 decimal places.
    """
    model, tokenizer = load_model_and_tokenizer()
    out_df = sentence_df.copy()
    total_rows = len(sentence_df)
    labels_by_rank = [[] for _ in range(16)]
    scores_by_rank = [[] for _ in range(16)]

    for offset in range(0, total_rows, batch_size):
        batch = sentence_df.iloc[offset:min(offset + batch_size, total_rows)]
        cleaned = batch['Sentence'].apply(clean_text).apply(prep_text).tolist()
        for probs in predict_sdg_labels_batch(cleaned, model, tokenizer):
            # Rank all 16 SDGs for this sentence, most probable first.
            ranked = sorted(zip(label_list, probs), key=lambda pair: pair[1], reverse=True)
            for rank, (label, score) in enumerate(ranked):
                labels_by_rank[rank].append(label)
                scores_by_rank[rank].append(round(score, 3))

    # Interleave predN/scoreN columns and move them after the originals.
    interleaved = []
    for rank in range(16):
        out_df[f'pred{rank + 1}'] = labels_by_rank[rank]
        out_df[f'score{rank + 1}'] = scores_by_rank[rank]
        interleaved.extend((f'pred{rank + 1}', f'score{rank + 1}'))

    leading = [col for col in out_df.columns if col not in interleaved]
    return out_df[leading + interleaved]
148
-
149
# Official UN brand colours, one per SDG, keyed by the label_list names.
sdg_colors = {
    "SDG1_No Poverty": "#E5243B",
    "SDG2_Zero Hunger": "#DDA63A",
    "SDG3_Good Health and Well-being": "#4C9F38",
    "SDG4_Quality Education": "#C5192D",
    "SDG5_Gender Equality": "#FF3A21",
    "SDG6_Clean Water and Sanitation": "#26BDE2",
    "SDG7_Affordable and Clean Energy": "#FCC30B",
    "SDG8_Decent Work and Economic Growth": "#A21942",
    "SDG9_Industry, Innovation and Infrastructure": "#FD6925",
    "SDG10_Reduced Inequality": "#DD1367",
    "SDG11_Sustainable Cities and Communities": "#FD9D24",
    "SDG12_Responsible Consumption and Production": "#BF8B2E",
    "SDG13_Climate Action": "#3F7E44",
    "SDG14_Life Below Water": "#0A97D9",
    "SDG15_Life on Land": "#56C02B",
    "SDG16_Peace, Justice and Strong Institutions": "#00689D",
}
168
-
169
# Horizontal bar chart of SDG prediction frequencies, with icon overlay
def plot_sdg(df, title, pred_column, icons_folder='assets/icons/'):
    """
    Plot the percentage distribution of one prediction column as a
    horizontal bar chart and superimpose the icon of the dominant SDG.

    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Column name to use for plotting (e.g., 'pred1').
        icons_folder (str): Path to the folder containing SDG icons
            (expects files named like 'SDG1.png').

    Returns:
        plotly.graph_objs._figure.Figure: The assembled Plotly figure.
    """
    counts = df[df[pred_column].notna()][pred_column].value_counts().sort_values(ascending=False)
    share = (counts / counts.sum()) * 100

    fig = px.bar(
        share.rename_axis('SDG Label').reset_index(name='Percentage'),
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors  # one fixed UN colour per SDG
    )

    fig.update_yaxes(showticklabels=True)

    # Percentage labels drawn on the bars themselves.
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont=dict(size=10)
    )

    fig.update_layout(
        title=dict(text=title, font=dict(size=14)),
        yaxis=dict(automargin=True, title=None, tickfont=dict(size=12)),
        margin=dict(l=20, r=30, t=100, b=20),  # extra right margin for icon
        height=600,
        width=800,
        showlegend=False,
        template="simple_white",
        xaxis=dict(tickfont=dict(size=12)),
    )

    if not share.empty:
        # Most frequent SDG, e.g. 'SDG1_No Poverty' -> icon 'SDG1.png'.
        dominant_label = share.index[0]
        icon_path = os.path.join(icons_folder, f"{dominant_label.split('_')[0]}.png")

        if os.path.exists(icon_path):
            with open(icon_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
            # Inline the icon as a base64 data URI above the plot area.
            fig.add_layout_image(
                dict(
                    source='data:image/png;base64,' + encoded_image,
                    xref="paper", yref="paper",
                    x=0.4, y=1.2,
                    sizex=0.2, sizey=0.2,
                    xanchor="left",
                    yanchor="top",
                    layer="above"
                )
            )
        else:
            print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")

    return fig
261
-
262
def save_figure_as_jpeg(fig, filename):
    """Export a Plotly figure to `filename` as a high-resolution JPEG."""
    pio.write_image(
        fig,
        filename,
        format='jpeg',
        width=1000,
        height=700,
        scale=5,  # upscales for print-quality output
    )
265
-
266
# Generate reports (page level)
def generate_page_report(df_pages, report_file_name):
    """
    Build a Word report summarising page-level SDG predictions.

    Args:
        df_pages (pd.DataFrame): Page-level predictions; must contain
            'Document', 'pred1' and 'pred2' columns.
        report_file_name (str): Path the .docx report is saved to.

    Returns:
        str: The saved report path (same as report_file_name).
    """
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )

    doc.add_heading("📊 Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the most essential SDG the AI model associates with pages. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )

    doc.add_heading("📈 Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_pages['Document'].unique():
        # Sanitize doc_name so it is safe to embed in temp image file names.
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_pages[df_pages['Document'] == doc_name]

        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"

        try:
            # Render the two bar charts to temporary JPEGs, then embed them.
            plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
                first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
            plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
                second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

            doc.add_picture(first_sdg_plot_path, width=Inches(6))
            doc.add_picture(second_sdg_plot_path, width=Inches(6))
        finally:
            # Fix: the original left these temp chart images on disk forever;
            # remove them once embedded (or on failure part-way through).
            for plot_path in (first_sdg_plot_path, second_sdg_plot_path):
                if os.path.exists(plot_path):
                    os.remove(plot_path)

    doc.save(report_file_name)
    return report_file_name
317
-
318
def generate_sentence_report(df_sentences, report_file_name):
    """
    Build a Word report summarising sentence-level SDG predictions.

    Args:
        df_sentences (pd.DataFrame): Sentence-level predictions; must contain
            'Document', 'pred1' and 'pred2' columns.
        report_file_name (str): Path the .docx report is saved to.

    Returns:
        str: The saved report path (same as report_file_name).
    """
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )

    doc.add_heading("📊 Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the most essential SDG the AI model associates with sentences. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
        'into the dominant sustainable development theme within the document.'
    )

    doc.add_heading("📈 Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_sentences['Document'].unique():
        # Sanitize doc_name so it is safe to embed in temp image file names.
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_sentences[df_sentences['Document'] == doc_name]

        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"

        try:
            # Render the two bar charts to temporary JPEGs, then embed them.
            plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
                first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
            plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
                second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

            doc.add_picture(first_sdg_plot_path, width=Inches(6))
            doc.add_picture(second_sdg_plot_path, width=Inches(6))
        finally:
            # Fix: the original left these temp chart images on disk forever;
            # remove them once embedded (or on failure part-way through).
            for plot_path in (first_sdg_plot_path, second_sdg_plot_path):
                if os.path.exists(plot_path):
                    os.remove(plot_path)

    doc.save(report_file_name)
    return report_file_name
369
-
370
# Text extraction with text cleaning and line joining
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None, min_sentence_chars=70):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): Starting page for extraction (1-based index).
        end_page (int, optional): Ending page for extraction (1-based index).
        min_sentence_chars (int): Sentences of this length or shorter are
            excluded from the sentence-level DataFrame. Defaults to 70,
            matching the previous hard-coded threshold.

    Returns:
        tuple:
            - page_df (pd.DataFrame): Columns Document, Page, Text.
            - sentence_df (pd.DataFrame): Columns Document, Page, Sentence.

    Raises:
        RuntimeError: If loading or parsing the PDF fails (original
            exception is chained as the cause).
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load_and_split()  # one document per page

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # document display name

        if start_page is not None and end_page is not None:
            # Normalise user input to a valid, ordered, 1-based page range.
            start_page = int(start_page)
            end_page = int(end_page)
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # swap if reversed
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        page_data = []
        sentence_data = []

        for page_num, doc in enumerate(selected_docs, start=start_page):
            text = doc.page_content.strip()

            # Join physical lines so sentences broken across lines survive.
            joined_text = ' '.join(line.strip() for line in text.split('\n') if line.strip())
            cleaned_text = clean_text(joined_text)

            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })

            # Keep only reasonably long sentences, filtering out headers,
            # footers and stray fragments.
            for sentence in sent_tokenize(cleaned_text):
                sentence = sentence.strip()
                if sentence and len(sentence) > min_sentence_chars:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        return pd.DataFrame(page_data), pd.DataFrame(sentence_data)

    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
453
-
454
def df_to_csv_bytes(df):
    """
    Serialise a DataFrame to CSV and return the raw bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data (no index column).

    Raises:
        RuntimeError: If the CSV serialisation fails.
    """
    try:
        with BytesIO() as buffer:
            df.to_csv(buffer, index=False)
            return buffer.getvalue()
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")
470
-
471
def launch_interface():
    """
    Build and launch the Gradio UI.

    Layout: a shared PDF upload + page-range selector at the top, then two
    tabs (page-level / sentence-level analysis), each with nested
    Primary/Secondary SDG sub-tabs exposing a plot plus CSV/DOCX/JPEG
    downloads. All processing callbacks are defined inside this function so
    they close over the UI components.
    """
    with gr.Blocks(css=custom_css) as demo:

        # Title as a visible heading at the top of the page with an icon
        gr.Markdown(
            """
            # 🌍 SDG Document Analysis App - CPU
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )

        # Shared PDF file input for both analyses
        with gr.Row():
            file_input = gr.File(
                label="📁 Upload PDF File for Analysis", file_types=[".pdf"]
            )

        # Extraction mode selection with explanatory text
        gr.Markdown(
            """
            ## PDF Text Extraction Mode
            Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
            """
        )
        with gr.Row():
            extraction_mode = gr.Radio(
                choices=["All Pages", "Range of Pages"],
                value="All Pages",
                label="Extraction Mode"
            )

        # Page-range inputs start hidden; shown only in "Range of Pages" mode.
        with gr.Row():
            start_page = gr.Number(value=1, label="🔢 Start Page", visible=False)
            end_page = gr.Number(value=1, label="🔢 End Page", visible=False)

        # Function to update visibility of start_page and end_page
        def update_page_inputs(extraction_mode):
            if extraction_mode == "Range of Pages":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )

        # Main Tabs for Page-Level and Sentence-Level Analysis
        gr.Markdown("## SDG Analysis Type")

        with gr.Tab("📄 Page-Level Analysis"):
            gr.Markdown(
                """
                ### Page-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )
            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    primary_page_plot = gr.Plot(label="📊 Primary SDGs [Page-Level]")

                    with gr.Row():
                        page_csv = gr.File(label="📊 Download Page Predictions CSV")
                        page_docx = gr.File(label="📄 Download Page Report DOCX")
                        page_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    secondary_page_plot = gr.Plot(label="📈 Secondary SDGs [Page-Level]")

                    with gr.Row():
                        page_csv_secondary = gr.File(label="📊 Download Page Predictions CSV")
                        page_report_file_secondary = gr.File(label="📄 Download Page Report DOCX")
                        secondary_page_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

            with gr.Row():
                page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
                reset_page_button = gr.Button("🔄 Reset Page-Level Analysis", elem_classes="reset-button")

        with gr.Tab("✍️ Sentence-Level Analysis"):
            gr.Markdown(
                """
                ### Sentence-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **detailed SDG mapping** at the sentence level.
                """
            )
            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    primary_sentence_plot = gr.Plot(label="📊 Primary SDGs [Sentence-Level]")

                    with gr.Row():
                        sentence_csv = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_docx = gr.File(label="📄 Download Sentence Report DOCX")
                        sentence_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    secondary_sentence_plot = gr.Plot(label="📈 Secondary SDGs [Sentence-Level]")

                    with gr.Row():
                        sentence_csv_secondary = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_report_file_secondary = gr.File(label="📄 Download Sentence Report DOCX")
                        secondary_sentence_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

            with gr.Row():
                sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
                reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis", elem_classes="reset-button")

        # Function to process page-level analysis.
        # Returns an 8-tuple matching the outputs wired in page_button.click:
        # (primary plot, secondary plot, CSV, DOCX, JPEG, CSV-2, DOCX-2, JPEG-2).
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                        original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames (sentence frame unused here)
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_page_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_page_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names
                page_csv_file = f"{sanitized_file_name}_page_predictions_primary.csv"
                page_report_file = f"{sanitized_file_name}_page_report_primary.docx"
                primary_page_jpeg = f"{sanitized_file_name}_primary_page.jpeg"

                page_csv_file_secondary = f"{sanitized_file_name}_page_predictions_secondary.csv"
                page_report_file_secondary = f"{sanitized_file_name}_page_report_secondary.docx"
                secondary_page_jpeg = f"{sanitized_file_name}_secondary_page.jpeg"

                # Save CSV and reports (primary and secondary copies share the
                # same predictions frame; only the file names differ)
                df_page_predictions.to_csv(page_csv_file, index=False)
                page_report_primary = generate_page_report(df_page_predictions, page_report_file)

                df_page_predictions.to_csv(page_csv_file_secondary, index=False)
                page_report_secondary = generate_page_report(df_page_predictions, page_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_page_jpeg)
                save_figure_as_jpeg(second_plot, secondary_page_jpeg)

                return (
                    first_plot, second_plot,
                    page_csv_file, page_report_file, primary_page_jpeg,
                    page_csv_file_secondary, page_report_file_secondary, secondary_page_jpeg
                )

            except Exception as e:
                # Swallow and log: the UI simply shows empty outputs on failure.
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Function to process sentence-level analysis.
        # Mirrors process_pages but operates on the sentence-level DataFrame.
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                        original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames (page frame unused here)
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_sentence_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_sentence_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names
                sentence_csv_file = f"{sanitized_file_name}_sentence_predictions_primary.csv"
                sentence_report_file = f"{sanitized_file_name}_sentence_report_primary.docx"
                primary_sentence_jpeg = f"{sanitized_file_name}_primary_sentence.jpeg"

                sentence_csv_file_secondary = f"{sanitized_file_name}_sentence_predictions_secondary.csv"
                sentence_report_file_secondary = f"{sanitized_file_name}_sentence_report_secondary.docx"
                secondary_sentence_jpeg = f"{sanitized_file_name}_secondary_sentence.jpeg"

                # Save CSV and reports
                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
                sentence_report_primary = generate_sentence_report(df_sentence_predictions, sentence_report_file)

                df_sentence_predictions.to_csv(sentence_csv_file_secondary, index=False)
                sentence_report_secondary = generate_sentence_report(df_sentence_predictions, sentence_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)

                return (
                    first_plot, second_plot,
                    sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
                    sentence_csv_file_secondary, sentence_report_file_secondary, secondary_sentence_jpeg
                )

            except Exception as e:
                # Swallow and log: the UI simply shows empty outputs on failure.
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Reset functions to clear the outputs (one None per output component)
        def reset_page_outputs():
            return [None, None, None, None, None, None, None, None]

        def reset_sentence_outputs():
            return [None, None, None, None, None, None, None, None]

        # Button actions for Page-Level Analysis
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_page_plot,            # 📊 Primary SDGs [Page-Level]
                secondary_page_plot,          # 📈 Secondary SDGs [Page-Level]
                page_csv,                     # 📊 Download Page Predictions CSV
                page_docx,                    # 📄 Download Page Report DOCX
                page_jpeg1,                   # 🖼️ Download Primary SDGs JPEG
                page_csv_secondary,           # 📊 Download Page Predictions CSV (Secondary)
                page_report_file_secondary,   # 📄 Download Page Report DOCX (Secondary)
                secondary_page_jpeg           # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_page_button.click(
            reset_page_outputs,
            outputs=[
                primary_page_plot,
                secondary_page_plot,
                page_csv,
                page_docx,
                page_jpeg1,
                page_csv_secondary,
                page_report_file_secondary,
                secondary_page_jpeg
            ]
        )

        # Button actions for Sentence-Level Analysis
        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_sentence_plot,            # 📊 Primary SDGs [Sentence-Level]
                secondary_sentence_plot,          # 📈 Secondary SDGs [Sentence-Level]
                sentence_csv,                     # 📊 Download Sentence Predictions CSV
                sentence_docx,                    # 📄 Download Sentence Report DOCX
                sentence_jpeg1,                   # 🖼️ Download Primary SDGs JPEG
                sentence_csv_secondary,           # 📊 Download Sentence Predictions CSV (Secondary)
                sentence_report_file_secondary,   # 📄 Download Sentence Report DOCX (Secondary)
                secondary_sentence_jpeg           # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[
                primary_sentence_plot,
                secondary_sentence_plot,
                sentence_csv,
                sentence_docx,
                sentence_jpeg1,
                sentence_csv_secondary,
                sentence_report_file_secondary,
                secondary_sentence_jpeg
            ]
        )

    # Enable request queueing and start the server.
    demo.queue().launch()

# Module-level entry point: the app launches on import/run.
launch_interface()