Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -25,6 +25,18 @@ from langchain_community.document_loaders import PyPDFLoader
|
|
| 25 |
# Model checkpoint for SDG BERT
|
| 26 |
checkpoint = "sadickam/sdgBERT"
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# Preprocessing function for text
|
| 29 |
def prep_text(text):
|
| 30 |
clean_sents = []
|
|
@@ -74,7 +86,8 @@ def predict_pages(page_df, batch_size=32):
|
|
| 74 |
for start in range(0, num_rows, batch_size):
|
| 75 |
end = min(start + batch_size, num_rows)
|
| 76 |
df_chunk = page_df.iloc[start:end]
|
| 77 |
-
|
|
|
|
| 78 |
predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
|
| 79 |
for predictions in predictions_batch:
|
| 80 |
sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
|
|
@@ -109,7 +122,8 @@ def predict_sentences(sentence_df, batch_size=32):
|
|
| 109 |
for start in range(0, num_rows, batch_size):
|
| 110 |
end = min(start + batch_size, num_rows)
|
| 111 |
df_chunk = sentence_df.iloc[start:end]
|
| 112 |
-
|
|
|
|
| 113 |
predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
|
| 114 |
for predictions in predictions_batch:
|
| 115 |
sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
|
|
@@ -310,7 +324,7 @@ def generate_sentence_report(df_sentences):
|
|
| 310 |
doc.save("sentence_report.docx")
|
| 311 |
return "sentence_report.docx"
|
| 312 |
|
| 313 |
-
# New text extraction functions
|
| 314 |
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
|
| 315 |
"""
|
| 316 |
Extract text from a PDF page by page using LangChain's PyPDFLoader.
|
|
@@ -360,15 +374,22 @@ def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=Non
|
|
| 360 |
page_num = idx
|
| 361 |
text = doc.page_content.strip()
|
| 362 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 363 |
# Append page-wise data
|
| 364 |
page_data.append({
|
| 365 |
"Document": doc_name,
|
| 366 |
"Page": page_num,
|
| 367 |
-
"Text":
|
| 368 |
})
|
| 369 |
|
| 370 |
# Sentence tokenization
|
| 371 |
-
sentences = sent_tokenize(
|
| 372 |
for sentence in sentences:
|
| 373 |
sentence = sentence.strip()
|
| 374 |
if sentence:
|
|
@@ -407,10 +428,10 @@ def df_to_csv_bytes(df):
|
|
| 407 |
def launch_interface():
|
| 408 |
with gr.Blocks(title="SDG Document Analysis App") as demo:
|
| 409 |
|
| 410 |
-
# Title as a visible heading at the top of the page
|
| 411 |
gr.Markdown(
|
| 412 |
"""
|
| 413 |
-
# SDG Document Analysis App
|
| 414 |
Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
|
| 415 |
"""
|
| 416 |
)
|
|
@@ -421,14 +442,23 @@ def launch_interface():
|
|
| 421 |
label="Upload PDF File for Analysis", file_types=[".pdf"]
|
| 422 |
)
|
| 423 |
|
| 424 |
-
# Extraction mode selection
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
|
|
|
| 429 |
)
|
| 430 |
-
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
|
| 433 |
# Function to update visibility of start_page and end_page
|
| 434 |
def update_page_inputs(extraction_mode):
|
|
@@ -447,7 +477,7 @@ def launch_interface():
|
|
| 447 |
with gr.Tab("Page-Level Analysis"):
|
| 448 |
gr.Markdown(
|
| 449 |
"""
|
| 450 |
-
## Page-Level SDG Analysis
|
| 451 |
This section conducts Sustainable Development Goals (SDG) mapping
|
| 452 |
of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
|
| 453 |
It provides **high-level SDG mapping** of documents at the page level.
|
|
@@ -471,7 +501,7 @@ def launch_interface():
|
|
| 471 |
with gr.Tab("Sentence-Level Analysis"):
|
| 472 |
gr.Markdown(
|
| 473 |
"""
|
| 474 |
-
## Sentence-Level SDG Analysis
|
| 475 |
This section conducts Sustainable Development Goals (SDG) mapping
|
| 476 |
using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
|
| 477 |
It provides **detailed SDG mapping** at the sentence level.
|
|
|
|
| 25 |
# Model checkpoint for SDG BERT
|
| 26 |
checkpoint = "sadickam/sdgBERT"
|
| 27 |
|
| 28 |
+
# Text cleaning function
def clean_text(text):
    """
    Clean extracted PDF text by removing irrelevant characters while
    retaining currency symbols and basic punctuation.

    Parameters:
        text (str): Raw text extracted from a PDF page.

    Returns:
        str: Cleaned text — disallowed characters removed, runs of
        whitespace collapsed to single spaces, and no leading or
        trailing whitespace.
    """
    text = text.strip()
    # Keep letters, digits, whitespace, basic punctuation (. , ! ?),
    # and common currency symbols; drop everything else.
    allowed_chars = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    text = re.sub(allowed_chars, '', text)
    # Collapse any run of whitespace to a single space.
    text = re.sub(r'\s+', ' ', text)
    # Fix: character removal can leave stray leading/trailing spaces
    # (e.g. "abc #" -> "abc "), so strip the final result as well.
    return text.strip()
|
| 39 |
+
|
| 40 |
# Preprocessing function for text
|
| 41 |
def prep_text(text):
|
| 42 |
clean_sents = []
|
|
|
|
| 86 |
for start in range(0, num_rows, batch_size):
|
| 87 |
end = min(start + batch_size, num_rows)
|
| 88 |
df_chunk = page_df.iloc[start:end]
|
| 89 |
+
# Clean text
|
| 90 |
+
texts = df_chunk['Text'].apply(clean_text).apply(prep_text).tolist()
|
| 91 |
predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
|
| 92 |
for predictions in predictions_batch:
|
| 93 |
sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
|
|
|
|
| 122 |
for start in range(0, num_rows, batch_size):
|
| 123 |
end = min(start + batch_size, num_rows)
|
| 124 |
df_chunk = sentence_df.iloc[start:end]
|
| 125 |
+
# Clean text
|
| 126 |
+
texts = df_chunk['Sentence'].apply(clean_text).apply(prep_text).tolist()
|
| 127 |
predictions_batch = predict_sdg_labels_batch(texts, model, tokenizer)
|
| 128 |
for predictions in predictions_batch:
|
| 129 |
sorted_preds = sorted(zip(label_list, predictions), key=lambda x: x[1], reverse=True)
|
|
|
|
| 324 |
doc.save("sentence_report.docx")
|
| 325 |
return "sentence_report.docx"
|
| 326 |
|
| 327 |
+
# New text extraction functions with text cleaning and line joining
|
| 328 |
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
|
| 329 |
"""
|
| 330 |
Extract text from a PDF page by page using LangChain's PyPDFLoader.
|
|
|
|
| 374 |
page_num = idx
|
| 375 |
text = doc.page_content.strip()
|
| 376 |
|
| 377 |
+
# Join lines that belong to the same sentence
|
| 378 |
+
lines = text.split('\n')
|
| 379 |
+
joined_text = ' '.join(line.strip() for line in lines if line.strip())
|
| 380 |
+
|
| 381 |
+
# Clean text
|
| 382 |
+
cleaned_text = clean_text(joined_text)
|
| 383 |
+
|
| 384 |
# Append page-wise data
|
| 385 |
page_data.append({
|
| 386 |
"Document": doc_name,
|
| 387 |
"Page": page_num,
|
| 388 |
+
"Text": cleaned_text
|
| 389 |
})
|
| 390 |
|
| 391 |
# Sentence tokenization
|
| 392 |
+
sentences = sent_tokenize(cleaned_text)
|
| 393 |
for sentence in sentences:
|
| 394 |
sentence = sentence.strip()
|
| 395 |
if sentence:
|
|
|
|
| 428 |
def launch_interface():
|
| 429 |
with gr.Blocks(title="SDG Document Analysis App") as demo:
|
| 430 |
|
| 431 |
+
# Title as a visible heading at the top of the page with an icon
|
| 432 |
gr.Markdown(
|
| 433 |
"""
|
| 434 |
+
# 🌍 SDG Document Analysis App
|
| 435 |
Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
|
| 436 |
"""
|
| 437 |
)
|
|
|
|
| 442 |
label="Upload PDF File for Analysis", file_types=[".pdf"]
|
| 443 |
)
|
| 444 |
|
| 445 |
+
# Extraction mode selection with explanatory text
|
| 446 |
+
gr.Markdown(
|
| 447 |
+
"""
|
| 448 |
+
## Extraction Mode
|
| 449 |
+
Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
|
| 450 |
+
"""
|
| 451 |
)
|
| 452 |
+
with gr.Row():
|
| 453 |
+
extraction_mode = gr.Radio(
|
| 454 |
+
choices=["All Pages", "Range of Pages"],
|
| 455 |
+
value="All Pages",
|
| 456 |
+
label="Extraction Mode"
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
with gr.Row():
|
| 460 |
+
start_page = gr.Number(value=1, label="Start Page", visible=False)
|
| 461 |
+
end_page = gr.Number(value=1, label="End Page", visible=False)
|
| 462 |
|
| 463 |
# Function to update visibility of start_page and end_page
|
| 464 |
def update_page_inputs(extraction_mode):
|
|
|
|
| 477 |
with gr.Tab("Page-Level Analysis"):
|
| 478 |
gr.Markdown(
|
| 479 |
"""
|
| 480 |
+
## 📄 Page-Level SDG Analysis
|
| 481 |
This section conducts Sustainable Development Goals (SDG) mapping
|
| 482 |
of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
|
| 483 |
It provides **high-level SDG mapping** of documents at the page level.
|
|
|
|
| 501 |
with gr.Tab("Sentence-Level Analysis"):
|
| 502 |
gr.Markdown(
|
| 503 |
"""
|
| 504 |
+
## ✍️ Sentence-Level SDG Analysis
|
| 505 |
This section conducts Sustainable Development Goals (SDG) mapping
|
| 506 |
using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
|
| 507 |
It provides **detailed SDG mapping** at the sentence level.
|