sadickam commited on
Commit
e230b99
·
verified ·
1 Parent(s): 6f5e36a

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -807
app.py DELETED
@@ -1,807 +0,0 @@
1
- import gradio as gr
2
- import os
3
- import re
4
- import torch
5
- import pandas as pd
6
- import plotly.express as px
7
- import plotly.io as pio
8
- import nltk
9
- import tempfile
10
- from io import BytesIO
11
- import base64
12
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
- from nltk.tokenize import sent_tokenize
14
- from docx.shared import Inches
15
- from docx import Document
16
- import numpy as np
17
-
18
- from styles import custom_css # Importing custom CSS
19
-
20
# Fetch the Punkt tokenizer models required by nltk.tokenize.sent_tokenize.
nltk.download('punkt')

# LangChain's PyPDFLoader handles page-wise PDF text extraction.
from langchain_community.document_loaders import PyPDFLoader

# Hugging Face checkpoint of the SDG classification model.
checkpoint = "sadickam/sdgBERT"
27
-
28
# Text cleaning function
def clean_text(text):
    """
    Clean extracted text while retaining currency symbols.

    Removes every character outside an allow-list (letters, digits,
    whitespace, basic punctuation and common currency symbols), then
    collapses runs of whitespace into single spaces.
    """
    # Anything NOT in this set is deleted.
    disallowed = r'[^a-zA-Z0-9\s\.,!?$€£¥₹¢₩]'
    filtered = re.sub(disallowed, '', text.strip())
    # Normalise all whitespace runs to a single space.
    return re.sub(r'\s+', ' ', filtered)
39
-
40
# Preprocessing function for text
def prep_text(text):
    """
    Normalise text for the classifier: lowercase every token, collapse
    whitespace sentence-by-sentence, and strip backticks / double quotes.
    """
    normalised_sentences = []
    for sentence in sent_tokenize(str(text)):
        tokens = (str(tok).strip().lower() for tok in sentence.split())
        normalised_sentences.append(' '.join(tokens))
    joined = ' '.join(normalised_sentences).strip()
    return re.sub(r'`|"', "", joined)
49
-
50
# Load the tokenizer and model (model is moved to the global `device`)
def load_model_and_tokenizer():
    """Load sdgBERT and its tokenizer from the `checkpoint` constant."""
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
    return model, tokenizer
55
-
56
# Computation device: prefer CUDA when a GPU is available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Human-readable labels for the 16 SDG classes predicted by sdgBERT,
# in model output-index order.
label_list = [
    'SDG1_No Poverty',
    'SDG2_Zero Hunger',
    'SDG3_Good Health and Well-being',
    'SDG4_Quality Education',
    'SDG5_Gender Equality',
    'SDG6_Clean Water and Sanitation',
    'SDG7_Affordable and Clean Energy',
    'SDG8_Decent Work and Economic Growth',
    'SDG9_Industry, Innovation and Infrastructure',
    'SDG10_Reduced Inequality',
    'SDG11_Sustainable Cities and Communities',
    'SDG12_Responsible Consumption and Production',
    'SDG13_Climate Action',
    'SDG14_Life Below Water',
    'SDG15_Life on Land',
    'SDG16_Peace, Justice and Strong Institutions',
]
68
-
69
# Classify a batch of strings; returns one softmax distribution per input
def predict_sdg_labels_batch(texts, model, tokenizer):
    """Tokenize `texts`, run the model once, and return per-class probabilities."""
    encoded = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
    return torch.softmax(logits, dim=1).tolist()
77
-
78
# Page-level predictions with batch processing
def predict_pages(page_df, batch_size=32):
    """
    Attach 16 (pred, score) column pairs to a page-level DataFrame.

    Each row's 'Text' is cleaned, classified in batches of `batch_size`,
    and the 16 SDG labels are stored per row in descending-probability
    order as pred1/score1 ... pred16/score16 (interleaved at the end of
    the frame, after the original columns).
    """
    model, tokenizer = load_model_and_tokenizer()
    out_df = page_df.copy()
    total_rows = len(page_df)
    labels_by_rank = [[] for _ in range(16)]
    scores_by_rank = [[] for _ in range(16)]

    for offset in range(0, total_rows, batch_size):
        batch = page_df.iloc[offset:min(offset + batch_size, total_rows)]
        cleaned = batch['Text'].apply(clean_text).apply(prep_text).tolist()
        for probs in predict_sdg_labels_batch(cleaned, model, tokenizer):
            # Rank all 16 SDGs for this page, most probable first.
            ranked = sorted(zip(label_list, probs), key=lambda pair: pair[1], reverse=True)
            for rank, (label, score) in enumerate(ranked):
                labels_by_rank[rank].append(label)
                scores_by_rank[rank].append(score)

    # Interleave predN/scoreN columns and move them after the originals.
    interleaved = []
    for rank in range(16):
        out_df[f'pred{rank + 1}'] = labels_by_rank[rank]
        out_df[f'score{rank + 1}'] = scores_by_rank[rank]
        interleaved.extend((f'pred{rank + 1}', f'score{rank + 1}'))

    leading = [col for col in out_df.columns if col not in interleaved]
    return out_df[leading + interleaved]
112
-
113
# Sentence-level predictions with batch processing
def predict_sentences(sentence_df, batch_size=32):
    """
    Attach 16 (pred, score) column pairs to a sentence-level DataFrame.

    Same scheme as predict_pages, but reads the 'Sentence' column and
    rounds every probability to 3 decimal places.
    """
    model, tokenizer = load_model_and_tokenizer()
    out_df = sentence_df.copy()
    total_rows = len(sentence_df)
    labels_by_rank = [[] for _ in range(16)]
    scores_by_rank = [[] for _ in range(16)]

    for offset in range(0, total_rows, batch_size):
        batch = sentence_df.iloc[offset:min(offset + batch_size, total_rows)]
        cleaned = batch['Sentence'].apply(clean_text).apply(prep_text).tolist()
        for probs in predict_sdg_labels_batch(cleaned, model, tokenizer):
            # Rank all 16 SDGs for this sentence, most probable first.
            ranked = sorted(zip(label_list, probs), key=lambda pair: pair[1], reverse=True)
            for rank, (label, score) in enumerate(ranked):
                labels_by_rank[rank].append(label)
                scores_by_rank[rank].append(round(score, 3))

    # Interleave predN/scoreN columns and move them after the originals.
    interleaved = []
    for rank in range(16):
        out_df[f'pred{rank + 1}'] = labels_by_rank[rank]
        out_df[f'score{rank + 1}'] = scores_by_rank[rank]
        interleaved.extend((f'pred{rank + 1}', f'score{rank + 1}'))

    leading = [col for col in out_df.columns if col not in interleaved]
    return out_df[leading + interleaved]
148
-
149
# Official UN brand colours, one per SDG, keyed by the label_list names.
sdg_colors = {
    "SDG1_No Poverty": "#E5243B",
    "SDG2_Zero Hunger": "#DDA63A",
    "SDG3_Good Health and Well-being": "#4C9F38",
    "SDG4_Quality Education": "#C5192D",
    "SDG5_Gender Equality": "#FF3A21",
    "SDG6_Clean Water and Sanitation": "#26BDE2",
    "SDG7_Affordable and Clean Energy": "#FCC30B",
    "SDG8_Decent Work and Economic Growth": "#A21942",
    "SDG9_Industry, Innovation and Infrastructure": "#FD6925",
    "SDG10_Reduced Inequality": "#DD1367",
    "SDG11_Sustainable Cities and Communities": "#FD9D24",
    "SDG12_Responsible Consumption and Production": "#BF8B2E",
    "SDG13_Climate Action": "#3F7E44",
    "SDG14_Life Below Water": "#0A97D9",
    "SDG15_Life on Land": "#56C02B",
    "SDG16_Peace, Justice and Strong Institutions": "#00689D",
}
168
-
169
# Horizontal bar chart of SDG prediction frequencies, with icon overlay
def plot_sdg(df, title, pred_column, icons_folder='assets/icons/'):
    """
    Plot the percentage distribution of one prediction column as a
    horizontal bar chart and superimpose the icon of the dominant SDG.

    Args:
        df (pd.DataFrame): DataFrame containing SDG predictions.
        title (str): Title of the plot.
        pred_column (str): Column name to use for plotting (e.g., 'pred1').
        icons_folder (str): Path to the folder containing SDG icons
            (expects files named like 'SDG1.png').

    Returns:
        plotly.graph_objs._figure.Figure: The assembled Plotly figure.
    """
    counts = df[df[pred_column].notna()][pred_column].value_counts().sort_values(ascending=False)
    share = (counts / counts.sum()) * 100

    fig = px.bar(
        share.rename_axis('SDG Label').reset_index(name='Percentage'),
        y='SDG Label',
        x='Percentage',
        orientation='h',
        title=title,
        color='SDG Label',
        color_discrete_map=sdg_colors  # one fixed UN colour per SDG
    )

    fig.update_yaxes(showticklabels=True)

    # Percentage labels drawn on the bars themselves.
    fig.update_traces(
        texttemplate='%{x:.2f}%',
        textposition='auto',
        textfont=dict(size=10)
    )

    fig.update_layout(
        title=dict(text=title, font=dict(size=14)),
        yaxis=dict(automargin=True, title=None, tickfont=dict(size=12)),
        margin=dict(l=20, r=30, t=100, b=20),  # extra right margin for icon
        height=600,
        width=800,
        showlegend=False,
        template="simple_white",
        xaxis=dict(tickfont=dict(size=12)),
    )

    if not share.empty:
        # Most frequent SDG, e.g. 'SDG1_No Poverty' -> icon 'SDG1.png'.
        dominant_label = share.index[0]
        icon_path = os.path.join(icons_folder, f"{dominant_label.split('_')[0]}.png")

        if os.path.exists(icon_path):
            with open(icon_path, 'rb') as image_file:
                encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
            # Inline the icon as a base64 data URI above the plot area.
            fig.add_layout_image(
                dict(
                    source='data:image/png;base64,' + encoded_image,
                    xref="paper", yref="paper",
                    x=0.4, y=1.2,
                    sizex=0.2, sizey=0.2,
                    xanchor="left",
                    yanchor="top",
                    layer="above"
                )
            )
        else:
            print(f"Icon file '{icon_path}' not found. Skipping icon overlay.")

    return fig
261
-
262
def save_figure_as_jpeg(fig, filename):
    """Export a Plotly figure to `filename` as a high-resolution JPEG."""
    pio.write_image(
        fig,
        filename,
        format='jpeg',
        width=1000,
        height=700,
        scale=5,  # upscales for print-quality output
    )
265
-
266
# Generate reports (page level)
def generate_page_report(df_pages, report_file_name):
    """
    Build a Word report summarising page-level SDG predictions.

    Args:
        df_pages (pd.DataFrame): Page-level predictions; must contain
            'Document', 'pred1' and 'pred2' columns.
        report_file_name (str): Path the .docx report is saved to.

    Returns:
        str: The saved report path (same as report_file_name).
    """
    doc = Document()
    doc.add_heading("Page-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app conducts page-level analysis of the uploaded document. Each page is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This page-level '
        'analysis provides high-level insight into SDG alignment.'
        '\n\n'
        'Given that a page may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each page with a probability score greater than zero.'
    )

    doc.add_heading("📊 Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the most essential SDG the AI model associates with pages. The bars '
        'represent the percentage of pages most strongly aligned with each SDG. This offers insight into the dominant '
        'sustainable development theme within the document.'
    )

    doc.add_heading("📈 Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for pages. Although these SDGs are '
        'not the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_pages['Document'].unique():
        # Sanitize doc_name so it is safe to embed in temp image file names.
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_pages[df_pages['Document'] == doc_name]

        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_page.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_page.jpeg"

        try:
            # Render the two bar charts to temporary JPEGs, then embed them.
            plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
                first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
            plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
                second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

            doc.add_picture(first_sdg_plot_path, width=Inches(6))
            doc.add_picture(second_sdg_plot_path, width=Inches(6))
        finally:
            # Fix: the original left these temp chart images on disk forever;
            # remove them once embedded (or on failure part-way through).
            for plot_path in (first_sdg_plot_path, second_sdg_plot_path):
                if os.path.exists(plot_path):
                    os.remove(plot_path)

    doc.save(report_file_name)
    return report_file_name
317
-
318
def generate_sentence_report(df_sentences, report_file_name):
    """
    Build a Word report summarising sentence-level SDG predictions.

    Args:
        df_sentences (pd.DataFrame): Sentence-level predictions; must contain
            'Document', 'pred1' and 'pred2' columns.
        report_file_name (str): Path the .docx report is saved to.

    Returns:
        str: The saved report path (same as report_file_name).
    """
    doc = Document()
    doc.add_heading("Sentence-Level SDG Analysis Report", 0)

    doc.add_heading("📋 General Notes", level=2)
    doc.add_paragraph(
        'This app splits documents into sentences using a natural language processing algorithm. '
        'Each sentence is processed by the sdgBERT AI model trained to predict the first 16 '
        'Sustainable Development Goals (SDGs). The model analyzes the content and returns scores '
        'representing the likelihood that the text is aligned with particular SDGs. This sentence-level '
        'analysis provides deeper insight into SDG alignment.'
        '\n\n'
        'Given that a sentence may align with more than one SDG, this app focuses on the top two SDG predictions '
        '(Primary and Secondary) for each sentence with a probability score greater than zero.'
    )

    doc.add_heading("📊 Primary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph displays the most essential SDG the AI model associates with sentences. The bars '
        'represent the percentage of sentences most strongly aligned with each SDG. This offers more profound insight '
        'into the dominant sustainable development theme within the document.'
    )

    doc.add_heading("📈 Secondary SDGs Bar Graph", level=3)
    doc.add_paragraph(
        'This graph shows the second most relevant SDGs for sentences. Although these SDGs are not '
        'the primary focus, the text has some relevance to these goals.'
    )

    for doc_name in df_sentences['Document'].unique():
        # Sanitize doc_name so it is safe to embed in temp image file names.
        sanitized_doc_name = re.sub(r'[^\w\-]', '_', os.path.splitext(doc_name)[0])

        doc.add_heading(f"📄 Document: {doc_name}", level=2)
        df_doc = df_sentences[df_sentences['Document'] == doc_name]

        first_sdg_plot_path = f"{sanitized_doc_name}_first_sdg_sentence.jpeg"
        second_sdg_plot_path = f"{sanitized_doc_name}_second_sdg_sentence.jpeg"

        try:
            # Render the two bar charts to temporary JPEGs, then embed them.
            plot_sdg(df_doc, "Primary SDGs", 'pred1').write_image(
                first_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")
            plot_sdg(df_doc, "Secondary SDGs", 'pred2').write_image(
                second_sdg_plot_path, format='jpeg', scale=7, engine="kaleido")

            doc.add_picture(first_sdg_plot_path, width=Inches(6))
            doc.add_picture(second_sdg_plot_path, width=Inches(6))
        finally:
            # Fix: the original left these temp chart images on disk forever;
            # remove them once embedded (or on failure part-way through).
            for plot_path in (first_sdg_plot_path, second_sdg_plot_path):
                if os.path.exists(plot_path):
                    os.remove(plot_path)

    doc.save(report_file_name)
    return report_file_name
369
-
370
# Text extraction with text cleaning and line joining
def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None, min_sentence_chars=70):
    """
    Extract text from a PDF page by page using LangChain's PyPDFLoader.

    Args:
        pdf_file_path (str): The file path to the uploaded PDF.
        start_page (int, optional): Starting page for extraction (1-based index).
        end_page (int, optional): Ending page for extraction (1-based index).
        min_sentence_chars (int): Sentences of this length or shorter are
            excluded from the sentence-level DataFrame. Defaults to 70,
            matching the previous hard-coded threshold.

    Returns:
        tuple:
            - page_df (pd.DataFrame): Columns Document, Page, Text.
            - sentence_df (pd.DataFrame): Columns Document, Page, Sentence.

    Raises:
        RuntimeError: If loading or parsing the PDF fails (original
            exception is chained as the cause).
    """
    try:
        loader = PyPDFLoader(pdf_file_path)
        documents = loader.load_and_split()  # one document per page

        total_pages = len(documents)
        doc_name = os.path.basename(pdf_file_path)  # document display name

        if start_page is not None and end_page is not None:
            # Normalise user input to a valid, ordered, 1-based page range.
            start_page = int(start_page)
            end_page = int(end_page)
            if start_page < 1:
                start_page = 1
            if end_page > total_pages:
                end_page = total_pages
            if start_page > end_page:
                start_page, end_page = end_page, start_page  # swap if reversed
            selected_docs = documents[start_page - 1:end_page]
        else:
            selected_docs = documents
            start_page = 1
            end_page = total_pages

        page_data = []
        sentence_data = []

        for page_num, doc in enumerate(selected_docs, start=start_page):
            text = doc.page_content.strip()

            # Join physical lines so sentences broken across lines survive.
            joined_text = ' '.join(line.strip() for line in text.split('\n') if line.strip())
            cleaned_text = clean_text(joined_text)

            page_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Text": cleaned_text
            })

            # Keep only reasonably long sentences, filtering out headers,
            # footers and stray fragments.
            for sentence in sent_tokenize(cleaned_text):
                sentence = sentence.strip()
                if sentence and len(sentence) > min_sentence_chars:
                    sentence_data.append({
                        "Document": doc_name,
                        "Page": page_num,
                        "Sentence": sentence
                    })

        return pd.DataFrame(page_data), pd.DataFrame(sentence_data)

    except Exception as e:
        # Chain the original exception so the root cause stays in the traceback.
        raise RuntimeError(f"Error during PDF extraction: {e}") from e
453
-
454
def df_to_csv_bytes(df):
    """
    Serialise a DataFrame to CSV and return the raw bytes.

    Args:
        df (pd.DataFrame): The DataFrame to convert.

    Returns:
        bytes: CSV data (no index column).

    Raises:
        RuntimeError: If the CSV serialisation fails.
    """
    try:
        with BytesIO() as buffer:
            df.to_csv(buffer, index=False)
            return buffer.getvalue()
    except Exception as e:
        raise RuntimeError(f"Error during CSV conversion: {e}")
470
-
471
def launch_interface():
    """
    Build and launch the Gradio UI.

    Layout: a shared PDF upload + page-range selector at the top, then two
    tabs (page-level / sentence-level analysis), each with nested
    Primary/Secondary SDG sub-tabs exposing a plot plus CSV/DOCX/JPEG
    downloads. All processing callbacks are defined inside this function so
    they close over the UI components.
    """
    with gr.Blocks(css=custom_css) as demo:

        # Title as a visible heading at the top of the page with an icon
        gr.Markdown(
            """
            # 🌍 SDG Document Analysis App - CPU
            Analyze documents to map Sustainable Development Goals (SDGs) at both page and sentence levels.
            """
        )

        # Shared PDF file input for both analyses
        with gr.Row():
            file_input = gr.File(
                label="📁 Upload PDF File for Analysis", file_types=[".pdf"]
            )

        # Extraction mode selection with explanatory text
        gr.Markdown(
            """
            ## PDF Text Extraction Mode
            Choose whether to analyze all pages or a specific range of pages. If you want to exclude certain pages from the analysis, select "Range of Pages" and specify the start and end pages.
            """
        )
        with gr.Row():
            extraction_mode = gr.Radio(
                choices=["All Pages", "Range of Pages"],
                value="All Pages",
                label="Extraction Mode"
            )

        # Page-range inputs start hidden; shown only in "Range of Pages" mode.
        with gr.Row():
            start_page = gr.Number(value=1, label="🔢 Start Page", visible=False)
            end_page = gr.Number(value=1, label="🔢 End Page", visible=False)

        # Function to update visibility of start_page and end_page
        def update_page_inputs(extraction_mode):
            if extraction_mode == "Range of Pages":
                return gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False)

        extraction_mode.change(
            update_page_inputs,
            inputs=extraction_mode,
            outputs=[start_page, end_page]
        )

        # Main Tabs for Page-Level and Sentence-Level Analysis
        gr.Markdown("## SDG Analysis Type")

        with gr.Tab("📄 Page-Level Analysis"):
            gr.Markdown(
                """
                ### Page-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                of documents using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **high-level SDG mapping** of documents at the page level.
                """
            )
            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    primary_page_plot = gr.Plot(label="📊 Primary SDGs [Page-Level]")

                    with gr.Row():
                        page_csv = gr.File(label="📊 Download Page Predictions CSV")
                        page_docx = gr.File(label="📄 Download Page Report DOCX")
                        page_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    secondary_page_plot = gr.Plot(label="📈 Secondary SDGs [Page-Level]")

                    with gr.Row():
                        page_csv_secondary = gr.File(label="📊 Download Page Predictions CSV")
                        page_report_file_secondary = gr.File(label="📄 Download Page Report DOCX")
                        secondary_page_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

            with gr.Row():
                page_button = gr.Button("🏃‍♂️ Run Page-Level Analysis")
                reset_page_button = gr.Button("🔄 Reset Page-Level Analysis", elem_classes="reset-button")

        with gr.Tab("✍️ Sentence-Level Analysis"):
            gr.Markdown(
                """
                ### Sentence-Level SDG Analysis
                This section conducts Sustainable Development Goals (SDG) mapping
                using the [sdgBERT model](https://huggingface.co/sadickam/sdgBERT).
                It provides **detailed SDG mapping** at the sentence level.
                """
            )
            # Nested Tabs for Primary and Secondary SDGs
            with gr.Tabs():
                with gr.TabItem("📊 Primary SDGs"):
                    primary_sentence_plot = gr.Plot(label="📊 Primary SDGs [Sentence-Level]")

                    with gr.Row():
                        sentence_csv = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_docx = gr.File(label="📄 Download Sentence Report DOCX")
                        sentence_jpeg1 = gr.File(label="🖼️ Download Primary SDGs JPEG")

                with gr.TabItem("📈 Secondary SDGs"):
                    secondary_sentence_plot = gr.Plot(label="📈 Secondary SDGs [Sentence-Level]")

                    with gr.Row():
                        sentence_csv_secondary = gr.File(label="📊 Download Sentence Predictions CSV")
                        sentence_report_file_secondary = gr.File(label="📄 Download Sentence Report DOCX")
                        secondary_sentence_jpeg = gr.File(label="🖼️ Download Secondary SDGs JPEG")

            with gr.Row():
                sentence_button = gr.Button("🏃‍♂️ Run Sentence-Level Analysis")
                reset_sentence_button = gr.Button("🔄 Reset Sentence-Level Analysis", elem_classes="reset-button")

        # Function to process page-level analysis.
        # Returns an 8-tuple matching the outputs wired in page_button.click:
        # (primary plot, secondary plot, CSV, DOCX, JPEG, CSV-2, DOCX-2, JPEG-2).
        def process_pages(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                        original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames (sentence frame unused here)
                page_df, _ = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at page level
                df_page_predictions = predict_pages(page_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_page_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_page_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names
                page_csv_file = f"{sanitized_file_name}_page_predictions_primary.csv"
                page_report_file = f"{sanitized_file_name}_page_report_primary.docx"
                primary_page_jpeg = f"{sanitized_file_name}_primary_page.jpeg"

                page_csv_file_secondary = f"{sanitized_file_name}_page_predictions_secondary.csv"
                page_report_file_secondary = f"{sanitized_file_name}_page_report_secondary.docx"
                secondary_page_jpeg = f"{sanitized_file_name}_secondary_page.jpeg"

                # Save CSV and reports (primary and secondary copies share the
                # same predictions frame; only the file names differ)
                df_page_predictions.to_csv(page_csv_file, index=False)
                page_report_primary = generate_page_report(df_page_predictions, page_report_file)

                df_page_predictions.to_csv(page_csv_file_secondary, index=False)
                page_report_secondary = generate_page_report(df_page_predictions, page_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_page_jpeg)
                save_figure_as_jpeg(second_plot, secondary_page_jpeg)

                return (
                    first_plot, second_plot,
                    page_csv_file, page_report_file, primary_page_jpeg,
                    page_csv_file_secondary, page_report_file_secondary, secondary_page_jpeg
                )

            except Exception as e:
                # Swallow and log: the UI simply shows empty outputs on failure.
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Function to process sentence-level analysis.
        # Mirrors process_pages but operates on the sentence-level DataFrame.
        def process_sentences(file, extraction_mode, start_page, end_page):
            if not file:
                # Return None for each output component
                return [None, None, None, None, None, None, None, None]

            try:
                if hasattr(file, 'name'):
                    pdf_file_path = file.name
                    original_file_name = os.path.basename(file.name)
                else:
                    # Save the file to a temporary location
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
                        temp_pdf.write(file.read())
                        pdf_file_path = temp_pdf.name
                        original_file_name = 'uploaded_document'

                # Sanitize the file name to use in output file names
                sanitized_file_name = os.path.splitext(original_file_name)[0]
                sanitized_file_name = re.sub(r'[^\w\-]', '_', sanitized_file_name)

                # Determine page range based on extraction_mode
                if extraction_mode == "All Pages":
                    selected_start = None
                    selected_end = None
                else:
                    selected_start = int(start_page)
                    selected_end = int(end_page)

                # Extract text and create DataFrames (page frame unused here)
                _, sentence_df = extract_text_with_py_pdf_loader(
                    pdf_file_path,
                    start_page=selected_start,
                    end_page=selected_end
                )

                # Predict SDGs at sentence level
                df_sentence_predictions = predict_sentences(sentence_df)

                # Generate plots with icon overlay
                first_plot = plot_sdg(
                    df_sentence_predictions, "📊 Primary SDGs", 'pred1'
                )
                second_plot = plot_sdg(
                    df_sentence_predictions, "📈 Secondary SDGs", 'pred2'
                )

                # Define output file names
                sentence_csv_file = f"{sanitized_file_name}_sentence_predictions_primary.csv"
                sentence_report_file = f"{sanitized_file_name}_sentence_report_primary.docx"
                primary_sentence_jpeg = f"{sanitized_file_name}_primary_sentence.jpeg"

                sentence_csv_file_secondary = f"{sanitized_file_name}_sentence_predictions_secondary.csv"
                sentence_report_file_secondary = f"{sanitized_file_name}_sentence_report_secondary.docx"
                secondary_sentence_jpeg = f"{sanitized_file_name}_secondary_sentence.jpeg"

                # Save CSV and reports
                df_sentence_predictions.to_csv(sentence_csv_file, index=False)
                sentence_report_primary = generate_sentence_report(df_sentence_predictions, sentence_report_file)

                df_sentence_predictions.to_csv(sentence_csv_file_secondary, index=False)
                sentence_report_secondary = generate_sentence_report(df_sentence_predictions, sentence_report_file_secondary)

                # Save figures as JPEG
                save_figure_as_jpeg(first_plot, primary_sentence_jpeg)
                save_figure_as_jpeg(second_plot, secondary_sentence_jpeg)

                return (
                    first_plot, second_plot,
                    sentence_csv_file, sentence_report_file, primary_sentence_jpeg,
                    sentence_csv_file_secondary, sentence_report_file_secondary, secondary_sentence_jpeg
                )

            except Exception as e:
                # Swallow and log: the UI simply shows empty outputs on failure.
                print(f"Error: {e}")
                return [None, None, None, None, None, None, None, None]

        # Reset functions to clear the outputs (one None per output component)
        def reset_page_outputs():
            return [None, None, None, None, None, None, None, None]

        def reset_sentence_outputs():
            return [None, None, None, None, None, None, None, None]

        # Button actions for Page-Level Analysis
        page_button.click(
            process_pages,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_page_plot,            # 📊 Primary SDGs [Page-Level]
                secondary_page_plot,          # 📈 Secondary SDGs [Page-Level]
                page_csv,                     # 📊 Download Page Predictions CSV
                page_docx,                    # 📄 Download Page Report DOCX
                page_jpeg1,                   # 🖼️ Download Primary SDGs JPEG
                page_csv_secondary,           # 📊 Download Page Predictions CSV (Secondary)
                page_report_file_secondary,   # 📄 Download Page Report DOCX (Secondary)
                secondary_page_jpeg           # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_page_button.click(
            reset_page_outputs,
            outputs=[
                primary_page_plot,
                secondary_page_plot,
                page_csv,
                page_docx,
                page_jpeg1,
                page_csv_secondary,
                page_report_file_secondary,
                secondary_page_jpeg
            ]
        )

        # Button actions for Sentence-Level Analysis
        sentence_button.click(
            process_sentences,
            inputs=[file_input, extraction_mode, start_page, end_page],
            outputs=[
                primary_sentence_plot,            # 📊 Primary SDGs [Sentence-Level]
                secondary_sentence_plot,          # 📈 Secondary SDGs [Sentence-Level]
                sentence_csv,                     # 📊 Download Sentence Predictions CSV
                sentence_docx,                    # 📄 Download Sentence Report DOCX
                sentence_jpeg1,                   # 🖼️ Download Primary SDGs JPEG
                sentence_csv_secondary,           # 📊 Download Sentence Predictions CSV (Secondary)
                sentence_report_file_secondary,   # 📄 Download Sentence Report DOCX (Secondary)
                secondary_sentence_jpeg           # 🖼️ Download Secondary SDGs JPEG
            ]
        )

        reset_sentence_button.click(
            reset_sentence_outputs,
            outputs=[
                primary_sentence_plot,
                secondary_sentence_plot,
                sentence_csv,
                sentence_docx,
                sentence_jpeg1,
                sentence_csv_secondary,
                sentence_report_file_secondary,
                secondary_sentence_jpeg
            ]
        )

    # Enable request queueing and start the server.
    demo.queue().launch()

# Module-level entry point: the app launches on import/run.
launch_interface()