Spaces:

rameshmoorthy
/

DAR_standardiser

Sleeping

App Files Community

rameshmoorthy committed on Jun 21, 2025

Commit

ba31f91

verified ·

1 Parent(s): d92fc43

Update app.py

Browse files

Files changed (1) hide show

app.py +312 -133

app.py CHANGED Viewed

@@ -2,7 +2,9 @@ import gradio as gr
 import pandas as pd
 from io import BytesIO
 import os
 from dar_processor import preprocess_pdf_text
 from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
 from models import ParsedDARReport, HarmonisedPara
@@ -11,193 +13,370 @@ def create_html_report(results_with_harmonised: list[dict]) -> str:
     """Generates an HTML string to display the results in a styled table."""
     if not results_with_harmonised:
         return "<p>No audit paras found or processed.</p>"
-    # Basic CSS for styling
     style = """
     <style>
         body { font-family: sans-serif; }
         .styled-table {
-            border-collapse: collapse;
-            margin: 25px 0;
-            font-size: 0.9em;
-            min-width: 400px;
-            box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
-            border-radius: 8px;
-            overflow: hidden;
-        }
-        .styled-table thead tr {
-            background-color: #009879;
-            color: #ffffff;
-            text-align: left;
-        }
-        .styled-table th, .styled-table td {
-            padding: 12px 15px;
-            border-bottom: 1px solid #dddddd;
-        }
-        .styled-table tbody tr:last-of-type {
-            border-bottom: 2px solid #009879;
-        }
-        .styled-table tbody tr.active-row {
-            font-weight: bold;
-            color: #009879;
         }
     </style>
     """
-    # Table header
-    html = f"{style}<table class='styled-table'>"
-    html += """
-    <thead>
-        <tr>
-            <th>Para No.</th>
-            <th>Original Audit Para Heading</th>
-            <th>Harmonised Audit Para Heading</th>
-            <th>Amount Involved (in Lakhs)</th>
-        </tr>
-    </thead>
-    """
-    # Table body
-    html += "<tbody>"
     for item in results_with_harmonised:
         para_num = item.get('audit_para_number', 'N/A')
         original_heading = item.get('audit_para_heading', 'N/A')
         harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
         amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
-        html += f"""
-        <tr>
-            <td>{para_num}</td>
-            <td>{original_heading}</td>
-            <td>{harmonised_heading}</td>
-            <td>{amount}</td>
-        </tr>
-        """
     html += "</tbody></table>"
     return html
 def process_dar_pdf(pdf_file):
-    """
-    The main function for the Gradio interface. It processes the PDF,
-    extracts data, gets harmonised titles, and returns the results.
-    """
-    # Get API Key from environment secrets
     gemini_api_key = os.environ.get("GEMINI_API_KEY")
     if not pdf_file:
         return "Please upload a PDF file.", None, None
     if not gemini_api_key:
-        return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
-    # --- Step 1: Process PDF to text ---
-    progress_update = "Processing PDF to text..."
-    print(progress_update)
     full_text = preprocess_pdf_text(pdf_file.name)
     if full_text.startswith("Error"):
         return f"Failed to process PDF: {full_text}", None, None
-    # --- Step 2: Extract structured data from text ---
-    progress_update += "\nExtracting structured data from DAR text..."
-    print(progress_update)
-    parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
-    if parsed_report.parsing_errors:
-        error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
-        print(error_msg)
         return error_msg, None, None
-    if not parsed_report.audit_paras:
-        return "Could not find any audit paras in the document.", None, None
-    # --- Step 3: Get harmonised titles ---
-    progress_update += "\nGenerating harmonised titles..."
-    print(progress_update)
-    original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
     if not original_headings:
-        return "Found audit paras, but could not extract any headings to harmonise.", None, None
-    # UPDATED: Pass the full_text to the harmonisation function for better context
-    harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
     if not harmonised_results:
         return "Failed to generate harmonised titles.", None, None
-    # --- Step 4: Combine data and prepare for output ---
-    progress_update += "\nCombining data and preparing output..."
-    print(progress_update)
-    # Create a mapping from original heading to harmonised heading
     harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
-    # Combine all data into a list of dictionaries for display and download
     final_data_list = []
     for para in parsed_report.audit_paras:
-        para_dict = para.dict()
-        header_dict = parsed_report.header.dict() if parsed_report.header else {}
-        # Combine header and para info
-        combined_info = {**header_dict, **para_dict}
-        # Add the harmonised heading
-        harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
-        combined_info['harmonised_audit_para_heading'] = harmonised_heading
         final_data_list.append(combined_info)
-    # --- Step 5: Generate HTML report and Excel file ---
     html_output = create_html_report(final_data_list)
     df = pd.DataFrame(final_data_list)
-    # Reorder columns for clarity in the Excel file
     excel_columns = [
-        'gstin', 'trade_name', 'category', 'audit_group_number',
-        'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
-        'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
-        'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
     ]
     df = df.reindex(columns=excel_columns).fillna('N/A')
-    # Save to an in-memory buffer
     output_excel = BytesIO()
-    with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
-        df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
     output_excel.seek(0)
     excel_file_name = "dar_extraction_report.xlsx"
     with open(excel_file_name, "wb") as f:
         f.write(output_excel.getbuffer())
-    return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
 # --- Gradio Interface Definition ---
 with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
-    gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
-    gr.Markdown("## Audit 1 Commissionerate Mumbai")
-    gr.Markdown(
-        "Upload a Draft Audit Report (DAR) in PDF format. "
-        "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
-    )
-    with gr.Row():
-        with gr.Column(scale=1):
-            pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
-            submit_btn = gr.Button("Process Report", variant="primary")
-        with gr.Column(scale=2):
-            status_output = gr.Textbox(label="Processing Status", interactive=False)
-            excel_output = gr.File(label="Download Excel Report")
-    gr.Markdown("## Harmonised Audit Para Titles")
-    html_output = gr.HTML()
-    submit_btn.click(
-        fn=process_dar_pdf,
-        inputs=[pdf_input],
-        outputs=[status_output, html_output, excel_output]
     )
 if __name__ == "__main__":
-    demo.launch(debug=True)

 import pandas as pd
 from io import BytesIO
 import os
+import json
+# These imports assume the sibling modules (dar_processor.py, gemini_utils.py, models.py) are in the same directory as app.py.
 from dar_processor import preprocess_pdf_text
 from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
 from models import ParsedDARReport, HarmonisedPara
     """Generates an HTML string to display the results in a styled table."""
     if not results_with_harmonised:
         return "<p>No audit paras found or processed.</p>"
     style = """
     <style>
         body { font-family: sans-serif; }
         .styled-table {
+            border-collapse: collapse; margin: 25px 0; font-size: 0.9em;
+            min-width: 400px; box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
+            border-radius: 8px; overflow: hidden;
         }
+        .styled-table thead tr { background-color: #009879; color: #ffffff; text-align: left; }
+        .styled-table th, .styled-table td { padding: 12px 15px; border-bottom: 1px solid #dddddd; }
+        .styled-table tbody tr:last-of-type { border-bottom: 2px solid #009879; }
     </style>
     """
+    html = f"{style}<table class='styled-table'><thead><tr><th>Para No.</th><th>Original Audit Para Heading</th><th>Harmonised Audit Para Heading</th><th>Amount Involved (in Lakhs)</th></tr></thead><tbody>"
     for item in results_with_harmonised:
         para_num = item.get('audit_para_number', 'N/A')
         original_heading = item.get('audit_para_heading', 'N/A')
         harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
         amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
+        html += f"<tr><td>{para_num}</td><td>{original_heading}</td><td>{harmonised_heading}</td><td>{amount}</td></tr>"
     html += "</tbody></table>"
     return html
 def process_dar_pdf(pdf_file):
+    """The main processing function, called after successful login."""
     gemini_api_key = os.environ.get("GEMINI_API_KEY")
     if not pdf_file:
         return "Please upload a PDF file.", None, None
     if not gemini_api_key:
+        return "Error: GEMINI_API_KEY secret not found in Space settings.", None, None
+    # Step 1: Process PDF to text
     full_text = preprocess_pdf_text(pdf_file.name)
     if full_text.startswith("Error"):
         return f"Failed to process PDF: {full_text}", None, None
+    # Step 2: Extract structured data
+    parsed_report = get_structured_data_with_gemini(gemini_api_key, full_text)
+    if parsed_report.parsing_errors or not parsed_report.audit_paras:
+        error_msg = parsed_report.parsing_errors or "Could not find any audit paras."
         return error_msg, None, None
+    # Step 3: Get harmonised titles
+    original_headings = [p.audit_para_heading for p in parsed_report.audit_paras if p.audit_para_heading]
     if not original_headings:
+        return "Found paras but no headings to harmonise.", None, None
+    harmonised_results = get_harmonised_titles(gemini_api_key, full_text, original_headings)
     if not harmonised_results:
         return "Failed to generate harmonised titles.", None, None
+    # Step 4: Combine and prepare outputs
     harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
     final_data_list = []
     for para in parsed_report.audit_paras:
+        combined_info = (parsed_report.header.dict() if parsed_report.header else {}) | para.dict()
+        combined_info['harmonised_audit_para_heading'] = harmonised_map.get(para.audit_para_heading, "N/A")
         final_data_list.append(combined_info)
     html_output = create_html_report(final_data_list)
+    # Step 5: Create Excel file for download
     df = pd.DataFrame(final_data_list)
     excel_columns = [
+        'gstin', 'trade_name', 'category', 'audit_group_number', 'audit_para_number',
+        'audit_para_heading', 'harmonised_audit_para_heading', 'revenue_involved_lakhs_rs',
+        'revenue_recovered_lakhs_rs', 'status_of_para', 'total_amount_detected_overall_rs',
+        'total_amount_recovered_overall_rs'
     ]
     df = df.reindex(columns=excel_columns).fillna('N/A')
     output_excel = BytesIO()
+    df.to_excel(output_excel, index=False, sheet_name='DAR_Extraction')
     output_excel.seek(0)
     excel_file_name = "dar_extraction_report.xlsx"
     with open(excel_file_name, "wb") as f:
         f.write(output_excel.getbuffer())
+    return "Processing complete.", html_output, gr.File(value=excel_file_name)
 # --- Gradio Interface Definition ---
 with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
+    # --- Login UI (visible initially) ---
+    with gr.Column(visible=True) as login_ui:
+        gr.Markdown("# DAR Harmonisation Tool Login")
+        gr.Markdown("Please enter the credentials to access the tool.")
+        with gr.Row():
+            username_input = gr.Textbox(label="Username", placeholder="Enter your username")
+            password_input = gr.Textbox(label="Password", type="password", placeholder="Enter your password")
+        login_button = gr.Button("Login", variant="primary")
+        login_error_msg = gr.Markdown(visible=False)
+    # --- Main App UI (hidden initially) ---
+    with gr.Column(visible=False) as main_app_ui:
+        gr.Markdown("# DAR PDF Harmonisation Tool")
+        gr.Markdown(
+            "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
+            "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY` and `APP_CREDENTIALS_JSON`.*"
+        )
+        with gr.Row():
+            with gr.Column(scale=1):
+                pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
+                submit_btn = gr.Button("Process Report", variant="primary")
+            with gr.Column(scale=2):
+                status_output = gr.Textbox(label="Processing Status", interactive=False)
+                excel_output = gr.File(label="Download Excel Report")
+        gr.Markdown("## Harmonised Audit Para Titles")
+        html_output = gr.HTML()
+        submit_btn.click(
+            fn=process_dar_pdf,
+            inputs=[pdf_input],
+            outputs=[status_output, html_output, excel_output]
+        )
+    # --- Login Functionality ---
+    def login(username, password):
+        """
+        Checks user credentials against a dictionary.
+        For production, this dictionary is loaded from a Hugging Face secret.
+        """
+        # Default credentials for local testing if secret is not found
+        default_creds = {
+            "admin": "iloveaudit1",
+            "planning_officer": "pco_password",
+            "audit_group1": "ag1_password"
+        }
+        # In production, load credentials from a HF secret as a JSON string.
+        # e.g., '{"admin": "iloveaudit1", "planning_officer": "pco_password"}'
+        auth_creds_json = os.environ.get("APP_CREDENTIALS_JSON")
+        try:
+            # Use credentials from secret if available and valid JSON
+            creds = json.loads(auth_creds_json) if auth_creds_json else default_creds
+        except json.JSONDecodeError:
+            # Fallback to default if secret contains invalid JSON
+            creds = default_creds
+        if username in creds and password == creds.get(username):
+            # Login successful: hide login UI, show main app
+            return {
+                login_ui: gr.update(visible=False),
+                main_app_ui: gr.update(visible=True),
+                login_error_msg: gr.update(visible=False)
+            }
+        else:
+            # Login failed: keep login UI visible, show error message
+            return {
+                login_ui: gr.update(visible=True),
+                main_app_ui: gr.update(visible=False),
+                login_error_msg: gr.update(value="<p style='color:red;'>Invalid username or password.</p>", visible=True)
+            }
+    login_button.click(
+        login,
+        inputs=[username_input, password_input],
+        outputs=[login_ui, main_app_ui, login_error_msg]
     )
 if __name__ == "__main__":
+    demo.launch(debug=True)
+# import gradio as gr
+# import pandas as pd
+# from io import BytesIO
+# import os
+# from dar_processor import preprocess_pdf_text
+# from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
+# from models import ParsedDARReport, HarmonisedPara
+# def create_html_report(results_with_harmonised: list[dict]) -> str:
+#     """Generates an HTML string to display the results in a styled table."""
+#     if not results_with_harmonised:
+#         return "<p>No audit paras found or processed.</p>"
+#     # Basic CSS for styling
+#     style = """
+#     <style>
+#         body { font-family: sans-serif; }
+#         .styled-table {
+#             border-collapse: collapse;
+#             margin: 25px 0;
+#             font-size: 0.9em;
+#             min-width: 400px;
+#             box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
+#             border-radius: 8px;
+#             overflow: hidden;
+#         }
+#         .styled-table thead tr {
+#             background-color: #009879;
+#             color: #ffffff;
+#             text-align: left;
+#         }
+#         .styled-table th, .styled-table td {
+#             padding: 12px 15px;
+#             border-bottom: 1px solid #dddddd;
+#         }
+#         .styled-table tbody tr:last-of-type {
+#             border-bottom: 2px solid #009879;
+#         }
+#         .styled-table tbody tr.active-row {
+#             font-weight: bold;
+#             color: #009879;
+#         }
+#     </style>
+#     """
+#     # Table header
+#     html = f"{style}<table class='styled-table'>"
+#     html += """
+#     <thead>
+#         <tr>
+#             <th>Para No.</th>
+#             <th>Original Audit Para Heading</th>
+#             <th>Harmonised Audit Para Heading</th>
+#             <th>Amount Involved (in Lakhs)</th>
+#         </tr>
+#     </thead>
+#     """
+#     # Table body
+#     html += "<tbody>"
+#     for item in results_with_harmonised:
+#         para_num = item.get('audit_para_number', 'N/A')
+#         original_heading = item.get('audit_para_heading', 'N/A')
+#         harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
+#         amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
+#         html += f"""
+#         <tr>
+#             <td>{para_num}</td>
+#             <td>{original_heading}</td>
+#             <td>{harmonised_heading}</td>
+#             <td>{amount}</td>
+#         </tr>
+#         """
+#     html += "</tbody></table>"
+#     return html
+# def process_dar_pdf(pdf_file):
+#     """
+#     The main function for the Gradio interface. It processes the PDF,
+#     extracts data, gets harmonised titles, and returns the results.
+#     """
+#     # Get API Key from environment secrets
+#     gemini_api_key = os.environ.get("GEMINI_API_KEY")
+#     if not pdf_file:
+#         return "Please upload a PDF file.", None, None
+#     if not gemini_api_key:
+#         return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
+#     # --- Step 1: Process PDF to text ---
+#     progress_update = "Processing PDF to text..."
+#     print(progress_update)
+#     full_text = preprocess_pdf_text(pdf_file.name)
+#     if full_text.startswith("Error"):
+#         return f"Failed to process PDF: {full_text}", None, None
+#     # --- Step 2: Extract structured data from text ---
+#     progress_update += "\nExtracting structured data from DAR text..."
+#     print(progress_update)
+#     parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
+#     if parsed_report.parsing_errors:
+#         error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
+#         print(error_msg)
+#         return error_msg, None, None
+#     if not parsed_report.audit_paras:
+#         return "Could not find any audit paras in the document.", None, None
+#     # --- Step 3: Get harmonised titles ---
+#     progress_update += "\nGenerating harmonised titles..."
+#     print(progress_update)
+#     original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
+#     if not original_headings:
+#         return "Found audit paras, but could not extract any headings to harmonise.", None, None
+#     # UPDATED: Pass the full_text to the harmonisation function for better context
+#     harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
+#     if not harmonised_results:
+#         return "Failed to generate harmonised titles.", None, None
+#     # --- Step 4: Combine data and prepare for output ---
+#     progress_update += "\nCombining data and preparing output..."
+#     print(progress_update)
+#     # Create a mapping from original heading to harmonised heading
+#     harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
+#     # Combine all data into a list of dictionaries for display and download
+#     final_data_list = []
+#     for para in parsed_report.audit_paras:
+#         para_dict = para.dict()
+#         header_dict = parsed_report.header.dict() if parsed_report.header else {}
+#         # Combine header and para info
+#         combined_info = {**header_dict, **para_dict}
+#         # Add the harmonised heading
+#         harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
+#         combined_info['harmonised_audit_para_heading'] = harmonised_heading
+#         final_data_list.append(combined_info)
+#     # --- Step 5: Generate HTML report and Excel file ---
+#     html_output = create_html_report(final_data_list)
+#     df = pd.DataFrame(final_data_list)
+#     # Reorder columns for clarity in the Excel file
+#     excel_columns = [
+#         'gstin', 'trade_name', 'category', 'audit_group_number',
+#         'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
+#         'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
+#         'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
+#     ]
+#     df = df.reindex(columns=excel_columns).fillna('N/A')
+#     # Save to an in-memory buffer
+#     output_excel = BytesIO()
+#     with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
+#         df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
+#     output_excel.seek(0)
+#     excel_file_name = "dar_extraction_report.xlsx"
+#     with open(excel_file_name, "wb") as f:
+#         f.write(output_excel.getbuffer())
+#     return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
+# # --- Gradio Interface Definition ---
+# with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
+#     gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
+#     gr.Markdown("## Audit 1 Commissionerate Mumbai")
+#     gr.Markdown(
+#         "Upload a Draft Audit Report (DAR) in PDF format. "
+#         "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
+#     )
+#     with gr.Row():
+#         with gr.Column(scale=1):
+#             pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
+#             submit_btn = gr.Button("Process Report", variant="primary")
+#         with gr.Column(scale=2):
+#             status_output = gr.Textbox(label="Processing Status", interactive=False)
+#             excel_output = gr.File(label="Download Excel Report")
+#     gr.Markdown("## Harmonised Audit Para Titles")
+#     html_output = gr.HTML()
+#     submit_btn.click(
+#         fn=process_dar_pdf,
+#         inputs=[pdf_input],
+#         outputs=[status_output, html_output, excel_output]
+#     )
+# if __name__ == "__main__":
+#     demo.launch(debug=True)