Spaces:

rameshmoorthy
/

DAR_standardiser

Sleeping

App Files Files Community

rameshmoorthy committed on Jun 21, 2025

Commit

d93d2b8

verified ·

1 Parent(s): ba31f91

Update app.py

Browse files

Files changed (1) hide show

app.py +321 -154

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import gradio as gr
 import pandas as pd
 from io import BytesIO
 import os
-import json
 # These imports assume the other python files (dar_processor.py, etc.) are in the same directory.
 from dar_processor import preprocess_pdf_text
@@ -112,7 +111,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
         gr.Markdown("# DAR PDF Harmonisation Tool")
         gr.Markdown(
             "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
-            "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY` and `APP_CREDENTIALS_JSON`.*"
         )
         with gr.Row():
             with gr.Column(scale=1):
@@ -133,28 +132,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
     # --- Login Functionality ---
     def login(username, password):
         """
-        Checks user credentials against a dictionary.
-        For production, this dictionary is loaded from a Hugging Face secret.
         """
-        # Default credentials for local testing if secret is not found
-        default_creds = {
-            "admin": "iloveaudit1",
-            "planning_officer": "pco_password",
-            "audit_group1": "ag1_password"
-        }
-        # In production, load credentials from a HF secret as a JSON string.
-        # e.g., '{"admin": "iloveaudit1", "planning_officer": "pco_password"}'
-        auth_creds_json = os.environ.get("APP_CREDENTIALS_JSON")
-        try:
-            # Use credentials from secret if available and valid JSON
-            creds = json.loads(auth_creds_json) if auth_creds_json else default_creds
-        except json.JSONDecodeError:
-            # Fallback to default if secret contains invalid JSON
-            creds = default_creds
-        if username in creds and password == creds.get(username):
             # Login successful: hide login UI, show main app
             return {
                 login_ui: gr.update(visible=False),
@@ -181,7 +169,9 @@ if __name__ == "__main__":
 # import pandas as pd
 # from io import BytesIO
 # import os
 # from dar_processor import preprocess_pdf_text
 # from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
 # from models import ParsedDARReport, HarmonisedPara
@@ -190,193 +180,370 @@ if __name__ == "__main__":
 #     """Generates an HTML string to display the results in a styled table."""
 #     if not results_with_harmonised:
 #         return "<p>No audit paras found or processed.</p>"
-#     # Basic CSS for styling
 #     style = """
 #     <style>
 #         body { font-family: sans-serif; }
 #         .styled-table {
-#             border-collapse: collapse;
-#             margin: 25px 0;
-#             font-size: 0.9em;
-#             min-width: 400px;
-#             box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
-#             border-radius: 8px;
-#             overflow: hidden;
-#         }
-#         .styled-table thead tr {
-#             background-color: #009879;
-#             color: #ffffff;
-#             text-align: left;
-#         }
-#         .styled-table th, .styled-table td {
-#             padding: 12px 15px;
-#             border-bottom: 1px solid #dddddd;
-#         }
-#         .styled-table tbody tr:last-of-type {
-#             border-bottom: 2px solid #009879;
-#         }
-#         .styled-table tbody tr.active-row {
-#             font-weight: bold;
-#             color: #009879;
 #         }
 #     </style>
 #     """
-#     # Table header
-#     html = f"{style}<table class='styled-table'>"
-#     html += """
-#     <thead>
-#         <tr>
-#             <th>Para No.</th>
-#             <th>Original Audit Para Heading</th>
-#             <th>Harmonised Audit Para Heading</th>
-#             <th>Amount Involved (in Lakhs)</th>
-#         </tr>
-#     </thead>
-#     """
-#     # Table body
-#     html += "<tbody>"
 #     for item in results_with_harmonised:
 #         para_num = item.get('audit_para_number', 'N/A')
 #         original_heading = item.get('audit_para_heading', 'N/A')
 #         harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
 #         amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
-#         html += f"""
-#         <tr>
-#             <td>{para_num}</td>
-#             <td>{original_heading}</td>
-#             <td>{harmonised_heading}</td>
-#             <td>{amount}</td>
-#         </tr>
-#         """
 #     html += "</tbody></table>"
 #     return html
 # def process_dar_pdf(pdf_file):
-#     """
-#     The main function for the Gradio interface. It processes the PDF,
-#     extracts data, gets harmonised titles, and returns the results.
-#     """
-#     # Get API Key from environment secrets
 #     gemini_api_key = os.environ.get("GEMINI_API_KEY")
 #     if not pdf_file:
 #         return "Please upload a PDF file.", None, None
 #     if not gemini_api_key:
-#         return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
-#     # --- Step 1: Process PDF to text ---
-#     progress_update = "Processing PDF to text..."
-#     print(progress_update)
 #     full_text = preprocess_pdf_text(pdf_file.name)
 #     if full_text.startswith("Error"):
 #         return f"Failed to process PDF: {full_text}", None, None
-#     # --- Step 2: Extract structured data from text ---
-#     progress_update += "\nExtracting structured data from DAR text..."
-#     print(progress_update)
-#     parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
-#     if parsed_report.parsing_errors:
-#         error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
-#         print(error_msg)
 #         return error_msg, None, None
-#     if not parsed_report.audit_paras:
-#         return "Could not find any audit paras in the document.", None, None
-#     # --- Step 3: Get harmonised titles ---
-#     progress_update += "\nGenerating harmonised titles..."
-#     print(progress_update)
-#     original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
 #     if not original_headings:
-#         return "Found audit paras, but could not extract any headings to harmonise.", None, None
-#     # UPDATED: Pass the full_text to the harmonisation function for better context
-#     harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
 #     if not harmonised_results:
 #         return "Failed to generate harmonised titles.", None, None
-#     # --- Step 4: Combine data and prepare for output ---
-#     progress_update += "\nCombining data and preparing output..."
-#     print(progress_update)
-#     # Create a mapping from original heading to harmonised heading
 #     harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
-#     # Combine all data into a list of dictionaries for display and download
 #     final_data_list = []
 #     for para in parsed_report.audit_paras:
-#         para_dict = para.dict()
-#         header_dict = parsed_report.header.dict() if parsed_report.header else {}
-#         # Combine header and para info
-#         combined_info = {**header_dict, **para_dict}
-#         # Add the harmonised heading
-#         harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
-#         combined_info['harmonised_audit_para_heading'] = harmonised_heading
 #         final_data_list.append(combined_info)
-#     # --- Step 5: Generate HTML report and Excel file ---
 #     html_output = create_html_report(final_data_list)
 #     df = pd.DataFrame(final_data_list)
-#     # Reorder columns for clarity in the Excel file
 #     excel_columns = [
-#         'gstin', 'trade_name', 'category', 'audit_group_number',
-#         'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
-#         'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
-#         'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
 #     ]
 #     df = df.reindex(columns=excel_columns).fillna('N/A')
-#     # Save to an in-memory buffer
 #     output_excel = BytesIO()
-#     with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
-#         df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
 #     output_excel.seek(0)
 #     excel_file_name = "dar_extraction_report.xlsx"
 #     with open(excel_file_name, "wb") as f:
 #         f.write(output_excel.getbuffer())
-#     return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
 # # --- Gradio Interface Definition ---
 # with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
-#     gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
-#     gr.Markdown("## Audit 1 Commissionerate Mumbai")
-#     gr.Markdown(
-#         "Upload a Draft Audit Report (DAR) in PDF format. "
-#         "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
 #     )
-#     with gr.Row():
-#         with gr.Column(scale=1):
-#             pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
-#             submit_btn = gr.Button("Process Report", variant="primary")
-#         with gr.Column(scale=2):
-#             status_output = gr.Textbox(label="Processing Status", interactive=False)
-#             excel_output = gr.File(label="Download Excel Report")
-#     gr.Markdown("## Harmonised Audit Para Titles")
-#     html_output = gr.HTML()
-#     submit_btn.click(
-#         fn=process_dar_pdf,
-#         inputs=[pdf_input],
-#         outputs=[status_output, html_output, excel_output]
-#     )
-# if __name__ == "__main__":
-#     demo.launch(debug=True)

 import pandas as pd
 from io import BytesIO
 import os
 # These imports assume the other python files (dar_processor.py, etc.) are in the same directory.
 from dar_processor import preprocess_pdf_text
         gr.Markdown("# DAR PDF Harmonisation Tool")
         gr.Markdown(
             "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
+            "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY`, `APP_USERNAME`, and `APP_PASSWORD`.*"
         )
         with gr.Row():
             with gr.Column(scale=1):
     # --- Login Functionality ---
     def login(username, password):
         """
+        Checks user credentials against secrets.
+        For production, these are loaded from Hugging Face secrets.
         """
+        # Get credentials from Hugging Face secrets.
+        # No fallback is defined here: if a secret is not set, os.environ.get returns None and the equality check below will not match any entered text.
+        auth_username = os.environ.get("APP_USERNAME")
+        auth_password = os.environ.get("APP_PASSWORD")
+        is_valid_user = (username == auth_username and password == auth_password)
+        if is_valid_user:
             # Login successful: hide login UI, show main app
             return {
                 login_ui: gr.update(visible=False),
 # import pandas as pd
 # from io import BytesIO
 # import os
+# import json
+# # These imports assume the other python files (dar_processor.py, etc.) are in the same directory.
 # from dar_processor import preprocess_pdf_text
 # from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
 # from models import ParsedDARReport, HarmonisedPara
 #     """Generates an HTML string to display the results in a styled table."""
 #     if not results_with_harmonised:
 #         return "<p>No audit paras found or processed.</p>"
 #     style = """
 #     <style>
 #         body { font-family: sans-serif; }
 #         .styled-table {
+#             border-collapse: collapse; margin: 25px 0; font-size: 0.9em;
+#             min-width: 400px; box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
+#             border-radius: 8px; overflow: hidden;
 #         }
+#         .styled-table thead tr { background-color: #009879; color: #ffffff; text-align: left; }
+#         .styled-table th, .styled-table td { padding: 12px 15px; border-bottom: 1px solid #dddddd; }
+#         .styled-table tbody tr:last-of-type { border-bottom: 2px solid #009879; }
 #     </style>
 #     """
+#     html = f"{style}<table class='styled-table'><thead><tr><th>Para No.</th><th>Original Audit Para Heading</th><th>Harmonised Audit Para Heading</th><th>Amount Involved (in Lakhs)</th></tr></thead><tbody>"
 #     for item in results_with_harmonised:
 #         para_num = item.get('audit_para_number', 'N/A')
 #         original_heading = item.get('audit_para_heading', 'N/A')
 #         harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
 #         amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
+#         html += f"<tr><td>{para_num}</td><td>{original_heading}</td><td>{harmonised_heading}</td><td>{amount}</td></tr>"
 #     html += "</tbody></table>"
 #     return html
 # def process_dar_pdf(pdf_file):
+#     """The main processing function, called after successful login."""
 #     gemini_api_key = os.environ.get("GEMINI_API_KEY")
 #     if not pdf_file:
 #         return "Please upload a PDF file.", None, None
 #     if not gemini_api_key:
+#         return "Error: GEMINI_API_KEY secret not found in Space settings.", None, None
+#     # Step 1: Process PDF to text
 #     full_text = preprocess_pdf_text(pdf_file.name)
 #     if full_text.startswith("Error"):
 #         return f"Failed to process PDF: {full_text}", None, None
+#     # Step 2: Extract structured data
+#     parsed_report = get_structured_data_with_gemini(gemini_api_key, full_text)
+#     if parsed_report.parsing_errors or not parsed_report.audit_paras:
+#         error_msg = parsed_report.parsing_errors or "Could not find any audit paras."
 #         return error_msg, None, None
+#     # Step 3: Get harmonised titles
+#     original_headings = [p.audit_para_heading for p in parsed_report.audit_paras if p.audit_para_heading]
 #     if not original_headings:
+#         return "Found paras but no headings to harmonise.", None, None
+#     harmonised_results = get_harmonised_titles(gemini_api_key, full_text, original_headings)
 #     if not harmonised_results:
 #         return "Failed to generate harmonised titles.", None, None
+#     # Step 4: Combine and prepare outputs
 #     harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
 #     final_data_list = []
 #     for para in parsed_report.audit_paras:
+#         combined_info = (parsed_report.header.dict() if parsed_report.header else {}) | para.dict()
+#         combined_info['harmonised_audit_para_heading'] = harmonised_map.get(para.audit_para_heading, "N/A")
 #         final_data_list.append(combined_info)
 #     html_output = create_html_report(final_data_list)
+#     # Step 5: Create Excel file for download
 #     df = pd.DataFrame(final_data_list)
 #     excel_columns = [
+#         'gstin', 'trade_name', 'category', 'audit_group_number', 'audit_para_number',
+#         'audit_para_heading', 'harmonised_audit_para_heading', 'revenue_involved_lakhs_rs',
+#         'revenue_recovered_lakhs_rs', 'status_of_para', 'total_amount_detected_overall_rs',
+#         'total_amount_recovered_overall_rs'
 #     ]
 #     df = df.reindex(columns=excel_columns).fillna('N/A')
 #     output_excel = BytesIO()
+#     df.to_excel(output_excel, index=False, sheet_name='DAR_Extraction')
 #     output_excel.seek(0)
 #     excel_file_name = "dar_extraction_report.xlsx"
 #     with open(excel_file_name, "wb") as f:
 #         f.write(output_excel.getbuffer())
+#     return "Processing complete.", html_output, gr.File(value=excel_file_name)
 # # --- Gradio Interface Definition ---
 # with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
+#     # --- Login UI (visible initially) ---
+#     with gr.Column(visible=True) as login_ui:
+#         gr.Markdown("# DAR Harmonisation Tool Login")
+#         gr.Markdown("Please enter the credentials to access the tool.")
+#         with gr.Row():
+#             username_input = gr.Textbox(label="Username", placeholder="Enter your username")
+#             password_input = gr.Textbox(label="Password", type="password", placeholder="Enter your password")
+#         login_button = gr.Button("Login", variant="primary")
+#         login_error_msg = gr.Markdown(visible=False)
+#     # --- Main App UI (hidden initially) ---
+#     with gr.Column(visible=False) as main_app_ui:
+#         gr.Markdown("# DAR PDF Harmonisation Tool")
+#         gr.Markdown(
+#             "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
+#             "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY` and `APP_CREDENTIALS_JSON`.*"
+#         )
+#         with gr.Row():
+#             with gr.Column(scale=1):
+#                 pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
+#                 submit_btn = gr.Button("Process Report", variant="primary")
+#             with gr.Column(scale=2):
+#                 status_output = gr.Textbox(label="Processing Status", interactive=False)
+#                 excel_output = gr.File(label="Download Excel Report")
+#         gr.Markdown("## Harmonised Audit Para Titles")
+#         html_output = gr.HTML()
+#         submit_btn.click(
+#             fn=process_dar_pdf,
+#             inputs=[pdf_input],
+#             outputs=[status_output, html_output, excel_output]
+#         )
+#     # --- Login Functionality ---
+#     def login(username, password):
+#         """
+#         Checks user credentials against a dictionary.
+#         For production, this dictionary is loaded from a Hugging Face secret.
+#         """
+#         # Default credentials for local testing if secret is not found
+#         default_creds = {
+#             "admin": "iloveaudit1",
+#             "planning_officer": "pco_password",
+#             "audit_group1": "ag1_password"
+#         }
+#         # In production, load credentials from a HF secret as a JSON string.
+#         # e.g., '{"admin": "iloveaudit1", "planning_officer": "pco_password"}'
+#         auth_creds_json = os.environ.get("APP_CREDENTIALS_JSON")
+#         try:
+#             # Use credentials from secret if available and valid JSON
+#             creds = json.loads(auth_creds_json) if auth_creds_json else default_creds
+#         except json.JSONDecodeError:
+#             # Fallback to default if secret contains invalid JSON
+#             creds = default_creds
+#         if username in creds and password == creds.get(username):
+#             # Login successful: hide login UI, show main app
+#             return {
+#                 login_ui: gr.update(visible=False),
+#                 main_app_ui: gr.update(visible=True),
+#                 login_error_msg: gr.update(visible=False)
+#             }
+#         else:
+#             # Login failed: keep login UI visible, show error message
+#             return {
+#                 login_ui: gr.update(visible=True),
+#                 main_app_ui: gr.update(visible=False),
+#                 login_error_msg: gr.update(value="<p style='color:red;'>Invalid username or password.</p>", visible=True)
+#             }
+#     login_button.click(
+#         login,
+#         inputs=[username_input, password_input],
+#         outputs=[login_ui, main_app_ui, login_error_msg]
 #     )
+# if __name__ == "__main__":
+#     demo.launch(debug=True)
+# # import gradio as gr
+# # import pandas as pd
+# # from io import BytesIO
+# # import os
+# # from dar_processor import preprocess_pdf_text
+# # from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
+# # from models import ParsedDARReport, HarmonisedPara
+# # def create_html_report(results_with_harmonised: list[dict]) -> str:
+# #     """Generates an HTML string to display the results in a styled table."""
+# #     if not results_with_harmonised:
+# #         return "<p>No audit paras found or processed.</p>"
+# #     # Basic CSS for styling
+# #     style = """
+# #     <style>
+# #         body { font-family: sans-serif; }
+# #         .styled-table {
+# #             border-collapse: collapse;
+# #             margin: 25px 0;
+# #             font-size: 0.9em;
+# #             min-width: 400px;
+# #             box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
+# #             border-radius: 8px;
+# #             overflow: hidden;
+# #         }
+# #         .styled-table thead tr {
+# #             background-color: #009879;
+# #             color: #ffffff;
+# #             text-align: left;
+# #         }
+# #         .styled-table th, .styled-table td {
+# #             padding: 12px 15px;
+# #             border-bottom: 1px solid #dddddd;
+# #         }
+# #         .styled-table tbody tr:last-of-type {
+# #             border-bottom: 2px solid #009879;
+# #         }
+# #         .styled-table tbody tr.active-row {
+# #             font-weight: bold;
+# #             color: #009879;
+# #         }
+# #     </style>
+# #     """
+# #     # Table header
+# #     html = f"{style}<table class='styled-table'>"
+# #     html += """
+# #     <thead>
+# #         <tr>
+# #             <th>Para No.</th>
+# #             <th>Original Audit Para Heading</th>
+# #             <th>Harmonised Audit Para Heading</th>
+# #             <th>Amount Involved (in Lakhs)</th>
+# #         </tr>
+# #     </thead>
+# #     """
+# #     # Table body
+# #     html += "<tbody>"
+# #     for item in results_with_harmonised:
+# #         para_num = item.get('audit_para_number', 'N/A')
+# #         original_heading = item.get('audit_para_heading', 'N/A')
+# #         harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
+# #         amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
+# #         html += f"""
+# #         <tr>
+# #             <td>{para_num}</td>
+# #             <td>{original_heading}</td>
+# #             <td>{harmonised_heading}</td>
+# #             <td>{amount}</td>
+# #         </tr>
+# #         """
+# #     html += "</tbody></table>"
+# #     return html
+# # def process_dar_pdf(pdf_file):
+# #     """
+# #     The main function for the Gradio interface. It processes the PDF,
+# #     extracts data, gets harmonised titles, and returns the results.
+# #     """
+# #     # Get API Key from environment secrets
+# #     gemini_api_key = os.environ.get("GEMINI_API_KEY")
+# #     if not pdf_file:
+# #         return "Please upload a PDF file.", None, None
+# #     if not gemini_api_key:
+# #         return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
+# #     # --- Step 1: Process PDF to text ---
+# #     progress_update = "Processing PDF to text..."
+# #     print(progress_update)
+# #     full_text = preprocess_pdf_text(pdf_file.name)
+# #     if full_text.startswith("Error"):
+# #         return f"Failed to process PDF: {full_text}", None, None
+# #     # --- Step 2: Extract structured data from text ---
+# #     progress_update += "\nExtracting structured data from DAR text..."
+# #     print(progress_update)
+# #     parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
+# #     if parsed_report.parsing_errors:
+# #         error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
+# #         print(error_msg)
+# #         return error_msg, None, None
+# #     if not parsed_report.audit_paras:
+# #         return "Could not find any audit paras in the document.", None, None
+# #     # --- Step 3: Get harmonised titles ---
+# #     progress_update += "\nGenerating harmonised titles..."
+# #     print(progress_update)
+# #     original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
+# #     if not original_headings:
+# #         return "Found audit paras, but could not extract any headings to harmonise.", None, None
+# #     # UPDATED: Pass the full_text to the harmonisation function for better context
+# #     harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
+# #     if not harmonised_results:
+# #         return "Failed to generate harmonised titles.", None, None
+# #     # --- Step 4: Combine data and prepare for output ---
+# #     progress_update += "\nCombining data and preparing output..."
+# #     print(progress_update)
+# #     # Create a mapping from original heading to harmonised heading
+# #     harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
+# #     # Combine all data into a list of dictionaries for display and download
+# #     final_data_list = []
+# #     for para in parsed_report.audit_paras:
+# #         para_dict = para.dict()
+# #         header_dict = parsed_report.header.dict() if parsed_report.header else {}
+# #         # Combine header and para info
+# #         combined_info = {**header_dict, **para_dict}
+# #         # Add the harmonised heading
+# #         harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
+# #         combined_info['harmonised_audit_para_heading'] = harmonised_heading
+# #         final_data_list.append(combined_info)
+# #     # --- Step 5: Generate HTML report and Excel file ---
+# #     html_output = create_html_report(final_data_list)
+# #     df = pd.DataFrame(final_data_list)
+# #     # Reorder columns for clarity in the Excel file
+# #     excel_columns = [
+# #         'gstin', 'trade_name', 'category', 'audit_group_number',
+# #         'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
+# #         'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
+# #         'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
+# #     ]
+# #     df = df.reindex(columns=excel_columns).fillna('N/A')
+# #     # Save to an in-memory buffer
+# #     output_excel = BytesIO()
+# #     with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
+# #         df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
+# #     output_excel.seek(0)
+# #     excel_file_name = "dar_extraction_report.xlsx"
+# #     with open(excel_file_name, "wb") as f:
+# #         f.write(output_excel.getbuffer())
+# #     return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
+# # # --- Gradio Interface Definition ---
+# # with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
+# #     gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
+# #     gr.Markdown("## Audit 1 Commissionerate Mumbai")
+# #     gr.Markdown(
+# #         "Upload a Draft Audit Report (DAR) in PDF format. "
+# #         "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
+# #     )
+# #     with gr.Row():
+# #         with gr.Column(scale=1):
+# #             pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
+# #             submit_btn = gr.Button("Process Report", variant="primary")
+# #         with gr.Column(scale=2):
+# #             status_output = gr.Textbox(label="Processing Status", interactive=False)
+# #             excel_output = gr.File(label="Download Excel Report")
+# #     gr.Markdown("## Harmonised Audit Para Titles")
+# #     html_output = gr.HTML()
+# #     submit_btn.click(
+# #         fn=process_dar_pdf,
+# #         inputs=[pdf_input],
+# #         outputs=[status_output, html_output, excel_output]
+# #     )
+# # if __name__ == "__main__":
+# #     demo.launch(debug=True)