rameshmoorthy committed on
Commit
d93d2b8
·
verified ·
1 Parent(s): ba31f91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +321 -154
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import pandas as pd
3
  from io import BytesIO
4
  import os
5
- import json
6
 
7
  # These imports assume the other python files (dar_processor.py, etc.) are in the same directory.
8
  from dar_processor import preprocess_pdf_text
@@ -112,7 +111,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
112
  gr.Markdown("# DAR PDF Harmonisation Tool")
113
  gr.Markdown(
114
  "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
115
- "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY` and `APP_CREDENTIALS_JSON`.*"
116
  )
117
  with gr.Row():
118
  with gr.Column(scale=1):
@@ -133,28 +132,17 @@ with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
133
  # --- Login Functionality ---
134
  def login(username, password):
135
  """
136
- Checks user credentials against a dictionary.
137
- For production, this dictionary is loaded from a Hugging Face secret.
138
  """
139
- # Default credentials for local testing if secret is not found
140
- default_creds = {
141
- "admin": "iloveaudit1",
142
- "planning_officer": "pco_password",
143
- "audit_group1": "ag1_password"
144
- }
145
 
146
- # In production, load credentials from a HF secret as a JSON string.
147
- # e.g., '{"admin": "iloveaudit1", "planning_officer": "pco_password"}'
148
- auth_creds_json = os.environ.get("APP_CREDENTIALS_JSON")
149
 
150
- try:
151
- # Use credentials from secret if available and valid JSON
152
- creds = json.loads(auth_creds_json) if auth_creds_json else default_creds
153
- except json.JSONDecodeError:
154
- # Fallback to default if secret contains invalid JSON
155
- creds = default_creds
156
-
157
- if username in creds and password == creds.get(username):
158
  # Login successful: hide login UI, show main app
159
  return {
160
  login_ui: gr.update(visible=False),
@@ -181,7 +169,9 @@ if __name__ == "__main__":
181
  # import pandas as pd
182
  # from io import BytesIO
183
  # import os
 
184
 
 
185
  # from dar_processor import preprocess_pdf_text
186
  # from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
187
  # from models import ParsedDARReport, HarmonisedPara
@@ -190,193 +180,370 @@ if __name__ == "__main__":
190
  # """Generates an HTML string to display the results in a styled table."""
191
  # if not results_with_harmonised:
192
  # return "<p>No audit paras found or processed.</p>"
193
-
194
- # # Basic CSS for styling
195
  # style = """
196
  # <style>
197
  # body { font-family: sans-serif; }
198
  # .styled-table {
199
- # border-collapse: collapse;
200
- # margin: 25px 0;
201
- # font-size: 0.9em;
202
- # min-width: 400px;
203
- # box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
204
- # border-radius: 8px;
205
- # overflow: hidden;
206
- # }
207
- # .styled-table thead tr {
208
- # background-color: #009879;
209
- # color: #ffffff;
210
- # text-align: left;
211
- # }
212
- # .styled-table th, .styled-table td {
213
- # padding: 12px 15px;
214
- # border-bottom: 1px solid #dddddd;
215
- # }
216
- # .styled-table tbody tr:last-of-type {
217
- # border-bottom: 2px solid #009879;
218
- # }
219
- # .styled-table tbody tr.active-row {
220
- # font-weight: bold;
221
- # color: #009879;
222
  # }
 
 
 
223
  # </style>
224
  # """
225
-
226
- # # Table header
227
- # html = f"{style}<table class='styled-table'>"
228
- # html += """
229
- # <thead>
230
- # <tr>
231
- # <th>Para No.</th>
232
- # <th>Original Audit Para Heading</th>
233
- # <th>Harmonised Audit Para Heading</th>
234
- # <th>Amount Involved (in Lakhs)</th>
235
- # </tr>
236
- # </thead>
237
- # """
238
-
239
- # # Table body
240
- # html += "<tbody>"
241
  # for item in results_with_harmonised:
242
  # para_num = item.get('audit_para_number', 'N/A')
243
  # original_heading = item.get('audit_para_heading', 'N/A')
244
  # harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
245
  # amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
246
-
247
- # html += f"""
248
- # <tr>
249
- # <td>{para_num}</td>
250
- # <td>{original_heading}</td>
251
- # <td>{harmonised_heading}</td>
252
- # <td>{amount}</td>
253
- # </tr>
254
- # """
255
  # html += "</tbody></table>"
256
  # return html
257
 
258
-
259
  # def process_dar_pdf(pdf_file):
260
- # """
261
- # The main function for the Gradio interface. It processes the PDF,
262
- # extracts data, gets harmonised titles, and returns the results.
263
- # """
264
- # # Get API Key from environment secrets
265
  # gemini_api_key = os.environ.get("GEMINI_API_KEY")
266
-
267
  # if not pdf_file:
268
  # return "Please upload a PDF file.", None, None
269
-
270
  # if not gemini_api_key:
271
- # return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
272
 
273
- # # --- Step 1: Process PDF to text ---
274
- # progress_update = "Processing PDF to text..."
275
- # print(progress_update)
276
  # full_text = preprocess_pdf_text(pdf_file.name)
277
  # if full_text.startswith("Error"):
278
  # return f"Failed to process PDF: {full_text}", None, None
279
 
280
- # # --- Step 2: Extract structured data from text ---
281
- # progress_update += "\nExtracting structured data from DAR text..."
282
- # print(progress_update)
283
- # parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
284
-
285
- # if parsed_report.parsing_errors:
286
- # error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
287
- # print(error_msg)
288
  # return error_msg, None, None
289
-
290
- # if not parsed_report.audit_paras:
291
- # return "Could not find any audit paras in the document.", None, None
292
 
293
- # # --- Step 3: Get harmonised titles ---
294
- # progress_update += "\nGenerating harmonised titles..."
295
- # print(progress_update)
296
- # original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
297
-
298
  # if not original_headings:
299
- # return "Found audit paras, but could not extract any headings to harmonise.", None, None
300
-
301
- # # UPDATED: Pass the full_text to the harmonisation function for better context
302
- # harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
303
-
304
  # if not harmonised_results:
305
  # return "Failed to generate harmonised titles.", None, None
306
-
307
- # # --- Step 4: Combine data and prepare for output ---
308
- # progress_update += "\nCombining data and preparing output..."
309
- # print(progress_update)
310
 
311
- # # Create a mapping from original heading to harmonised heading
312
  # harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
313
-
314
- # # Combine all data into a list of dictionaries for display and download
315
  # final_data_list = []
316
  # for para in parsed_report.audit_paras:
317
- # para_dict = para.dict()
318
- # header_dict = parsed_report.header.dict() if parsed_report.header else {}
319
-
320
- # # Combine header and para info
321
- # combined_info = {**header_dict, **para_dict}
322
-
323
- # # Add the harmonised heading
324
- # harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
325
- # combined_info['harmonised_audit_para_heading'] = harmonised_heading
326
  # final_data_list.append(combined_info)
327
-
328
- # # --- Step 5: Generate HTML report and Excel file ---
329
  # html_output = create_html_report(final_data_list)
330
-
 
331
  # df = pd.DataFrame(final_data_list)
332
- # # Reorder columns for clarity in the Excel file
333
  # excel_columns = [
334
- # 'gstin', 'trade_name', 'category', 'audit_group_number',
335
- # 'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
336
- # 'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
337
- # 'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
338
  # ]
339
  # df = df.reindex(columns=excel_columns).fillna('N/A')
340
 
341
- # # Save to an in-memory buffer
342
  # output_excel = BytesIO()
343
- # with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
344
- # df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
345
  # output_excel.seek(0)
346
 
347
  # excel_file_name = "dar_extraction_report.xlsx"
348
  # with open(excel_file_name, "wb") as f:
349
  # f.write(output_excel.getbuffer())
350
 
351
- # return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
352
-
353
 
354
  # # --- Gradio Interface Definition ---
355
  # with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
356
- # gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
357
- # gr.Markdown("## Audit 1 Commissionerate Mumbai")
358
- # gr.Markdown(
359
- # "Upload a Draft Audit Report (DAR) in PDF format. "
360
- # "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
  # )
363
 
364
- # with gr.Row():
365
- # with gr.Column(scale=1):
366
- # pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
367
- # submit_btn = gr.Button("Process Report", variant="primary")
368
- # with gr.Column(scale=2):
369
- # status_output = gr.Textbox(label="Processing Status", interactive=False)
370
- # excel_output = gr.File(label="Download Excel Report")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
- # gr.Markdown("## Harmonised Audit Para Titles")
373
- # html_output = gr.HTML()
 
 
374
 
375
- # submit_btn.click(
376
- # fn=process_dar_pdf,
377
- # inputs=[pdf_input],
378
- # outputs=[status_output, html_output, excel_output]
379
- # )
380
 
381
- # if __name__ == "__main__":
382
- # demo.launch(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  from io import BytesIO
4
  import os
 
5
 
6
  # These imports assume the other python files (dar_processor.py, etc.) are in the same directory.
7
  from dar_processor import preprocess_pdf_text
 
111
  gr.Markdown("# DAR PDF Harmonisation Tool")
112
  gr.Markdown(
113
  "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
114
+ "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY`, `APP_USERNAME`, and `APP_PASSWORD`.*"
115
  )
116
  with gr.Row():
117
  with gr.Column(scale=1):
 
132
  # --- Login Functionality ---
133
  def login(username, password):
134
  """
135
+ Checks user credentials against secrets.
136
+ For production, these are loaded from Hugging Face secrets.
137
  """
138
+ # Get credentials from Hugging Face secrets.
139
+ # NOTE: there is no fallback for local testing; if these secrets are not set, both values are None and the comparison below will not succeed.
140
+ auth_username = os.environ.get("APP_USERNAME")
141
+ auth_password = os.environ.get("APP_PASSWORD")
 
 
142
 
143
+ is_valid_user = (username == auth_username and password == auth_password)
 
 
144
 
145
+ if is_valid_user:
 
 
 
 
 
 
 
146
  # Login successful: hide login UI, show main app
147
  return {
148
  login_ui: gr.update(visible=False),
 
169
  # import pandas as pd
170
  # from io import BytesIO
171
  # import os
172
+ # import json
173
 
174
+ # # These imports assume the other python files (dar_processor.py, etc.) are in the same directory.
175
  # from dar_processor import preprocess_pdf_text
176
  # from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
177
  # from models import ParsedDARReport, HarmonisedPara
 
180
  # """Generates an HTML string to display the results in a styled table."""
181
  # if not results_with_harmonised:
182
  # return "<p>No audit paras found or processed.</p>"
 
 
183
  # style = """
184
  # <style>
185
  # body { font-family: sans-serif; }
186
  # .styled-table {
187
+ # border-collapse: collapse; margin: 25px 0; font-size: 0.9em;
188
+ # min-width: 400px; box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
189
+ # border-radius: 8px; overflow: hidden;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  # }
191
+ # .styled-table thead tr { background-color: #009879; color: #ffffff; text-align: left; }
192
+ # .styled-table th, .styled-table td { padding: 12px 15px; border-bottom: 1px solid #dddddd; }
193
+ # .styled-table tbody tr:last-of-type { border-bottom: 2px solid #009879; }
194
  # </style>
195
  # """
196
+ # html = f"{style}<table class='styled-table'><thead><tr><th>Para No.</th><th>Original Audit Para Heading</th><th>Harmonised Audit Para Heading</th><th>Amount Involved (in Lakhs)</th></tr></thead><tbody>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  # for item in results_with_harmonised:
198
  # para_num = item.get('audit_para_number', 'N/A')
199
  # original_heading = item.get('audit_para_heading', 'N/A')
200
  # harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
201
  # amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
202
+ # html += f"<tr><td>{para_num}</td><td>{original_heading}</td><td>{harmonised_heading}</td><td>{amount}</td></tr>"
 
 
 
 
 
 
 
 
203
  # html += "</tbody></table>"
204
  # return html
205
 
 
206
  # def process_dar_pdf(pdf_file):
207
+ # """The main processing function, called after successful login."""
 
 
 
 
208
  # gemini_api_key = os.environ.get("GEMINI_API_KEY")
 
209
  # if not pdf_file:
210
  # return "Please upload a PDF file.", None, None
 
211
  # if not gemini_api_key:
212
+ # return "Error: GEMINI_API_KEY secret not found in Space settings.", None, None
213
 
214
+ # # Step 1: Process PDF to text
 
 
215
  # full_text = preprocess_pdf_text(pdf_file.name)
216
  # if full_text.startswith("Error"):
217
  # return f"Failed to process PDF: {full_text}", None, None
218
 
219
+ # # Step 2: Extract structured data
220
+ # parsed_report = get_structured_data_with_gemini(gemini_api_key, full_text)
221
+ # if parsed_report.parsing_errors or not parsed_report.audit_paras:
222
+ # error_msg = parsed_report.parsing_errors or "Could not find any audit paras."
 
 
 
 
223
  # return error_msg, None, None
 
 
 
224
 
225
+ # # Step 3: Get harmonised titles
226
+ # original_headings = [p.audit_para_heading for p in parsed_report.audit_paras if p.audit_para_heading]
 
 
 
227
  # if not original_headings:
228
+ # return "Found paras but no headings to harmonise.", None, None
229
+
230
+ # harmonised_results = get_harmonised_titles(gemini_api_key, full_text, original_headings)
 
 
231
  # if not harmonised_results:
232
  # return "Failed to generate harmonised titles.", None, None
 
 
 
 
233
 
234
+ # # Step 4: Combine and prepare outputs
235
  # harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
 
 
236
  # final_data_list = []
237
  # for para in parsed_report.audit_paras:
238
+ # combined_info = (parsed_report.header.dict() if parsed_report.header else {}) | para.dict()
239
+ # combined_info['harmonised_audit_para_heading'] = harmonised_map.get(para.audit_para_heading, "N/A")
 
 
 
 
 
 
 
240
  # final_data_list.append(combined_info)
241
+
 
242
  # html_output = create_html_report(final_data_list)
243
+
244
+ # # Step 5: Create Excel file for download
245
  # df = pd.DataFrame(final_data_list)
 
246
  # excel_columns = [
247
+ # 'gstin', 'trade_name', 'category', 'audit_group_number', 'audit_para_number',
248
+ # 'audit_para_heading', 'harmonised_audit_para_heading', 'revenue_involved_lakhs_rs',
249
+ # 'revenue_recovered_lakhs_rs', 'status_of_para', 'total_amount_detected_overall_rs',
250
+ # 'total_amount_recovered_overall_rs'
251
  # ]
252
  # df = df.reindex(columns=excel_columns).fillna('N/A')
253
 
 
254
  # output_excel = BytesIO()
255
+ # df.to_excel(output_excel, index=False, sheet_name='DAR_Extraction')
 
256
  # output_excel.seek(0)
257
 
258
  # excel_file_name = "dar_extraction_report.xlsx"
259
  # with open(excel_file_name, "wb") as f:
260
  # f.write(output_excel.getbuffer())
261
 
262
+ # return "Processing complete.", html_output, gr.File(value=excel_file_name)
 
263
 
264
  # # --- Gradio Interface Definition ---
265
  # with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
266
+
267
+ # # --- Login UI (visible initially) ---
268
+ # with gr.Column(visible=True) as login_ui:
269
+ # gr.Markdown("# DAR Harmonisation Tool Login")
270
+ # gr.Markdown("Please enter the credentials to access the tool.")
271
+ # with gr.Row():
272
+ # username_input = gr.Textbox(label="Username", placeholder="Enter your username")
273
+ # password_input = gr.Textbox(label="Password", type="password", placeholder="Enter your password")
274
+ # login_button = gr.Button("Login", variant="primary")
275
+ # login_error_msg = gr.Markdown(visible=False)
276
+
277
+ # # --- Main App UI (hidden initially) ---
278
+ # with gr.Column(visible=False) as main_app_ui:
279
+ # gr.Markdown("# DAR PDF Harmonisation Tool")
280
+ # gr.Markdown(
281
+ # "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
282
+ # "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY` and `APP_CREDENTIALS_JSON`.*"
283
+ # )
284
+ # with gr.Row():
285
+ # with gr.Column(scale=1):
286
+ # pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
287
+ # submit_btn = gr.Button("Process Report", variant="primary")
288
+ # with gr.Column(scale=2):
289
+ # status_output = gr.Textbox(label="Processing Status", interactive=False)
290
+ # excel_output = gr.File(label="Download Excel Report")
291
+ # gr.Markdown("## Harmonised Audit Para Titles")
292
+ # html_output = gr.HTML()
293
+
294
+ # submit_btn.click(
295
+ # fn=process_dar_pdf,
296
+ # inputs=[pdf_input],
297
+ # outputs=[status_output, html_output, excel_output]
298
+ # )
299
+
300
+ # # --- Login Functionality ---
301
+ # def login(username, password):
302
+ # """
303
+ # Checks user credentials against a dictionary.
304
+ # For production, this dictionary is loaded from a Hugging Face secret.
305
+ # """
306
+ # # Default credentials for local testing if secret is not found
307
+ # default_creds = {
308
+ # "admin": "iloveaudit1",
309
+ # "planning_officer": "pco_password",
310
+ # "audit_group1": "ag1_password"
311
+ # }
312
+
313
+ # # In production, load credentials from a HF secret as a JSON string.
314
+ # # e.g., '{"admin": "iloveaudit1", "planning_officer": "pco_password"}'
315
+ # auth_creds_json = os.environ.get("APP_CREDENTIALS_JSON")
316
 
317
+ # try:
318
+ # # Use credentials from secret if available and valid JSON
319
+ # creds = json.loads(auth_creds_json) if auth_creds_json else default_creds
320
+ # except json.JSONDecodeError:
321
+ # # Fallback to default if secret contains invalid JSON
322
+ # creds = default_creds
323
+
324
+ # if username in creds and password == creds.get(username):
325
+ # # Login successful: hide login UI, show main app
326
+ # return {
327
+ # login_ui: gr.update(visible=False),
328
+ # main_app_ui: gr.update(visible=True),
329
+ # login_error_msg: gr.update(visible=False)
330
+ # }
331
+ # else:
332
+ # # Login failed: keep login UI visible, show error message
333
+ # return {
334
+ # login_ui: gr.update(visible=True),
335
+ # main_app_ui: gr.update(visible=False),
336
+ # login_error_msg: gr.update(value="<p style='color:red;'>Invalid username or password.</p>", visible=True)
337
+ # }
338
+
339
+ # login_button.click(
340
+ # login,
341
+ # inputs=[username_input, password_input],
342
+ # outputs=[login_ui, main_app_ui, login_error_msg]
343
  # )
344
 
345
+ # if __name__ == "__main__":
346
+ # demo.launch(debug=True)
347
+ # # import gradio as gr
348
+ # # import pandas as pd
349
+ # # from io import BytesIO
350
+ # # import os
351
+
352
+ # # from dar_processor import preprocess_pdf_text
353
+ # # from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
354
+ # # from models import ParsedDARReport, HarmonisedPara
355
+
356
+ # # def create_html_report(results_with_harmonised: list[dict]) -> str:
357
+ # # """Generates an HTML string to display the results in a styled table."""
358
+ # # if not results_with_harmonised:
359
+ # # return "<p>No audit paras found or processed.</p>"
360
+
361
+ # # # Basic CSS for styling
362
+ # # style = """
363
+ # # <style>
364
+ # # body { font-family: sans-serif; }
365
+ # # .styled-table {
366
+ # # border-collapse: collapse;
367
+ # # margin: 25px 0;
368
+ # # font-size: 0.9em;
369
+ # # min-width: 400px;
370
+ # # box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
371
+ # # border-radius: 8px;
372
+ # # overflow: hidden;
373
+ # # }
374
+ # # .styled-table thead tr {
375
+ # # background-color: #009879;
376
+ # # color: #ffffff;
377
+ # # text-align: left;
378
+ # # }
379
+ # # .styled-table th, .styled-table td {
380
+ # # padding: 12px 15px;
381
+ # # border-bottom: 1px solid #dddddd;
382
+ # # }
383
+ # # .styled-table tbody tr:last-of-type {
384
+ # # border-bottom: 2px solid #009879;
385
+ # # }
386
+ # # .styled-table tbody tr.active-row {
387
+ # # font-weight: bold;
388
+ # # color: #009879;
389
+ # # }
390
+ # # </style>
391
+ # # """
392
+
393
+ # # # Table header
394
+ # # html = f"{style}<table class='styled-table'>"
395
+ # # html += """
396
+ # # <thead>
397
+ # # <tr>
398
+ # # <th>Para No.</th>
399
+ # # <th>Original Audit Para Heading</th>
400
+ # # <th>Harmonised Audit Para Heading</th>
401
+ # # <th>Amount Involved (in Lakhs)</th>
402
+ # # </tr>
403
+ # # </thead>
404
+ # # """
405
+
406
+ # # # Table body
407
+ # # html += "<tbody>"
408
+ # # for item in results_with_harmonised:
409
+ # # para_num = item.get('audit_para_number', 'N/A')
410
+ # # original_heading = item.get('audit_para_heading', 'N/A')
411
+ # # harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
412
+ # # amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
413
+
414
+ # # html += f"""
415
+ # # <tr>
416
+ # # <td>{para_num}</td>
417
+ # # <td>{original_heading}</td>
418
+ # # <td>{harmonised_heading}</td>
419
+ # # <td>{amount}</td>
420
+ # # </tr>
421
+ # # """
422
+ # # html += "</tbody></table>"
423
+ # # return html
424
+
425
+
426
+ # # def process_dar_pdf(pdf_file):
427
+ # # """
428
+ # # The main function for the Gradio interface. It processes the PDF,
429
+ # # extracts data, gets harmonised titles, and returns the results.
430
+ # # """
431
+ # # # Get API Key from environment secrets
432
+ # # gemini_api_key = os.environ.get("GEMINI_API_KEY")
433
+
434
+ # # if not pdf_file:
435
+ # # return "Please upload a PDF file.", None, None
436
+
437
+ # # if not gemini_api_key:
438
+ # # return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
439
+
440
+ # # # --- Step 1: Process PDF to text ---
441
+ # # progress_update = "Processing PDF to text..."
442
+ # # print(progress_update)
443
+ # # full_text = preprocess_pdf_text(pdf_file.name)
444
+ # # if full_text.startswith("Error"):
445
+ # # return f"Failed to process PDF: {full_text}", None, None
446
+
447
+ # # # --- Step 2: Extract structured data from text ---
448
+ # # progress_update += "\nExtracting structured data from DAR text..."
449
+ # # print(progress_update)
450
+ # # parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
451
+
452
+ # # if parsed_report.parsing_errors:
453
+ # # error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
454
+ # # print(error_msg)
455
+ # # return error_msg, None, None
456
+
457
+ # # if not parsed_report.audit_paras:
458
+ # # return "Could not find any audit paras in the document.", None, None
459
 
460
+ # # # --- Step 3: Get harmonised titles ---
461
+ # # progress_update += "\nGenerating harmonised titles..."
462
+ # # print(progress_update)
463
+ # # original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
464
 
465
+ # # if not original_headings:
466
+ # # return "Found audit paras, but could not extract any headings to harmonise.", None, None
 
 
 
467
 
468
+ # # # UPDATED: Pass the full_text to the harmonisation function for better context
469
+ # # harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
470
+
471
+ # # if not harmonised_results:
472
+ # # return "Failed to generate harmonised titles.", None, None
473
+
474
+ # # # --- Step 4: Combine data and prepare for output ---
475
+ # # progress_update += "\nCombining data and preparing output..."
476
+ # # print(progress_update)
477
+
478
+ # # # Create a mapping from original heading to harmonised heading
479
+ # # harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
480
+
481
+ # # # Combine all data into a list of dictionaries for display and download
482
+ # # final_data_list = []
483
+ # # for para in parsed_report.audit_paras:
484
+ # # para_dict = para.dict()
485
+ # # header_dict = parsed_report.header.dict() if parsed_report.header else {}
486
+
487
+ # # # Combine header and para info
488
+ # # combined_info = {**header_dict, **para_dict}
489
+
490
+ # # # Add the harmonised heading
491
+ # # harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
492
+ # # combined_info['harmonised_audit_para_heading'] = harmonised_heading
493
+ # # final_data_list.append(combined_info)
494
+
495
+ # # # --- Step 5: Generate HTML report and Excel file ---
496
+ # # html_output = create_html_report(final_data_list)
497
+
498
+ # # df = pd.DataFrame(final_data_list)
499
+ # # # Reorder columns for clarity in the Excel file
500
+ # # excel_columns = [
501
+ # # 'gstin', 'trade_name', 'category', 'audit_group_number',
502
+ # # 'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
503
+ # # 'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
504
+ # # 'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
505
+ # # ]
506
+ # # df = df.reindex(columns=excel_columns).fillna('N/A')
507
+
508
+ # # # Save to an in-memory buffer
509
+ # # output_excel = BytesIO()
510
+ # # with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
511
+ # # df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
512
+ # # output_excel.seek(0)
513
+
514
+ # # excel_file_name = "dar_extraction_report.xlsx"
515
+ # # with open(excel_file_name, "wb") as f:
516
+ # # f.write(output_excel.getbuffer())
517
+
518
+ # # return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
519
+
520
+
521
+ # # # --- Gradio Interface Definition ---
522
+ # # with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
523
+ # # gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
524
+ # # gr.Markdown("## Audit 1 Commissionerate Mumbai")
525
+ # # gr.Markdown(
526
+ # # "Upload a Draft Audit Report (DAR) in PDF format. "
527
+ # # "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
528
+
529
+ # # )
530
+
531
+ # # with gr.Row():
532
+ # # with gr.Column(scale=1):
533
+ # # pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
534
+ # # submit_btn = gr.Button("Process Report", variant="primary")
535
+ # # with gr.Column(scale=2):
536
+ # # status_output = gr.Textbox(label="Processing Status", interactive=False)
537
+ # # excel_output = gr.File(label="Download Excel Report")
538
+
539
+ # # gr.Markdown("## Harmonised Audit Para Titles")
540
+ # # html_output = gr.HTML()
541
+
542
+ # # submit_btn.click(
543
+ # # fn=process_dar_pdf,
544
+ # # inputs=[pdf_input],
545
+ # # outputs=[status_output, html_output, excel_output]
546
+ # # )
547
+
548
+ # # if __name__ == "__main__":
549
+ # # demo.launch(debug=True)