rameshmoorthy committed on
Commit
ba31f91
·
verified ·
1 Parent(s): d92fc43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +312 -133
app.py CHANGED
@@ -2,7 +2,9 @@ import gradio as gr
2
  import pandas as pd
3
  from io import BytesIO
4
  import os
 
5
 
 
6
  from dar_processor import preprocess_pdf_text
7
  from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
8
  from models import ParsedDARReport, HarmonisedPara
@@ -11,193 +13,370 @@ def create_html_report(results_with_harmonised: list[dict]) -> str:
11
  """Generates an HTML string to display the results in a styled table."""
12
  if not results_with_harmonised:
13
  return "<p>No audit paras found or processed.</p>"
14
-
15
- # Basic CSS for styling
16
  style = """
17
  <style>
18
  body { font-family: sans-serif; }
19
  .styled-table {
20
- border-collapse: collapse;
21
- margin: 25px 0;
22
- font-size: 0.9em;
23
- min-width: 400px;
24
- box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
25
- border-radius: 8px;
26
- overflow: hidden;
27
- }
28
- .styled-table thead tr {
29
- background-color: #009879;
30
- color: #ffffff;
31
- text-align: left;
32
- }
33
- .styled-table th, .styled-table td {
34
- padding: 12px 15px;
35
- border-bottom: 1px solid #dddddd;
36
- }
37
- .styled-table tbody tr:last-of-type {
38
- border-bottom: 2px solid #009879;
39
- }
40
- .styled-table tbody tr.active-row {
41
- font-weight: bold;
42
- color: #009879;
43
  }
 
 
 
44
  </style>
45
  """
46
-
47
- # Table header
48
- html = f"{style}<table class='styled-table'>"
49
- html += """
50
- <thead>
51
- <tr>
52
- <th>Para No.</th>
53
- <th>Original Audit Para Heading</th>
54
- <th>Harmonised Audit Para Heading</th>
55
- <th>Amount Involved (in Lakhs)</th>
56
- </tr>
57
- </thead>
58
- """
59
-
60
- # Table body
61
- html += "<tbody>"
62
  for item in results_with_harmonised:
63
  para_num = item.get('audit_para_number', 'N/A')
64
  original_heading = item.get('audit_para_heading', 'N/A')
65
  harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
66
  amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
67
-
68
- html += f"""
69
- <tr>
70
- <td>{para_num}</td>
71
- <td>{original_heading}</td>
72
- <td>{harmonised_heading}</td>
73
- <td>{amount}</td>
74
- </tr>
75
- """
76
  html += "</tbody></table>"
77
  return html
78
 
79
-
80
  def process_dar_pdf(pdf_file):
81
- """
82
- The main function for the Gradio interface. It processes the PDF,
83
- extracts data, gets harmonised titles, and returns the results.
84
- """
85
- # Get API Key from environment secrets
86
  gemini_api_key = os.environ.get("GEMINI_API_KEY")
87
-
88
  if not pdf_file:
89
  return "Please upload a PDF file.", None, None
90
-
91
  if not gemini_api_key:
92
- return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
93
 
94
- # --- Step 1: Process PDF to text ---
95
- progress_update = "Processing PDF to text..."
96
- print(progress_update)
97
  full_text = preprocess_pdf_text(pdf_file.name)
98
  if full_text.startswith("Error"):
99
  return f"Failed to process PDF: {full_text}", None, None
100
 
101
- # --- Step 2: Extract structured data from text ---
102
- progress_update += "\nExtracting structured data from DAR text..."
103
- print(progress_update)
104
- parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
105
-
106
- if parsed_report.parsing_errors:
107
- error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
108
- print(error_msg)
109
  return error_msg, None, None
110
-
111
- if not parsed_report.audit_paras:
112
- return "Could not find any audit paras in the document.", None, None
113
 
114
- # --- Step 3: Get harmonised titles ---
115
- progress_update += "\nGenerating harmonised titles..."
116
- print(progress_update)
117
- original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
118
-
119
  if not original_headings:
120
- return "Found audit paras, but could not extract any headings to harmonise.", None, None
121
-
122
- # UPDATED: Pass the full_text to the harmonisation function for better context
123
- harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
124
-
125
  if not harmonised_results:
126
  return "Failed to generate harmonised titles.", None, None
127
-
128
- # --- Step 4: Combine data and prepare for output ---
129
- progress_update += "\nCombining data and preparing output..."
130
- print(progress_update)
131
 
132
- # Create a mapping from original heading to harmonised heading
133
  harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
134
-
135
- # Combine all data into a list of dictionaries for display and download
136
  final_data_list = []
137
  for para in parsed_report.audit_paras:
138
- para_dict = para.dict()
139
- header_dict = parsed_report.header.dict() if parsed_report.header else {}
140
-
141
- # Combine header and para info
142
- combined_info = {**header_dict, **para_dict}
143
-
144
- # Add the harmonised heading
145
- harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
146
- combined_info['harmonised_audit_para_heading'] = harmonised_heading
147
  final_data_list.append(combined_info)
148
-
149
- # --- Step 5: Generate HTML report and Excel file ---
150
  html_output = create_html_report(final_data_list)
151
-
 
152
  df = pd.DataFrame(final_data_list)
153
- # Reorder columns for clarity in the Excel file
154
  excel_columns = [
155
- 'gstin', 'trade_name', 'category', 'audit_group_number',
156
- 'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
157
- 'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
158
- 'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
159
  ]
160
  df = df.reindex(columns=excel_columns).fillna('N/A')
161
 
162
- # Save to an in-memory buffer
163
  output_excel = BytesIO()
164
- with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
165
- df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
166
  output_excel.seek(0)
167
 
168
  excel_file_name = "dar_extraction_report.xlsx"
169
  with open(excel_file_name, "wb") as f:
170
  f.write(output_excel.getbuffer())
171
 
172
- return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
173
-
174
 
175
  # --- Gradio Interface Definition ---
176
  with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
177
- gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
178
- gr.Markdown("## Audit 1 Commissionerate Mumbai")
179
- gr.Markdown(
180
- "Upload a Draft Audit Report (DAR) in PDF format. "
181
- "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- )
 
 
 
 
 
 
 
 
 
184
 
185
- with gr.Row():
186
- with gr.Column(scale=1):
187
- pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
188
- submit_btn = gr.Button("Process Report", variant="primary")
189
- with gr.Column(scale=2):
190
- status_output = gr.Textbox(label="Processing Status", interactive=False)
191
- excel_output = gr.File(label="Download Excel Report")
 
 
 
 
 
 
 
192
 
193
- gr.Markdown("## Harmonised Audit Para Titles")
194
- html_output = gr.HTML()
195
-
196
- submit_btn.click(
197
- fn=process_dar_pdf,
198
- inputs=[pdf_input],
199
- outputs=[status_output, html_output, excel_output]
200
  )
201
 
202
  if __name__ == "__main__":
203
- demo.launch(debug=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import pandas as pd
3
  from io import BytesIO
4
  import os
5
+ import json
6
 
7
+ # These imports assume the other python files (dar_processor.py, etc.) are in the same directory.
8
  from dar_processor import preprocess_pdf_text
9
  from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
10
  from models import ParsedDARReport, HarmonisedPara
 
13
  """Generates an HTML string to display the results in a styled table."""
14
  if not results_with_harmonised:
15
  return "<p>No audit paras found or processed.</p>"
 
 
16
  style = """
17
  <style>
18
  body { font-family: sans-serif; }
19
  .styled-table {
20
+ border-collapse: collapse; margin: 25px 0; font-size: 0.9em;
21
+ min-width: 400px; box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
22
+ border-radius: 8px; overflow: hidden;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
+ .styled-table thead tr { background-color: #009879; color: #ffffff; text-align: left; }
25
+ .styled-table th, .styled-table td { padding: 12px 15px; border-bottom: 1px solid #dddddd; }
26
+ .styled-table tbody tr:last-of-type { border-bottom: 2px solid #009879; }
27
  </style>
28
  """
29
+ html = f"{style}<table class='styled-table'><thead><tr><th>Para No.</th><th>Original Audit Para Heading</th><th>Harmonised Audit Para Heading</th><th>Amount Involved (in Lakhs)</th></tr></thead><tbody>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  for item in results_with_harmonised:
31
  para_num = item.get('audit_para_number', 'N/A')
32
  original_heading = item.get('audit_para_heading', 'N/A')
33
  harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
34
  amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
35
+ html += f"<tr><td>{para_num}</td><td>{original_heading}</td><td>{harmonised_heading}</td><td>{amount}</td></tr>"
 
 
 
 
 
 
 
 
36
  html += "</tbody></table>"
37
  return html
38
 
 
39
  def process_dar_pdf(pdf_file):
40
+ """The main processing function, called after successful login."""
 
 
 
 
41
  gemini_api_key = os.environ.get("GEMINI_API_KEY")
 
42
  if not pdf_file:
43
  return "Please upload a PDF file.", None, None
 
44
  if not gemini_api_key:
45
+ return "Error: GEMINI_API_KEY secret not found in Space settings.", None, None
46
 
47
+ # Step 1: Process PDF to text
 
 
48
  full_text = preprocess_pdf_text(pdf_file.name)
49
  if full_text.startswith("Error"):
50
  return f"Failed to process PDF: {full_text}", None, None
51
 
52
+ # Step 2: Extract structured data
53
+ parsed_report = get_structured_data_with_gemini(gemini_api_key, full_text)
54
+ if parsed_report.parsing_errors or not parsed_report.audit_paras:
55
+ error_msg = parsed_report.parsing_errors or "Could not find any audit paras."
 
 
 
 
56
  return error_msg, None, None
 
 
 
57
 
58
+ # Step 3: Get harmonised titles
59
+ original_headings = [p.audit_para_heading for p in parsed_report.audit_paras if p.audit_para_heading]
 
 
 
60
  if not original_headings:
61
+ return "Found paras but no headings to harmonise.", None, None
62
+
63
+ harmonised_results = get_harmonised_titles(gemini_api_key, full_text, original_headings)
 
 
64
  if not harmonised_results:
65
  return "Failed to generate harmonised titles.", None, None
 
 
 
 
66
 
67
+ # Step 4: Combine and prepare outputs
68
  harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
 
 
69
  final_data_list = []
70
  for para in parsed_report.audit_paras:
71
+ combined_info = (parsed_report.header.dict() if parsed_report.header else {}) | para.dict()
72
+ combined_info['harmonised_audit_para_heading'] = harmonised_map.get(para.audit_para_heading, "N/A")
 
 
 
 
 
 
 
73
  final_data_list.append(combined_info)
74
+
 
75
  html_output = create_html_report(final_data_list)
76
+
77
+ # Step 5: Create Excel file for download
78
  df = pd.DataFrame(final_data_list)
 
79
  excel_columns = [
80
+ 'gstin', 'trade_name', 'category', 'audit_group_number', 'audit_para_number',
81
+ 'audit_para_heading', 'harmonised_audit_para_heading', 'revenue_involved_lakhs_rs',
82
+ 'revenue_recovered_lakhs_rs', 'status_of_para', 'total_amount_detected_overall_rs',
83
+ 'total_amount_recovered_overall_rs'
84
  ]
85
  df = df.reindex(columns=excel_columns).fillna('N/A')
86
 
 
87
  output_excel = BytesIO()
88
+ df.to_excel(output_excel, index=False, sheet_name='DAR_Extraction')
 
89
  output_excel.seek(0)
90
 
91
  excel_file_name = "dar_extraction_report.xlsx"
92
  with open(excel_file_name, "wb") as f:
93
  f.write(output_excel.getbuffer())
94
 
95
+ return "Processing complete.", html_output, gr.File(value=excel_file_name)
 
96
 
97
  # --- Gradio Interface Definition ---
98
  with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
99
+
100
+ # --- Login UI (visible initially) ---
101
+ with gr.Column(visible=True) as login_ui:
102
+ gr.Markdown("# DAR Harmonisation Tool Login")
103
+ gr.Markdown("Please enter the credentials to access the tool.")
104
+ with gr.Row():
105
+ username_input = gr.Textbox(label="Username", placeholder="Enter your username")
106
+ password_input = gr.Textbox(label="Password", type="password", placeholder="Enter your password")
107
+ login_button = gr.Button("Login", variant="primary")
108
+ login_error_msg = gr.Markdown(visible=False)
109
+
110
+ # --- Main App UI (hidden initially) ---
111
+ with gr.Column(visible=False) as main_app_ui:
112
+ gr.Markdown("# DAR PDF Harmonisation Tool")
113
+ gr.Markdown(
114
+ "Upload a Departmental Audit Report (DAR) in PDF format. The tool will process it and generate harmonised titles."
115
+ "\n*Note: This application requires secrets to be set in the Hugging Face Space settings: `GEMINI_API_KEY` and `APP_CREDENTIALS_JSON`.*"
116
+ )
117
+ with gr.Row():
118
+ with gr.Column(scale=1):
119
+ pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
120
+ submit_btn = gr.Button("Process Report", variant="primary")
121
+ with gr.Column(scale=2):
122
+ status_output = gr.Textbox(label="Processing Status", interactive=False)
123
+ excel_output = gr.File(label="Download Excel Report")
124
+ gr.Markdown("## Harmonised Audit Para Titles")
125
+ html_output = gr.HTML()
126
+
127
+ submit_btn.click(
128
+ fn=process_dar_pdf,
129
+ inputs=[pdf_input],
130
+ outputs=[status_output, html_output, excel_output]
131
+ )
132
+
133
+ # --- Login Functionality ---
134
def login(username, password):
    """Validate the submitted credentials and toggle the login/main panels.

    Credentials are read from the ``APP_CREDENTIALS_JSON`` environment
    variable, expected to hold a JSON object mapping username to password,
    e.g. ``{"admin": "...", "planning_officer": "..."}``. The built-in
    defaults are used only when that variable is unset or empty. If the
    variable is set but is not a valid JSON object, every login is rejected
    instead of silently enabling the defaults.

    Args:
        username: Text from the username box.
        password: Text from the password box.

    Returns:
        A dict keyed by ``login_ui``, ``main_app_ui`` and ``login_error_msg``
        whose values are ``gr.update(...)`` visibility/value changes.
    """
    # Local import so this block does not depend on a file-level import.
    import hmac

    # SECURITY NOTE(review): these fallback passwords are committed in source
    # and are only suitable for local testing. Deployments should always set
    # APP_CREDENTIALS_JSON; consider removing the defaults entirely.
    default_creds = {
        "admin": "iloveaudit1",
        "planning_officer": "pco_password",
        "audit_group1": "ag1_password"
    }

    auth_creds_json = os.environ.get("APP_CREDENTIALS_JSON")
    if not auth_creds_json:
        creds = default_creds
    else:
        try:
            creds = json.loads(auth_creds_json)
        except json.JSONDecodeError:
            # Fail closed: a broken secret must not re-enable the defaults.
            creds = {}
        if not isinstance(creds, dict):
            # Valid JSON of the wrong shape (list, string, ...) is unusable.
            creds = {}

    expected = creds.get(username)
    # Constant-time comparison avoids leaking password prefixes via timing.
    # Non-string stored values can never match the textbox string.
    authenticated = (
        isinstance(expected, str)
        and isinstance(password, str)
        and hmac.compare_digest(expected.encode("utf-8"), password.encode("utf-8"))
    )

    if authenticated:
        # Login successful: hide login UI, show main app.
        return {
            login_ui: gr.update(visible=False),
            main_app_ui: gr.update(visible=True),
            login_error_msg: gr.update(visible=False)
        }

    # Login failed: keep login UI visible, show error message.
    return {
        login_ui: gr.update(visible=True),
        main_app_ui: gr.update(visible=False),
        login_error_msg: gr.update(value="<p style='color:red;'>Invalid username or password.</p>", visible=True)
    }
171
 
172
+ login_button.click(
173
+ login,
174
+ inputs=[username_input, password_input],
175
+ outputs=[login_ui, main_app_ui, login_error_msg]
 
 
 
176
  )
177
 
178
  if __name__ == "__main__":
179
+ demo.launch(debug=True)
180
+ # import gradio as gr
181
+ # import pandas as pd
182
+ # from io import BytesIO
183
+ # import os
184
+
185
+ # from dar_processor import preprocess_pdf_text
186
+ # from gemini_utils import get_structured_data_with_gemini, get_harmonised_titles
187
+ # from models import ParsedDARReport, HarmonisedPara
188
+
189
+ # def create_html_report(results_with_harmonised: list[dict]) -> str:
190
+ # """Generates an HTML string to display the results in a styled table."""
191
+ # if not results_with_harmonised:
192
+ # return "<p>No audit paras found or processed.</p>"
193
+
194
+ # # Basic CSS for styling
195
+ # style = """
196
+ # <style>
197
+ # body { font-family: sans-serif; }
198
+ # .styled-table {
199
+ # border-collapse: collapse;
200
+ # margin: 25px 0;
201
+ # font-size: 0.9em;
202
+ # min-width: 400px;
203
+ # box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
204
+ # border-radius: 8px;
205
+ # overflow: hidden;
206
+ # }
207
+ # .styled-table thead tr {
208
+ # background-color: #009879;
209
+ # color: #ffffff;
210
+ # text-align: left;
211
+ # }
212
+ # .styled-table th, .styled-table td {
213
+ # padding: 12px 15px;
214
+ # border-bottom: 1px solid #dddddd;
215
+ # }
216
+ # .styled-table tbody tr:last-of-type {
217
+ # border-bottom: 2px solid #009879;
218
+ # }
219
+ # .styled-table tbody tr.active-row {
220
+ # font-weight: bold;
221
+ # color: #009879;
222
+ # }
223
+ # </style>
224
+ # """
225
+
226
+ # # Table header
227
+ # html = f"{style}<table class='styled-table'>"
228
+ # html += """
229
+ # <thead>
230
+ # <tr>
231
+ # <th>Para No.</th>
232
+ # <th>Original Audit Para Heading</th>
233
+ # <th>Harmonised Audit Para Heading</th>
234
+ # <th>Amount Involved (in Lakhs)</th>
235
+ # </tr>
236
+ # </thead>
237
+ # """
238
+
239
+ # # Table body
240
+ # html += "<tbody>"
241
+ # for item in results_with_harmonised:
242
+ # para_num = item.get('audit_para_number', 'N/A')
243
+ # original_heading = item.get('audit_para_heading', 'N/A')
244
+ # harmonised_heading = item.get('harmonised_audit_para_heading', 'N/A')
245
+ # amount = f"₹{item.get('revenue_involved_lakhs_rs', 0.0):,.2f} L"
246
+
247
+ # html += f"""
248
+ # <tr>
249
+ # <td>{para_num}</td>
250
+ # <td>{original_heading}</td>
251
+ # <td>{harmonised_heading}</td>
252
+ # <td>{amount}</td>
253
+ # </tr>
254
+ # """
255
+ # html += "</tbody></table>"
256
+ # return html
257
+
258
+
259
+ # def process_dar_pdf(pdf_file):
260
+ # """
261
+ # The main function for the Gradio interface. It processes the PDF,
262
+ # extracts data, gets harmonised titles, and returns the results.
263
+ # """
264
+ # # Get API Key from environment secrets
265
+ # gemini_api_key = os.environ.get("GEMINI_API_KEY")
266
+
267
+ # if not pdf_file:
268
+ # return "Please upload a PDF file.", None, None
269
+
270
+ # if not gemini_api_key:
271
+ # return "Error: GEMINI_API_KEY secret not found. Please configure it in your Hugging Face Space settings.", None, None
272
+
273
+ # # --- Step 1: Process PDF to text ---
274
+ # progress_update = "Processing PDF to text..."
275
+ # print(progress_update)
276
+ # full_text = preprocess_pdf_text(pdf_file.name)
277
+ # if full_text.startswith("Error"):
278
+ # return f"Failed to process PDF: {full_text}", None, None
279
+
280
+ # # --- Step 2: Extract structured data from text ---
281
+ # progress_update += "\nExtracting structured data from DAR text..."
282
+ # print(progress_update)
283
+ # parsed_report: ParsedDARReport = get_structured_data_with_gemini(gemini_api_key, full_text)
284
+
285
+ # if parsed_report.parsing_errors:
286
+ # error_msg = f"Error during data extraction: {parsed_report.parsing_errors}"
287
+ # print(error_msg)
288
+ # return error_msg, None, None
289
+
290
+ # if not parsed_report.audit_paras:
291
+ # return "Could not find any audit paras in the document.", None, None
292
+
293
+ # # --- Step 3: Get harmonised titles ---
294
+ # progress_update += "\nGenerating harmonised titles..."
295
+ # print(progress_update)
296
+ # original_headings = [para.audit_para_heading for para in parsed_report.audit_paras if para.audit_para_heading]
297
+
298
+ # if not original_headings:
299
+ # return "Found audit paras, but could not extract any headings to harmonise.", None, None
300
+
301
+ # # UPDATED: Pass the full_text to the harmonisation function for better context
302
+ # harmonised_results: list[HarmonisedPara] = get_harmonised_titles(gemini_api_key, full_text, original_headings)
303
+
304
+ # if not harmonised_results:
305
+ # return "Failed to generate harmonised titles.", None, None
306
+
307
+ # # --- Step 4: Combine data and prepare for output ---
308
+ # progress_update += "\nCombining data and preparing output..."
309
+ # print(progress_update)
310
+
311
+ # # Create a mapping from original heading to harmonised heading
312
+ # harmonised_map = {item.original_heading: item.harmonised_heading for item in harmonised_results}
313
+
314
+ # # Combine all data into a list of dictionaries for display and download
315
+ # final_data_list = []
316
+ # for para in parsed_report.audit_paras:
317
+ # para_dict = para.dict()
318
+ # header_dict = parsed_report.header.dict() if parsed_report.header else {}
319
+
320
+ # # Combine header and para info
321
+ # combined_info = {**header_dict, **para_dict}
322
+
323
+ # # Add the harmonised heading
324
+ # harmonised_heading = harmonised_map.get(para.audit_para_heading, "N/A")
325
+ # combined_info['harmonised_audit_para_heading'] = harmonised_heading
326
+ # final_data_list.append(combined_info)
327
+
328
+ # # --- Step 5: Generate HTML report and Excel file ---
329
+ # html_output = create_html_report(final_data_list)
330
+
331
+ # df = pd.DataFrame(final_data_list)
332
+ # # Reorder columns for clarity in the Excel file
333
+ # excel_columns = [
334
+ # 'gstin', 'trade_name', 'category', 'audit_group_number',
335
+ # 'audit_para_number', 'audit_para_heading', 'harmonised_audit_para_heading',
336
+ # 'revenue_involved_lakhs_rs', 'revenue_recovered_lakhs_rs', 'status_of_para',
337
+ # 'total_amount_detected_overall_rs', 'total_amount_recovered_overall_rs'
338
+ # ]
339
+ # df = df.reindex(columns=excel_columns).fillna('N/A')
340
+
341
+ # # Save to an in-memory buffer
342
+ # output_excel = BytesIO()
343
+ # with pd.ExcelWriter(output_excel, engine='openpyxl') as writer:
344
+ # df.to_excel(writer, index=False, sheet_name='DAR_Extraction')
345
+ # output_excel.seek(0)
346
+
347
+ # excel_file_name = "dar_extraction_report.xlsx"
348
+ # with open(excel_file_name, "wb") as f:
349
+ # f.write(output_excel.getbuffer())
350
+
351
+ # return "Processing complete.", html_output, gr.File(value=excel_file_name, label="Download Excel Report")
352
+
353
+
354
+ # # --- Gradio Interface Definition ---
355
+ # with gr.Blocks(theme=gr.themes.Soft(), title="DAR Harmonisation Tool") as demo:
356
+ # gr.Markdown("# Draft Audit Report DAR Harmonisation Tool")
357
+ # gr.Markdown("## Audit 1 Commissionerate Mumbai")
358
+ # gr.Markdown(
359
+ # "Upload a Draft Audit Report (DAR) in PDF format. "
360
+ # "The tool will extract audit para details, generate standardised para titles, and provide an Excel download of the results."
361
+
362
+ # )
363
+
364
+ # with gr.Row():
365
+ # with gr.Column(scale=1):
366
+ # pdf_input = gr.File(label="Upload DAR PDF", file_types=[".pdf"])
367
+ # submit_btn = gr.Button("Process Report", variant="primary")
368
+ # with gr.Column(scale=2):
369
+ # status_output = gr.Textbox(label="Processing Status", interactive=False)
370
+ # excel_output = gr.File(label="Download Excel Report")
371
+
372
+ # gr.Markdown("## Harmonised Audit Para Titles")
373
+ # html_output = gr.HTML()
374
+
375
+ # submit_btn.click(
376
+ # fn=process_dar_pdf,
377
+ # inputs=[pdf_input],
378
+ # outputs=[status_output, html_output, excel_output]
379
+ # )
380
+
381
+ # if __name__ == "__main__":
382
+ # demo.launch(debug=True)