Bhuvi13 committed on
Commit
5dac2a1
·
verified ·
1 Parent(s): 504246e

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +410 -366
src/streamlit_app.py CHANGED
@@ -37,7 +37,7 @@ else: # Linux/Mac (HF Spaces uses Linux)
37
  pass
38
 
39
  # Page configuration
40
- st.set_page_config(page_title="Remittance Data Viewer", layout="wide")
41
 
42
  # Custom CSS
43
  st.markdown("""
@@ -141,10 +141,34 @@ def load_jsonl(file):
141
  data.append(json.loads(line))
142
  return data
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def save_to_jsonl(data):
145
- """Convert data list to JSONL format"""
146
- jsonl_content = '\n'.join([json.dumps(record) for record in data])
147
- return jsonl_content
 
 
 
148
 
149
  def pdf_to_images(pdf_file):
150
  """Convert PDF to list of PIL Images (one per page)"""
@@ -155,8 +179,6 @@ def pdf_to_images(pdf_file):
155
 
156
  for page_num in range(pdf_document.page_count):
157
  page = pdf_document[page_num]
158
- # Render page to an image (higher DPI for better quality)
159
- # Using 3x zoom (300 DPI equivalent) for much better clarity
160
  pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), alpha=False)
161
  img_data = pix.tobytes("png")
162
  img = Image.open(io.BytesIO(img_data))
@@ -182,7 +204,6 @@ def perform_ocr(image, bbox):
182
 
183
  def scale_image_to_fixed_size(image, max_width=900, max_height=1100):
184
  """Scale image to fit within max dimensions while maintaining aspect ratio - NO PADDING"""
185
- # Convert to RGB with proper handling
186
  if image.mode not in ('RGB', 'RGBA'):
187
  image = image.convert('RGB')
188
  elif image.mode == 'RGBA':
@@ -190,7 +211,6 @@ def scale_image_to_fixed_size(image, max_width=900, max_height=1100):
190
  background.paste(image, mask=image.split()[3])
191
  image = background
192
 
193
- # Calculate scaling ratio
194
  width_ratio = max_width / image.width
195
  height_ratio = max_height / image.height
196
  ratio = min(width_ratio, height_ratio)
@@ -198,34 +218,47 @@ def scale_image_to_fixed_size(image, max_width=900, max_height=1100):
198
  new_width = int(image.width * ratio)
199
  new_height = int(image.height * ratio)
200
 
201
- # Always use LANCZOS for highest quality resampling
202
  resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
203
 
204
- # Return without padding - image takes only the space it needs
205
  return resized_image, ratio, 0, 0
206
 
207
- def swap_customer_supplier_details(index):
208
- """Swap customer and supplier details"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  gt_parse = st.session_state.edited_data[index].get('gt_parse', {})
210
- customer_supplier = gt_parse.get('Customer_supplier_details', {})
211
 
212
- # Store customer values
213
- temp_customer_name = customer_supplier.get('Customer_name', '')
214
- temp_customer_address = customer_supplier.get('Customer_address', '')
215
- temp_customer_contact = customer_supplier.get('Customer_contact_info', '')
216
 
217
- # Swap: Customer ← Supplier
218
- customer_supplier['Customer_name'] = customer_supplier.get('Supplier_name', '')
219
- customer_supplier['Customer_address'] = customer_supplier.get('Supplier_address', '')
220
- customer_supplier['Customer_contact_info'] = customer_supplier.get('Supplier_contact_info', '')
221
 
222
- # Swap: Supplier ← Customer (from temp)
223
- customer_supplier['Supplier_name'] = temp_customer_name
224
- customer_supplier['Supplier_address'] = temp_customer_address
225
- customer_supplier['Supplier_contact_info'] = temp_customer_contact
226
 
227
  # Update session state
228
- gt_parse['Customer_supplier_details'] = customer_supplier
229
  st.session_state.edited_data[index]['gt_parse'] = gt_parse
230
  st.session_state.modified_indices.add(index)
231
 
@@ -241,9 +274,9 @@ if 'page' not in st.session_state:
241
  if 'images' not in st.session_state:
242
  st.session_state.images = {}
243
  if 'pdf_metadata' not in st.session_state:
244
- st.session_state.pdf_metadata = {} # Store {filename: {'pages': [images], 'current_page': 0}}
245
  if 'current_page_num' not in st.session_state:
246
- st.session_state.current_page_num = {} # Track current page for each file
247
  if 'modified_indices' not in st.session_state:
248
  st.session_state.modified_indices = set()
249
  if 'ocr_active_section' not in st.session_state:
@@ -254,8 +287,6 @@ if 'ocr_line_item_row' not in st.session_state:
254
  st.session_state.ocr_line_item_row = None
255
  if 'canvas_key' not in st.session_state:
256
  st.session_state.canvas_key = 0
257
- if 'line_items_temp' not in st.session_state:
258
- st.session_state.line_items_temp = []
259
  if 'button_clicked' not in st.session_state:
260
  st.session_state.button_clicked = False
261
  if 'save_message' not in st.session_state:
@@ -272,15 +303,63 @@ if 'navigating_page' not in st.session_state:
272
  def auto_save(index):
273
  """Automatically save changes to session state and mark as modified"""
274
  if st.session_state.edited_data:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  st.session_state.data = st.session_state.edited_data.copy()
276
  st.session_state.modified_indices.add(index)
277
 
278
  def activate_ocr_field(section, field, row_idx=None):
279
- """Activate OCR for a specific field.
280
- Toggle behavior: if the same field is already active, deactivate it to avoid repeated activations/looping.
281
- Also ensures the line-item expander stays expanded when OCR is requested.
282
- """
283
- # If the requested field is already active, deactivate it (toggle off)
284
  if (st.session_state.ocr_active_section == section and
285
  st.session_state.ocr_active_field == field and
286
  st.session_state.ocr_line_item_row == row_idx):
@@ -288,18 +367,15 @@ def activate_ocr_field(section, field, row_idx=None):
288
  st.session_state.ocr_active_field = None
289
  st.session_state.ocr_line_item_row = None
290
  else:
291
- # Activate new OCR target
292
  st.session_state.ocr_active_section = section
293
  st.session_state.ocr_active_field = field
294
  st.session_state.ocr_line_item_row = row_idx
295
 
296
- # If it's a line-item, mark that expander as expanded so it remains open after rerun
297
- if section == 'Line_items' and row_idx is not None:
298
  current_idx = st.session_state.get('current_index', 0)
299
  expander_key = f"line_item_expander_{current_idx}_{row_idx}"
300
  st.session_state[expander_key] = True
301
 
302
- # Bump canvas_key to ensure canvas is refreshed/cleared when toggling OCR
303
  st.session_state.canvas_key += 1
304
  st.rerun()
305
 
@@ -311,7 +387,7 @@ def is_ocr_active(section, field, row_idx=None):
311
 
312
  # PAGE 1: Upload Page
313
  if st.session_state.page == 'upload':
314
- st.title("πŸ“€ Remittance Data Viewer with OCR")
315
  st.markdown("### Upload your files to begin")
316
 
317
  st.markdown("**Step 1: Upload JSONL File**")
@@ -344,20 +420,15 @@ if st.session_state.page == 'upload':
344
  file_ext = file.name.lower().split('.')[-1]
345
 
346
  if file_ext == 'pdf':
347
- # Convert PDF to images
348
  pdf_images = pdf_to_images(file)
349
  if pdf_images:
350
- # Store first page as the main image
351
  images_dict[file.name] = pdf_images[0]
352
- # Store all pages in metadata
353
  pdf_metadata[file.name] = {
354
  'pages': pdf_images,
355
  'total_pages': len(pdf_images),
356
  'current_page': 0
357
  }
358
- #st.info(f"πŸ“„ Converted PDF '{file.name}' ({len(pdf_images)} pages)")
359
  else:
360
- # Handle regular images
361
  image = Image.open(file)
362
  images_dict[file.name] = image
363
 
@@ -367,26 +438,27 @@ if st.session_state.page == 'upload':
367
  st.session_state.images = images_dict
368
  st.session_state.pdf_metadata = pdf_metadata
369
 
370
- # Initialize current page tracking
371
  for filename in pdf_metadata.keys():
372
  if filename not in st.session_state.current_page_num:
373
  st.session_state.current_page_num[filename] = 0
374
 
375
  if st.session_state.data is not None:
376
- gt_file_names = [rec.get('file_name', '') for rec in st.session_state.data]
 
 
 
 
 
377
  matched_images = set()
378
  unmatched_gt_files = []
379
 
380
- # Try to match with and without extensions
381
  for fname in gt_file_names:
382
  if not fname:
383
  continue
384
 
385
- # Try exact match first
386
  if fname in images_dict:
387
  matched_images.add(fname)
388
  else:
389
- # Try adding common extensions
390
  found = False
391
  for ext in ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']:
392
  if fname + ext in images_dict:
@@ -394,7 +466,6 @@ if st.session_state.page == 'upload':
394
  found = True
395
  break
396
 
397
- # Try matching filename without extension from uploaded files
398
  if not found:
399
  for uploaded_name in images_dict.keys():
400
  uploaded_base = uploaded_name.rsplit('.', 1)[0]
@@ -458,7 +529,7 @@ elif st.session_state.page == 'viewer':
458
  st.download_button(
459
  label=f"⬇️ Download Modified ({len(modified_data)})",
460
  data=jsonl_modified,
461
- file_name=f"modified_remittance_data_{today_date}.jsonl",
462
  mime="application/jsonl",
463
  type="primary",
464
  use_container_width=True
@@ -474,7 +545,7 @@ elif st.session_state.page == 'viewer':
474
  st.download_button(
475
  label=f"⬇️ Download Unmodified ({len(unmodified_data)})",
476
  data=jsonl_unmodified,
477
- file_name=f"unmodified_remittance_data_{today_date}.jsonl",
478
  mime="application/jsonl",
479
  use_container_width=True
480
  )
@@ -486,29 +557,28 @@ elif st.session_state.page == 'viewer':
486
  st.download_button(
487
  label=f"⬇️ Download All ({len(st.session_state.edited_data)})",
488
  data=jsonl_all,
489
- file_name=f"all_remittance_data_{today_date}.jsonl",
490
  mime="application/jsonl",
491
  use_container_width=True
492
  )
493
 
494
- file_names = [record.get('file_name', f'Record {i}') for i, record in enumerate(st.session_state.data or [])]
 
 
 
 
495
 
496
- # Guard: no records at all
497
  if not file_names:
498
  st.error("No records loaded. Please upload a JSONL file on the Upload page.")
499
  if st.button("← Back to Upload"):
500
  st.session_state.page = 'upload'
501
  st.rerun()
502
  else:
503
- # Build options (list is safer than range for length checks)
504
  options = list(range(len(file_names)))
505
 
506
- # Ensure edited_data exists and has consistent length
507
  if not st.session_state.edited_data or len(st.session_state.edited_data) != len(file_names):
508
- # try to sync edited_data to data
509
  st.session_state.edited_data = (st.session_state.data or []).copy()
510
 
511
- # Clamp current_index into valid range
512
  cur_idx = st.session_state.get('current_index', 0)
513
  try:
514
  cur_idx = int(cur_idx)
@@ -519,7 +589,6 @@ elif st.session_state.page == 'viewer':
519
  if cur_idx >= len(options):
520
  cur_idx = len(options) - 1
521
 
522
- # Show selectbox with a safe index
523
  selected_file = st.selectbox(
524
  "Select a file to view:",
525
  options=options,
@@ -527,10 +596,7 @@ elif st.session_state.page == 'viewer':
527
  index=cur_idx
528
  )
529
 
530
- # Persist chosen index
531
  st.session_state.current_index = selected_file
532
-
533
- # Safe access to the current record
534
  current_record = st.session_state.edited_data[selected_file]
535
 
536
  left_col, right_col = st.columns([1.6, 1.0], gap="small")
@@ -538,21 +604,19 @@ elif st.session_state.page == 'viewer':
538
  # LEFT SIDE: Image Display with OCR Canvas
539
  with left_col:
540
  with st.container(height=700, border=False):
541
- file_name = current_record.get('file_name', '')
 
542
 
543
  if file_name:
544
- # Find the actual file name (handle cases where extension is missing)
545
  actual_file_name = None
546
  if file_name in st.session_state.images:
547
  actual_file_name = file_name
548
  else:
549
- # Try adding common extensions
550
  for ext in ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']:
551
  if file_name + ext in st.session_state.images:
552
  actual_file_name = file_name + ext
553
  break
554
 
555
- # Try matching without extension
556
  if not actual_file_name:
557
  for uploaded_name in st.session_state.images.keys():
558
  uploaded_base = uploaded_name.rsplit('.', 1)[0]
@@ -561,7 +625,6 @@ elif st.session_state.page == 'viewer':
561
  break
562
 
563
  if actual_file_name:
564
- # Check if this is a PDF with multiple pages
565
  is_pdf = actual_file_name in st.session_state.pdf_metadata
566
 
567
  if is_pdf:
@@ -569,7 +632,6 @@ elif st.session_state.page == 'viewer':
569
  total_pages = pdf_meta['total_pages']
570
  current_page = st.session_state.current_page_num.get(actual_file_name, 0)
571
 
572
- # PDF Navigation Header
573
  col_prev, col_info, col_next = st.columns([1, 2, 1])
574
 
575
  with col_prev:
@@ -583,7 +645,6 @@ elif st.session_state.page == 'viewer':
583
  next_clicked = st.button("Next ➑️", key=f"next_page_{selected_file}_{actual_file_name}",
584
  disabled=(current_page >= total_pages - 1), use_container_width=True)
585
 
586
- # Handle navigation only if not already navigating
587
  if not st.session_state.navigating_page:
588
  if prev_clicked:
589
  st.session_state.navigating_page = True
@@ -600,15 +661,12 @@ elif st.session_state.page == 'viewer':
600
  st.session_state.ocr_active_field = None
601
  st.rerun()
602
  else:
603
- # Reset the flag after rerun
604
  st.session_state.navigating_page = False
605
 
606
  if actual_file_name:
607
- # Determine if PDF and get the appropriate image
608
  is_pdf = actual_file_name in st.session_state.pdf_metadata
609
 
610
  if is_pdf:
611
- # Get the current page image
612
  current_page = st.session_state.current_page_num.get(actual_file_name, 0)
613
  pdf_meta = st.session_state.pdf_metadata[actual_file_name]
614
  current_image = pdf_meta['pages'][current_page]
@@ -622,12 +680,11 @@ elif st.session_state.page == 'viewer':
622
  st.text(f" β€’ {img_name}")
623
  if len(st.session_state.images) > 20:
624
  st.text(f" ... and {len(st.session_state.images) - 20} more")
 
625
 
626
  if current_image:
627
- # Scale to a reasonable size so canvas doesn't become excessively large
628
  scaled_image, scale_ratio, paste_x, paste_y = scale_image_to_fixed_size(current_image)
629
 
630
- # Render the canvas. Its internal canvas will be constrained by the wrapper due to CSS above.
631
  canvas_result = st_canvas(
632
  fill_color="rgba(255, 165, 0, 0.3)",
633
  stroke_width=2,
@@ -640,7 +697,6 @@ elif st.session_state.page == 'viewer':
640
  key=f"canvas_{selected_file}_{st.session_state.canvas_key}",
641
  )
642
 
643
- # Only attempt OCR if there's an active OCR target AND the user has drawn something (objects exist)
644
  if canvas_result.json_data is not None and st.session_state.ocr_active_field:
645
  objects = canvas_result.json_data.get("objects", [])
646
  if len(objects) > 0:
@@ -661,14 +717,13 @@ elif st.session_state.page == 'viewer':
661
 
662
  gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
663
 
664
- if st.session_state.ocr_active_section == 'Line_items':
665
- line_items = gt_parse.get('Line_items', [])
666
  row_idx = st.session_state.ocr_line_item_row
667
- if row_idx is not None and row_idx < len(line_items):
668
- line_items[row_idx][st.session_state.ocr_active_field] = ocr_text
669
- gt_parse['Line_items'] = line_items
670
 
671
- # ensure expander stays open for this row after OCR
672
  expander_key = f"line_item_expander_{selected_file}_{row_idx}"
673
  st.session_state[expander_key] = True
674
  else:
@@ -681,10 +736,6 @@ elif st.session_state.page == 'viewer':
681
  st.session_state.edited_data[selected_file]['gt_parse'] = gt_parse
682
  st.session_state.modified_indices.add(selected_file)
683
 
684
- # Keep the OCR field active so user can draw multiple rectangles for the same field
685
- # Field will only change when user clicks a different OCR button
686
-
687
- # Clear canvas for next OCR by bumping canvas_key then rerun
688
  st.session_state.canvas_key += 1
689
  st.rerun()
690
  else:
@@ -695,322 +746,366 @@ elif st.session_state.page == 'viewer':
695
  # RIGHT SIDE: Editable Details
696
  with right_col:
697
  with st.container(height=700, border=False):
698
- st.markdown("### πŸ“ Document Details")
699
 
700
  gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
701
 
702
  tab1, tab2, tab3, tab4 = st.tabs([
703
- "πŸ“„ Remittance Details",
704
  "πŸ‘₯ Party Details",
705
  "🏦 Bank Details",
706
  "πŸ“‹ Line Items"
707
  ])
708
 
709
- # TAB 1: Remittance Details
710
  with tab1:
711
- remittance = gt_parse.get('Remittance_details', {})
 
712
 
713
- # Each field with OCR button
 
 
714
  col_input, col_btn = st.columns([5, 1])
715
  with col_input:
716
- remittance['Remittance_adv_no'] = st.text_input(
717
- "Remittance Advice No",
718
- value=remittance.get('Remittance_adv_no', ''),
719
- key=f"rem_adv_no_{selected_file}"
720
  )
721
  with col_btn:
722
  st.markdown("<br>", unsafe_allow_html=True)
723
- if st.button("πŸ”", key=f"ocr_rem_adv_no_{selected_file}",
724
- type="primary" if is_ocr_active('Remittance_details', 'Remittance_adv_no') else "secondary"):
725
- activate_ocr_field('Remittance_details', 'Remittance_adv_no')
726
 
 
727
  col_input, col_btn = st.columns([5, 1])
728
  with col_input:
729
- remittance['Remittance_adv_date'] = st.text_input(
730
- "Remittance Advice Date",
731
- value=remittance.get('Remittance_adv_date', ''),
732
- key=f"rem_adv_date_{selected_file}"
733
  )
734
  with col_btn:
735
  st.markdown("<br>", unsafe_allow_html=True)
736
- if st.button("πŸ”", key=f"ocr_rem_adv_date_{selected_file}",
737
- type="primary" if is_ocr_active('Remittance_details', 'Remittance_adv_date') else "secondary"):
738
- activate_ocr_field('Remittance_details', 'Remittance_adv_date')
739
 
 
740
  col_input, col_btn = st.columns([5, 1])
741
  with col_input:
742
- remittance['Payment_method'] = st.text_input(
743
- "Payment Method",
744
- value=remittance.get('Payment_method', ''),
745
- key=f"payment_method_{selected_file}"
746
  )
747
  with col_btn:
748
  st.markdown("<br>", unsafe_allow_html=True)
749
- if st.button("πŸ”", key=f"ocr_payment_method_{selected_file}",
750
- type="primary" if is_ocr_active('Remittance_details', 'Payment_method') else "secondary"):
751
- activate_ocr_field('Remittance_details', 'Payment_method')
 
 
752
 
 
753
  col_input, col_btn = st.columns([5, 1])
754
  with col_input:
755
- remittance['FCY'] = st.text_input(
756
- "FCY (Foreign Currency)",
757
- value=remittance.get('FCY', ''),
758
- key=f"fcy_{selected_file}"
759
  )
760
  with col_btn:
761
  st.markdown("<br>", unsafe_allow_html=True)
762
- if st.button("πŸ”", key=f"ocr_fcy_{selected_file}",
763
- type="primary" if is_ocr_active('Remittance_details', 'FCY') else "secondary"):
764
- activate_ocr_field('Remittance_details', 'FCY')
765
 
 
766
  col_input, col_btn = st.columns([5, 1])
767
  with col_input:
768
- remittance['Total_payment_amt_FCY'] = st.text_input(
769
- "Total Payment Amount (FCY)",
770
- value=remittance.get('Total_payment_amt_FCY', ''),
771
- key=f"total_payment_{selected_file}"
772
  )
773
  with col_btn:
774
  st.markdown("<br>", unsafe_allow_html=True)
775
- if st.button("πŸ”", key=f"ocr_total_payment_{selected_file}",
776
- type="primary" if is_ocr_active('Remittance_details', 'Total_payment_amt_FCY') else "secondary"):
777
- activate_ocr_field('Remittance_details', 'Total_payment_amt_FCY')
778
 
 
779
  col_input, col_btn = st.columns([5, 1])
780
  with col_input:
781
- remittance['Payment_date'] = st.text_input(
782
- "Payment Date",
783
- value=remittance.get('Payment_date', ''),
784
- key=f"payment_date_{selected_file}"
785
  )
786
  with col_btn:
787
  st.markdown("<br>", unsafe_allow_html=True)
788
- if st.button("πŸ”", key=f"ocr_payment_date_{selected_file}",
789
- type="primary" if is_ocr_active('Remittance_details', 'Payment_date') else "secondary"):
790
- activate_ocr_field('Remittance_details', 'Payment_date')
791
 
 
792
  col_input, col_btn = st.columns([5, 1])
793
  with col_input:
794
- remittance['Payment_ref_no'] = st.text_input(
795
- "Payment Reference No",
796
- value=remittance.get('Payment_ref_no', ''),
797
- key=f"payment_ref_{selected_file}"
798
  )
799
  with col_btn:
800
  st.markdown("<br>", unsafe_allow_html=True)
801
- if st.button("πŸ”", key=f"ocr_payment_ref_{selected_file}",
802
- type="primary" if is_ocr_active('Remittance_details', 'Payment_ref_no') else "secondary"):
803
- activate_ocr_field('Remittance_details', 'Payment_ref_no')
804
 
805
- gt_parse['Remittance_details'] = remittance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
806
 
807
- # TAB 2: Customer/Supplier Details with SWAP button
808
  with tab2:
809
- # SWAP BUTTON - Centered and prominent
810
  col1, col2, col3 = st.columns([1, 2, 1])
811
  with col2:
812
- if st.button("πŸ”„ Swap Customer ↔ Supplier", key=f"swap_btn_{selected_file}",
813
  type="primary", use_container_width=True):
814
  if not st.session_state.just_swapped:
815
  st.session_state.just_swapped = True
816
- swap_customer_supplier_details(selected_file)
817
  st.rerun()
818
 
819
- # Reset the flag after rerun
820
  if st.session_state.just_swapped:
821
  st.session_state.just_swapped = False
822
 
823
- st.markdown("**Customer Details**")
824
- customer_supplier = gt_parse.get('Customer_supplier_details', {})
825
 
 
826
  col_input, col_btn = st.columns([5, 1])
827
  with col_input:
828
- customer_supplier['Customer_name'] = st.text_input(
829
- "Customer Name",
830
- value=customer_supplier.get('Customer_name', ''),
831
- key=f"cust_name_{selected_file}"
832
  )
833
  with col_btn:
834
  st.markdown("<br>", unsafe_allow_html=True)
835
- if st.button("πŸ”", key=f"ocr_cust_name_{selected_file}",
836
- type="primary" if is_ocr_active('Customer_supplier_details', 'Customer_name') else "secondary"):
837
- activate_ocr_field('Customer_supplier_details', 'Customer_name')
838
 
 
839
  col_input, col_btn = st.columns([5, 1])
840
  with col_input:
841
- customer_supplier['Customer_address'] = st.text_area(
842
- "Customer Address",
843
- value=customer_supplier.get('Customer_address', ''),
844
- key=f"cust_addr_{selected_file}",
845
  height=60
846
  )
847
  with col_btn:
848
  st.markdown("<br>", unsafe_allow_html=True)
849
- if st.button("πŸ”", key=f"ocr_cust_addr_{selected_file}",
850
- type="primary" if is_ocr_active('Customer_supplier_details', 'Customer_address') else "secondary"):
851
- activate_ocr_field('Customer_supplier_details', 'Customer_address')
852
 
853
- col_input, col_btn = st.columns([5, 1])
854
- with col_input:
855
- customer_supplier['Customer_contact_info'] = st.text_input(
856
- "Customer Contact Info",
857
- value=customer_supplier.get('Customer_contact_info', ''),
858
- key=f"cust_contact_{selected_file}"
859
- )
860
- with col_btn:
861
- st.markdown("<br>", unsafe_allow_html=True)
862
- if st.button("πŸ”", key=f"ocr_cust_contact_{selected_file}",
863
- type="primary" if is_ocr_active('Customer_supplier_details', 'Customer_contact_info') else "secondary"):
864
- activate_ocr_field('Customer_supplier_details', 'Customer_contact_info')
865
-
866
- st.markdown("**Supplier Details**")
867
 
 
868
  col_input, col_btn = st.columns([5, 1])
869
  with col_input:
870
- customer_supplier['Supplier_name'] = st.text_input(
871
- "Supplier Name",
872
- value=customer_supplier.get('Supplier_name', ''),
873
- key=f"supp_name_{selected_file}"
874
  )
875
  with col_btn:
876
  st.markdown("<br>", unsafe_allow_html=True)
877
- if st.button("πŸ”", key=f"ocr_supp_name_{selected_file}",
878
- type="primary" if is_ocr_active('Customer_supplier_details', 'Supplier_name') else "secondary"):
879
- activate_ocr_field('Customer_supplier_details', 'Supplier_name')
880
 
 
881
  col_input, col_btn = st.columns([5, 1])
882
  with col_input:
883
- customer_supplier['Supplier_address'] = st.text_area(
884
- "Supplier Address",
885
- value=customer_supplier.get('Supplier_address', ''),
886
- key=f"supp_addr_{selected_file}",
887
  height=60
888
  )
889
  with col_btn:
890
  st.markdown("<br>", unsafe_allow_html=True)
891
- if st.button("πŸ”", key=f"ocr_supp_addr_{selected_file}",
892
- type="primary" if is_ocr_active('Customer_supplier_details', 'Supplier_address') else "secondary"):
893
- activate_ocr_field('Customer_supplier_details', 'Supplier_address')
 
 
 
 
 
 
894
 
 
895
  col_input, col_btn = st.columns([5, 1])
896
  with col_input:
897
- customer_supplier['Supplier_contact_info'] = st.text_input(
898
- "Supplier Contact Info",
899
- value=customer_supplier.get('Supplier_contact_info', ''),
900
- key=f"supp_contact_{selected_file}"
901
  )
902
  with col_btn:
903
  st.markdown("<br>", unsafe_allow_html=True)
904
- if st.button("πŸ”", key=f"ocr_supp_contact_{selected_file}",
905
- type="primary" if is_ocr_active('Customer_supplier_details', 'Supplier_contact_info') else "secondary"):
906
- activate_ocr_field('Customer_supplier_details', 'Supplier_contact_info')
907
-
908
- gt_parse['Customer_supplier_details'] = customer_supplier
909
-
910
- # TAB 3: Bank Details
911
- with tab3:
912
- bank = gt_parse.get('Bank_details', {})
913
 
 
914
  col_input, col_btn = st.columns([5, 1])
915
  with col_input:
916
- bank['Bank_name'] = st.text_input(
917
  "Bank Name",
918
- value=bank.get('Bank_name', ''),
919
  key=f"bank_name_{selected_file}"
920
  )
921
  with col_btn:
922
  st.markdown("<br>", unsafe_allow_html=True)
923
  if st.button("πŸ”", key=f"ocr_bank_name_{selected_file}",
924
- type="primary" if is_ocr_active('Bank_details', 'Bank_name') else "secondary"):
925
- activate_ocr_field('Bank_details', 'Bank_name')
926
 
 
927
  col_input, col_btn = st.columns([5, 1])
928
  with col_input:
929
- bank['Bank_acc_no'] = st.text_input(
930
  "Bank Account No",
931
- value=bank.get('Bank_acc_no', ''),
932
- key=f"bank_acc_{selected_file}"
933
  )
934
  with col_btn:
935
  st.markdown("<br>", unsafe_allow_html=True)
936
- if st.button("πŸ”", key=f"ocr_bank_acc_{selected_file}",
937
- type="primary" if is_ocr_active('Bank_details', 'Bank_acc_no') else "secondary"):
938
- activate_ocr_field('Bank_details', 'Bank_acc_no')
939
 
 
940
  col_input, col_btn = st.columns([5, 1])
941
  with col_input:
942
- bank['Bank_routing_no'] = st.text_input(
943
- "Bank Routing No",
944
- value=bank.get('Bank_routing_no', ''),
945
  key=f"bank_routing_{selected_file}"
946
  )
947
  with col_btn:
948
  st.markdown("<br>", unsafe_allow_html=True)
949
  if st.button("πŸ”", key=f"ocr_bank_routing_{selected_file}",
950
- type="primary" if is_ocr_active('Bank_details', 'Bank_routing_no') else "secondary"):
951
- activate_ocr_field('Bank_details', 'Bank_routing_no')
952
 
 
953
  col_input, col_btn = st.columns([5, 1])
954
  with col_input:
955
- bank['Swift_code'] = st.text_input(
956
- "SWIFT Code",
957
- value=bank.get('Swift_code', ''),
958
- key=f"swift_{selected_file}"
959
  )
960
  with col_btn:
961
  st.markdown("<br>", unsafe_allow_html=True)
962
- if st.button("πŸ”", key=f"ocr_swift_{selected_file}",
963
- type="primary" if is_ocr_active('Bank_details', 'Swift_code') else "secondary"):
964
- activate_ocr_field('Bank_details', 'Swift_code')
965
 
966
- gt_parse['Bank_details'] = bank
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
967
 
968
- # TAB 4: Line Items
969
  with tab4:
970
  current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
971
- line_items = current_gt_parse.get('Line_items', [])
972
 
973
  # Add/Remove row buttons
974
  col_add, col_remove = st.columns([1, 1])
975
  with col_add:
976
- if st.button("βž• Add New Row", key=f"add_row_{selected_file}", use_container_width=True):
977
  if not st.session_state.button_clicked:
978
  st.session_state.button_clicked = True
979
- new_row = {
980
- "Po_number": "", "Invoice_no": "", "Other_doc_ref_no": "",
981
- "Invoice_date": "", "Invoice_amount_FCY": "",
982
- "Amount_paid_for_each_invoice": "", "Outstanding_balance_FCY": "",
983
- "Discounts_taken_FCY": "", "Adjustments(without_holding_tax)_FCY": "",
984
- "Descriptions": ""
985
  }
986
  current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
987
- current_line_items = current_gt_parse.get('Line_items', [])
988
- current_line_items.append(new_row)
989
- current_gt_parse['Line_items'] = current_line_items
990
  st.session_state.edited_data[selected_file]['gt_parse'] = current_gt_parse
991
  st.session_state.modified_indices.add(selected_file)
992
 
993
- # Ensure the newly added row's expander is open
994
- new_idx = len(current_line_items) - 1
995
  expander_key_new = f"line_item_expander_{selected_file}_{new_idx}"
996
  st.session_state[expander_key_new] = True
997
 
998
  st.rerun()
999
 
1000
  with col_remove:
1001
- if st.button("βž– Remove Last Row", key=f"remove_row_{selected_file}",
1002
- disabled=(len(line_items) == 0), use_container_width=True):
1003
- if not st.session_state.button_clicked and len(line_items) > 0:
1004
  st.session_state.button_clicked = True
1005
  current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
1006
- current_line_items = current_gt_parse.get('Line_items', [])
1007
- N = len(current_line_items)
1008
- current_line_items.pop()
1009
- current_gt_parse['Line_items'] = current_line_items
1010
  st.session_state.edited_data[selected_file]['gt_parse'] = current_gt_parse
1011
  st.session_state.modified_indices.add(selected_file)
1012
 
1013
- # Remove the expander flag for the popped row (if present)
1014
  popped_idx = N - 1
1015
  expander_key_popped = f"line_item_expander_{selected_file}_{popped_idx}"
1016
  if expander_key_popped in st.session_state:
@@ -1021,178 +1116,127 @@ elif st.session_state.page == 'viewer':
1021
  if st.session_state.button_clicked:
1022
  st.session_state.button_clicked = False
1023
 
1024
- # Display each row as an expander with OCR buttons
1025
  current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
1026
- line_items = current_gt_parse.get('Line_items', [])
1027
 
1028
- if line_items:
1029
- for idx, item in enumerate(line_items):
1030
- # Use a persistent session_state flag so expansion state is preserved across reruns.
1031
  expander_key = f"line_item_expander_{selected_file}_{idx}"
1032
  expanded_default = st.session_state.get(expander_key, False)
1033
 
1034
- # Note: do NOT pass a 'key' arg to st.expander to maintain compatibility; control expanded via session_state flag.
1035
- with st.expander(f"**Row {idx + 1}** - Invoice: {item.get('Invoice_no', 'N/A')}", expanded=expanded_default):
1036
- # PO Number
1037
- col_input, col_btn = st.columns([5, 1])
1038
- with col_input:
1039
- item['Po_number'] = st.text_input(
1040
- "PO Number",
1041
- value=item.get('Po_number', ''),
1042
- key=f"po_num_{selected_file}_{idx}"
1043
- )
1044
- with col_btn:
1045
- st.markdown("<br>", unsafe_allow_html=True)
1046
- if st.button("πŸ”", key=f"ocr_po_{selected_file}_{idx}",
1047
- type="primary" if is_ocr_active('Line_items', 'Po_number', idx) else "secondary"):
1048
- # ensure expander stays open when user explicitly requests OCR
1049
- st.session_state[expander_key] = True
1050
- activate_ocr_field('Line_items', 'Po_number', idx)
1051
-
1052
- # Invoice No
1053
- col_input, col_btn = st.columns([5, 1])
1054
- with col_input:
1055
- item['Invoice_no'] = st.text_input(
1056
- "Invoice No",
1057
- value=item.get('Invoice_no', ''),
1058
- key=f"inv_no_{selected_file}_{idx}"
1059
- )
1060
- with col_btn:
1061
- st.markdown("<br>", unsafe_allow_html=True)
1062
- if st.button("πŸ”", key=f"ocr_inv_{selected_file}_{idx}",
1063
- type="primary" if is_ocr_active('Line_items', 'Invoice_no', idx) else "secondary"):
1064
- st.session_state[expander_key] = True
1065
- activate_ocr_field('Line_items', 'Invoice_no', idx)
1066
-
1067
- # Other Doc Ref No
1068
- col_input, col_btn = st.columns([5, 1])
1069
- with col_input:
1070
- item['Other_doc_ref_no'] = st.text_input(
1071
- "Other Doc Ref No",
1072
- value=item.get('Other_doc_ref_no', ''),
1073
- key=f"other_doc_{selected_file}_{idx}"
1074
- )
1075
- with col_btn:
1076
- st.markdown("<br>", unsafe_allow_html=True)
1077
- if st.button("πŸ”", key=f"ocr_other_{selected_file}_{idx}",
1078
- type="primary" if is_ocr_active('Line_items', 'Other_doc_ref_no', idx) else "secondary"):
1079
- st.session_state[expander_key] = True
1080
- activate_ocr_field('Line_items', 'Other_doc_ref_no', idx)
1081
-
1082
- # Invoice Date
1083
  col_input, col_btn = st.columns([5, 1])
1084
  with col_input:
1085
- item['Invoice_date'] = st.text_input(
1086
- "Invoice Date",
1087
- value=item.get('Invoice_date', ''),
1088
- key=f"inv_date_{selected_file}_{idx}"
 
1089
  )
1090
  with col_btn:
1091
  st.markdown("<br>", unsafe_allow_html=True)
1092
- if st.button("πŸ”", key=f"ocr_inv_date_{selected_file}_{idx}",
1093
- type="primary" if is_ocr_active('Line_items', 'Invoice_date', idx) else "secondary"):
1094
  st.session_state[expander_key] = True
1095
- activate_ocr_field('Line_items', 'Invoice_date', idx)
1096
 
1097
- # Invoice Amount FCY
1098
  col_input, col_btn = st.columns([5, 1])
1099
  with col_input:
1100
- item['Invoice_amount_FCY'] = st.text_input(
1101
- "Invoice Amount FCY",
1102
- value=item.get('Invoice_amount_FCY', ''),
1103
- key=f"inv_amt_{selected_file}_{idx}"
1104
  )
1105
  with col_btn:
1106
  st.markdown("<br>", unsafe_allow_html=True)
1107
- if st.button("πŸ”", key=f"ocr_inv_amt_{selected_file}_{idx}",
1108
- type="primary" if is_ocr_active('Line_items', 'Invoice_amount_FCY', idx) else "secondary"):
1109
  st.session_state[expander_key] = True
1110
- activate_ocr_field('Line_items', 'Invoice_amount_FCY', idx)
1111
 
1112
- # Amount Paid
1113
  col_input, col_btn = st.columns([5, 1])
1114
  with col_input:
1115
- item['Amount_paid_for_each_invoice'] = st.text_input(
1116
- "Amount Paid",
1117
- value=item.get('Amount_paid_for_each_invoice', ''),
1118
- key=f"amt_paid_{selected_file}_{idx}"
1119
  )
1120
  with col_btn:
1121
  st.markdown("<br>", unsafe_allow_html=True)
1122
- if st.button("πŸ”", key=f"ocr_amt_paid_{selected_file}_{idx}",
1123
- type="primary" if is_ocr_active('Line_items', 'Amount_paid_for_each_invoice', idx) else "secondary"):
1124
  st.session_state[expander_key] = True
1125
- activate_ocr_field('Line_items', 'Amount_paid_for_each_invoice', idx)
1126
 
1127
- # Outstanding Balance
1128
  col_input, col_btn = st.columns([5, 1])
1129
  with col_input:
1130
- item['Outstanding_balance_FCY'] = st.text_input(
1131
- "Outstanding Balance FCY",
1132
- value=item.get('Outstanding_balance_FCY', ''),
1133
- key=f"out_bal_{selected_file}_{idx}"
1134
  )
1135
  with col_btn:
1136
  st.markdown("<br>", unsafe_allow_html=True)
1137
- if st.button("πŸ”", key=f"ocr_out_bal_{selected_file}_{idx}",
1138
- type="primary" if is_ocr_active('Line_items', 'Outstanding_balance_FCY', idx) else "secondary"):
1139
  st.session_state[expander_key] = True
1140
- activate_ocr_field('Line_items', 'Outstanding_balance_FCY', idx)
1141
 
1142
- # Discounts
1143
  col_input, col_btn = st.columns([5, 1])
1144
  with col_input:
1145
- item['Discounts_taken_FCY'] = st.text_input(
1146
- "Discounts Taken FCY",
1147
- value=item.get('Discounts_taken_FCY', ''),
1148
- key=f"disc_{selected_file}_{idx}"
1149
  )
1150
  with col_btn:
1151
  st.markdown("<br>", unsafe_allow_html=True)
1152
- if st.button("πŸ”", key=f"ocr_disc_{selected_file}_{idx}",
1153
- type="primary" if is_ocr_active('Line_items', 'Discounts_taken_FCY', idx) else "secondary"):
1154
  st.session_state[expander_key] = True
1155
- activate_ocr_field('Line_items', 'Discounts_taken_FCY', idx)
1156
 
1157
- # Adjustments
1158
  col_input, col_btn = st.columns([5, 1])
1159
  with col_input:
1160
- item['Adjustments(without_holding_tax)_FCY'] = st.text_input(
1161
- "Adjustments FCY",
1162
- value=item.get('Adjustments(without_holding_tax)_FCY', ''),
1163
- key=f"adj_{selected_file}_{idx}"
1164
  )
1165
  with col_btn:
1166
  st.markdown("<br>", unsafe_allow_html=True)
1167
- if st.button("πŸ”", key=f"ocr_adj_{selected_file}_{idx}",
1168
- type="primary" if is_ocr_active('Line_items', 'Adjustments(without_holding_tax)_FCY', idx) else "secondary"):
1169
  st.session_state[expander_key] = True
1170
- activate_ocr_field('Line_items', 'Adjustments(without_holding_tax)_FCY', idx)
1171
 
1172
- # Descriptions
1173
  col_input, col_btn = st.columns([5, 1])
1174
  with col_input:
1175
- item['Descriptions'] = st.text_area(
1176
- "Descriptions",
1177
- value=item.get('Descriptions', ''),
1178
- key=f"desc_{selected_file}_{idx}",
1179
- height=60
1180
  )
1181
  with col_btn:
1182
  st.markdown("<br>", unsafe_allow_html=True)
1183
- if st.button("πŸ”", key=f"ocr_desc_{selected_file}_{idx}",
1184
- type="primary" if is_ocr_active('Line_items', 'Descriptions', idx) else "secondary"):
1185
  st.session_state[expander_key] = True
1186
- activate_ocr_field('Line_items', 'Descriptions', idx)
1187
 
1188
- # Update line items back to gt_parse
1189
- current_gt_parse['Line_items'] = line_items
1190
 
1191
- st.markdown("**πŸ“Š Line Items Summary Table**")
1192
 
1193
- # Display summary table with index starting from 1
1194
- df = pd.DataFrame(line_items)
1195
- df.index = df.index + 1 # Start index from 1
1196
  df.index.name = 'SL No'
1197
 
1198
  st.dataframe(
@@ -1201,7 +1245,7 @@ elif st.session_state.page == 'viewer':
1201
  height=300
1202
  )
1203
  else:
1204
- st.info("No line items. Click 'βž• Add New Row' to add a new row.")
1205
 
1206
  st.session_state.edited_data[selected_file]['gt_parse'] = gt_parse
1207
 
 
37
  pass
38
 
39
  # Page configuration
40
+ st.set_page_config(page_title="Invoice Data Viewer", layout="wide")
41
 
42
  # Custom CSS
43
  st.markdown("""
 
141
  data.append(json.loads(line))
142
  return data
143
 
144
def reorder_record_fields(record):
    """Return a copy of ``record`` with its keys in export order.

    The order is ``file_name``, ``file_names``, ``gt_parse`` (each only when
    present in ``record``), followed by every remaining key in its original
    relative order.

    Args:
        record: Mapping holding one JSONL record.

    Returns:
        A new ``dict`` with the same key/value pairs. Values are not copied
        and ``record`` itself is left unmodified.
    """
    leading_keys = ('file_name', 'file_names', 'gt_parse')
    ordered_record = {key: record[key] for key in leading_keys if key in record}

    # Append everything else; keys already placed above are skipped so that
    # their leading position is kept.
    for key, value in record.items():
        if key not in ordered_record:
            ordered_record[key] = value

    return ordered_record
164
+
165
def save_to_jsonl(data):
    """Serialize a list of records to a JSONL string.

    Each record is passed through ``reorder_record_fields`` before being
    dumped, so the file-name field(s) and ``gt_parse`` lead every line.

    Args:
        data: Iterable of record mappings.

    Returns:
        The records as JSON documents joined by ``'\\n'`` with no trailing
        newline; an empty string when ``data`` is empty.
    """
    return '\n'.join(
        json.dumps(reorder_record_fields(record)) for record in data
    )
172
 
173
  def pdf_to_images(pdf_file):
174
  """Convert PDF to list of PIL Images (one per page)"""
 
179
 
180
  for page_num in range(pdf_document.page_count):
181
  page = pdf_document[page_num]
 
 
182
  pix = page.get_pixmap(matrix=fitz.Matrix(3, 3), alpha=False)
183
  img_data = pix.tobytes("png")
184
  img = Image.open(io.BytesIO(img_data))
 
204
 
205
  def scale_image_to_fixed_size(image, max_width=900, max_height=1100):
206
  """Scale image to fit within max dimensions while maintaining aspect ratio - NO PADDING"""
 
207
  if image.mode not in ('RGB', 'RGBA'):
208
  image = image.convert('RGB')
209
  elif image.mode == 'RGBA':
 
211
  background.paste(image, mask=image.split()[3])
212
  image = background
213
 
 
214
  width_ratio = max_width / image.width
215
  height_ratio = max_height / image.height
216
  ratio = min(width_ratio, height_ratio)
 
218
  new_width = int(image.width * ratio)
219
  new_height = int(image.height * ratio)
220
 
 
221
  resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
222
 
 
223
  return resized_image, ratio, 0, 0
224
 
225
def get_base_filename(record):
    """Return the base file name for a record.

    Handles both the plural ``file_names`` field and the singular
    ``file_name`` field.

    Args:
        record: Mapping holding one JSONL record.

    Returns:
        * If ``file_names`` is a non-empty list: its first entry with a
          trailing ``.png`` extension and a trailing ``_page<digits>`` suffix
          removed (``"inv_page2.png"`` -> ``"inv"``).
        * If ``file_names`` is truthy but not a list: that value unchanged.
        * Otherwise: ``record['file_name']``, or ``''`` when it is absent.
    """
    file_names = record.get('file_names')
    if file_names:
        if not isinstance(file_names, list):
            return file_names

        base = file_names[0]
        # Strip only a real trailing extension; a ".png" in the middle of
        # the name must not cut the name short.
        if base.lower().endswith('.png'):
            base = base[:-len('.png')]

        # Strip only a genuine page suffix ("_page" followed by digits at the
        # very end). A name that merely contains "_page" (e.g. "scan_pages")
        # is left intact.
        stem, separator, page_number = base.rpartition('_page')
        if separator and page_number.isdecimal():
            base = stem
        return base

    # Fall back to file_name (singular)
    return record.get('file_name', '')
242
+
243
def swap_sender_recipient_details(index):
    """Swap the sender and recipient name/address of one record.

    Exchanges ``sender_name`` with ``rcpt_name`` and ``sender_addr`` with
    ``rcpt_addr`` inside ``gt_parse['header']`` of
    ``st.session_state.edited_data[index]``, writes the result back, and adds
    ``index`` to ``st.session_state.modified_indices``. A missing field is
    treated as an empty string.

    Args:
        index: Position of the record in ``st.session_state.edited_data``.
    """
    record = st.session_state.edited_data[index]
    gt_parse = record.get('gt_parse', {})
    header = gt_parse.get('header', {})

    field_pairs = (
        ('sender_name', 'rcpt_name'),
        ('sender_addr', 'rcpt_addr'),
    )
    for sender_key, rcpt_key in field_pairs:
        # The right-hand side is fully evaluated before assignment, so no
        # temporary variable is needed for the exchange.
        header[sender_key], header[rcpt_key] = (
            header.get(rcpt_key, ''),
            header.get(sender_key, ''),
        )

    # Update session state (also covers the case where 'header' or
    # 'gt_parse' was missing and a fresh dict was created above).
    gt_parse['header'] = header
    record['gt_parse'] = gt_parse
    st.session_state.modified_indices.add(index)
264
 
 
274
  if 'images' not in st.session_state:
275
  st.session_state.images = {}
276
  if 'pdf_metadata' not in st.session_state:
277
+ st.session_state.pdf_metadata = {}
278
  if 'current_page_num' not in st.session_state:
279
+ st.session_state.current_page_num = {}
280
  if 'modified_indices' not in st.session_state:
281
  st.session_state.modified_indices = set()
282
  if 'ocr_active_section' not in st.session_state:
 
287
  st.session_state.ocr_line_item_row = None
288
  if 'canvas_key' not in st.session_state:
289
  st.session_state.canvas_key = 0
 
 
290
  if 'button_clicked' not in st.session_state:
291
  st.session_state.button_clicked = False
292
  if 'save_message' not in st.session_state:
 
303
def auto_save(index):
    """Save the edits of one record to session state and mark it modified.

    When the record resolves to an uploaded PDF, its file-name field is
    rewritten first: a multi-page PDF gets a ``file_names`` list of
    ``<base>_page<N>.png`` entries (and loses ``file_name``); a single-page
    PDF gets ``file_name = "<base>.png"`` (and loses ``file_names``).
    Afterwards ``st.session_state.data`` is replaced by a shallow copy of
    ``st.session_state.edited_data`` and ``index`` is added to
    ``st.session_state.modified_indices``.

    Nothing happens when ``edited_data`` is empty. When the record has no
    file name, a warning is shown and nothing is saved.

    Args:
        index: Position of the record in ``st.session_state.edited_data``.
    """
    # NOTE(review): assumes the final copy/mark statements were nested under
    # the ``edited_data`` check in the original - confirm against the file.
    if not st.session_state.edited_data:
        return

    record = st.session_state.edited_data[index]

    base_file_name = get_base_filename(record)
    if not base_file_name:
        st.warning("Cannot save: No file name found in record")
        return

    actual_file_name = _match_uploaded_name(base_file_name, st.session_state.images)

    # Only PDFs have an entry in pdf_metadata; other uploads keep their
    # file-name field untouched.
    if actual_file_name and actual_file_name in st.session_state.pdf_metadata:
        total_pages = st.session_state.pdf_metadata[actual_file_name]['total_pages']
        base_name = actual_file_name.rsplit('.', 1)[0]

        if total_pages > 1:
            # Multi-page PDF: one PNG name per page, plural field only.
            record['file_names'] = [
                f"{base_name}_page{i+1}.png" for i in range(total_pages)
            ]
            record.pop('file_name', None)
        else:
            # Single-page PDF: singular field only.
            record['file_name'] = f"{base_name}.png"
            record.pop('file_names', None)

    st.session_state.data = st.session_state.edited_data.copy()
    st.session_state.modified_indices.add(index)


def _match_uploaded_name(base_file_name, uploaded_names):
    """Return the uploaded name matching ``base_file_name``, or ``None``.

    Tries, in order: the exact name, the name plus a known extension, and
    finally any uploaded name whose text before the last ``'.'`` equals
    ``base_file_name``.
    """
    if base_file_name in uploaded_names:
        return base_file_name

    for ext in ('.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp'):
        if base_file_name + ext in uploaded_names:
            return base_file_name + ext

    for uploaded_name in uploaded_names:
        if uploaded_name.rsplit('.', 1)[0] == base_file_name:
            return uploaded_name

    return None
360
 
361
  def activate_ocr_field(section, field, row_idx=None):
362
+ """Activate OCR for a specific field"""
 
 
 
 
363
  if (st.session_state.ocr_active_section == section and
364
  st.session_state.ocr_active_field == field and
365
  st.session_state.ocr_line_item_row == row_idx):
 
367
  st.session_state.ocr_active_field = None
368
  st.session_state.ocr_line_item_row = None
369
  else:
 
370
  st.session_state.ocr_active_section = section
371
  st.session_state.ocr_active_field = field
372
  st.session_state.ocr_line_item_row = row_idx
373
 
374
+ if section == 'items' and row_idx is not None:
 
375
  current_idx = st.session_state.get('current_index', 0)
376
  expander_key = f"line_item_expander_{current_idx}_{row_idx}"
377
  st.session_state[expander_key] = True
378
 
 
379
  st.session_state.canvas_key += 1
380
  st.rerun()
381
 
 
387
 
388
  # PAGE 1: Upload Page
389
  if st.session_state.page == 'upload':
390
+ st.title("πŸ“€ Invoice Data Viewer with OCR")
391
  st.markdown("### Upload your files to begin")
392
 
393
  st.markdown("**Step 1: Upload JSONL File**")
 
420
  file_ext = file.name.lower().split('.')[-1]
421
 
422
  if file_ext == 'pdf':
 
423
  pdf_images = pdf_to_images(file)
424
  if pdf_images:
 
425
  images_dict[file.name] = pdf_images[0]
 
426
  pdf_metadata[file.name] = {
427
  'pages': pdf_images,
428
  'total_pages': len(pdf_images),
429
  'current_page': 0
430
  }
 
431
  else:
 
432
  image = Image.open(file)
433
  images_dict[file.name] = image
434
 
 
438
  st.session_state.images = images_dict
439
  st.session_state.pdf_metadata = pdf_metadata
440
 
 
441
  for filename in pdf_metadata.keys():
442
  if filename not in st.session_state.current_page_num:
443
  st.session_state.current_page_num[filename] = 0
444
 
445
  if st.session_state.data is not None:
446
+ gt_file_names = []
447
+ for rec in st.session_state.data:
448
+ base_fname = get_base_filename(rec)
449
+ if base_fname:
450
+ gt_file_names.append(base_fname)
451
+
452
  matched_images = set()
453
  unmatched_gt_files = []
454
 
 
455
  for fname in gt_file_names:
456
  if not fname:
457
  continue
458
 
 
459
  if fname in images_dict:
460
  matched_images.add(fname)
461
  else:
 
462
  found = False
463
  for ext in ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']:
464
  if fname + ext in images_dict:
 
466
  found = True
467
  break
468
 
 
469
  if not found:
470
  for uploaded_name in images_dict.keys():
471
  uploaded_base = uploaded_name.rsplit('.', 1)[0]
 
529
  st.download_button(
530
  label=f"⬇️ Download Modified ({len(modified_data)})",
531
  data=jsonl_modified,
532
+ file_name=f"modified_invoice_data_{today_date}.jsonl",
533
  mime="application/jsonl",
534
  type="primary",
535
  use_container_width=True
 
545
  st.download_button(
546
  label=f"⬇️ Download Unmodified ({len(unmodified_data)})",
547
  data=jsonl_unmodified,
548
+ file_name=f"unmodified_invoice_data_{today_date}.jsonl",
549
  mime="application/jsonl",
550
  use_container_width=True
551
  )
 
557
  st.download_button(
558
  label=f"⬇️ Download All ({len(st.session_state.edited_data)})",
559
  data=jsonl_all,
560
+ file_name=f"all_invoice_data_{today_date}.jsonl",
561
  mime="application/jsonl",
562
  use_container_width=True
563
  )
564
 
565
+ # Build file names list for dropdown using helper function
566
+ file_names = []
567
+ for i, record in enumerate(st.session_state.data or []):
568
+ base_name = get_base_filename(record)
569
+ file_names.append(base_name if base_name else f'Record {i}')
570
 
 
571
  if not file_names:
572
  st.error("No records loaded. Please upload a JSONL file on the Upload page.")
573
  if st.button("← Back to Upload"):
574
  st.session_state.page = 'upload'
575
  st.rerun()
576
  else:
 
577
  options = list(range(len(file_names)))
578
 
 
579
  if not st.session_state.edited_data or len(st.session_state.edited_data) != len(file_names):
 
580
  st.session_state.edited_data = (st.session_state.data or []).copy()
581
 
 
582
  cur_idx = st.session_state.get('current_index', 0)
583
  try:
584
  cur_idx = int(cur_idx)
 
589
  if cur_idx >= len(options):
590
  cur_idx = len(options) - 1
591
 
 
592
  selected_file = st.selectbox(
593
  "Select a file to view:",
594
  options=options,
 
596
  index=cur_idx
597
  )
598
 
 
599
  st.session_state.current_index = selected_file
 
 
600
  current_record = st.session_state.edited_data[selected_file]
601
 
602
  left_col, right_col = st.columns([1.6, 1.0], gap="small")
 
604
  # LEFT SIDE: Image Display with OCR Canvas
605
  with left_col:
606
  with st.container(height=700, border=False):
607
+ # Use helper function to get base file name
608
+ file_name = get_base_filename(current_record)
609
 
610
  if file_name:
 
611
  actual_file_name = None
612
  if file_name in st.session_state.images:
613
  actual_file_name = file_name
614
  else:
 
615
  for ext in ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']:
616
  if file_name + ext in st.session_state.images:
617
  actual_file_name = file_name + ext
618
  break
619
 
 
620
  if not actual_file_name:
621
  for uploaded_name in st.session_state.images.keys():
622
  uploaded_base = uploaded_name.rsplit('.', 1)[0]
 
625
  break
626
 
627
  if actual_file_name:
 
628
  is_pdf = actual_file_name in st.session_state.pdf_metadata
629
 
630
  if is_pdf:
 
632
  total_pages = pdf_meta['total_pages']
633
  current_page = st.session_state.current_page_num.get(actual_file_name, 0)
634
 
 
635
  col_prev, col_info, col_next = st.columns([1, 2, 1])
636
 
637
  with col_prev:
 
645
  next_clicked = st.button("Next ➑️", key=f"next_page_{selected_file}_{actual_file_name}",
646
  disabled=(current_page >= total_pages - 1), use_container_width=True)
647
 
 
648
  if not st.session_state.navigating_page:
649
  if prev_clicked:
650
  st.session_state.navigating_page = True
 
661
  st.session_state.ocr_active_field = None
662
  st.rerun()
663
  else:
 
664
  st.session_state.navigating_page = False
665
 
666
  if actual_file_name:
 
667
  is_pdf = actual_file_name in st.session_state.pdf_metadata
668
 
669
  if is_pdf:
 
670
  current_page = st.session_state.current_page_num.get(actual_file_name, 0)
671
  pdf_meta = st.session_state.pdf_metadata[actual_file_name]
672
  current_image = pdf_meta['pages'][current_page]
 
680
  st.text(f" β€’ {img_name}")
681
  if len(st.session_state.images) > 20:
682
  st.text(f" ... and {len(st.session_state.images) - 20} more")
683
+ current_image = None
684
 
685
  if current_image:
 
686
  scaled_image, scale_ratio, paste_x, paste_y = scale_image_to_fixed_size(current_image)
687
 
 
688
  canvas_result = st_canvas(
689
  fill_color="rgba(255, 165, 0, 0.3)",
690
  stroke_width=2,
 
697
  key=f"canvas_{selected_file}_{st.session_state.canvas_key}",
698
  )
699
 
 
700
  if canvas_result.json_data is not None and st.session_state.ocr_active_field:
701
  objects = canvas_result.json_data.get("objects", [])
702
  if len(objects) > 0:
 
717
 
718
  gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
719
 
720
+ if st.session_state.ocr_active_section == 'items':
721
+ items = gt_parse.get('items', [])
722
  row_idx = st.session_state.ocr_line_item_row
723
+ if row_idx is not None and row_idx < len(items):
724
+ items[row_idx][st.session_state.ocr_active_field] = ocr_text
725
+ gt_parse['items'] = items
726
 
 
727
  expander_key = f"line_item_expander_{selected_file}_{row_idx}"
728
  st.session_state[expander_key] = True
729
  else:
 
736
  st.session_state.edited_data[selected_file]['gt_parse'] = gt_parse
737
  st.session_state.modified_indices.add(selected_file)
738
 
 
 
 
 
739
  st.session_state.canvas_key += 1
740
  st.rerun()
741
  else:
 
746
  # RIGHT SIDE: Editable Details
747
  with right_col:
748
  with st.container(height=700, border=False):
749
+ st.markdown("### πŸ“ Invoice Details")
750
 
751
  gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
752
 
753
  tab1, tab2, tab3, tab4 = st.tabs([
754
+ "πŸ“„ Invoice Details",
755
  "πŸ‘₯ Party Details",
756
  "🏦 Bank Details",
757
  "πŸ“‹ Line Items"
758
  ])
759
 
760
+ # TAB 1: Header (includes invoice details + summary fields)
761
  with tab1:
762
+ header = gt_parse.get('header', {})
763
+ summary = gt_parse.get('summary', {})
764
 
765
+ st.markdown("**Invoice Information**")
766
+
767
+ # Invoice No
768
  col_input, col_btn = st.columns([5, 1])
769
  with col_input:
770
+ header['invoice_no'] = st.text_input(
771
+ "Invoice No",
772
+ value=header.get('invoice_no', ''),
773
+ key=f"invoice_no_{selected_file}"
774
  )
775
  with col_btn:
776
  st.markdown("<br>", unsafe_allow_html=True)
777
+ if st.button("πŸ”", key=f"ocr_invoice_no_{selected_file}",
778
+ type="primary" if is_ocr_active('header', 'invoice_no') else "secondary"):
779
+ activate_ocr_field('header', 'invoice_no')
780
 
781
+ # Invoice Date
782
  col_input, col_btn = st.columns([5, 1])
783
  with col_input:
784
+ header['invoice_date'] = st.text_input(
785
+ "Invoice Date",
786
+ value=header.get('invoice_date', ''),
787
+ key=f"invoice_date_{selected_file}"
788
  )
789
  with col_btn:
790
  st.markdown("<br>", unsafe_allow_html=True)
791
+ if st.button("πŸ”", key=f"ocr_invoice_date_{selected_file}",
792
+ type="primary" if is_ocr_active('header', 'invoice_date') else "secondary"):
793
+ activate_ocr_field('header', 'invoice_date')
794
 
795
+ # Due Date
796
  col_input, col_btn = st.columns([5, 1])
797
  with col_input:
798
+ header['due_date'] = st.text_input(
799
+ "Due Date",
800
+ value=header.get('due_date', ''),
801
+ key=f"due_date_{selected_file}"
802
  )
803
  with col_btn:
804
  st.markdown("<br>", unsafe_allow_html=True)
805
+ if st.button("πŸ”", key=f"ocr_due_date_{selected_file}",
806
+ type="primary" if is_ocr_active('header', 'due_date') else "secondary"):
807
+ activate_ocr_field('header', 'due_date')
808
+
809
+ st.markdown("**Financial Summary**")
810
 
811
+ # Subtotal
812
  col_input, col_btn = st.columns([5, 1])
813
  with col_input:
814
+ summary['subtotal'] = st.text_input(
815
+ "Subtotal",
816
+ value=summary.get('subtotal', ''),
817
+ key=f"subtotal_{selected_file}"
818
  )
819
  with col_btn:
820
  st.markdown("<br>", unsafe_allow_html=True)
821
+ if st.button("πŸ”", key=f"ocr_subtotal_{selected_file}",
822
+ type="primary" if is_ocr_active('summary', 'subtotal') else "secondary"):
823
+ activate_ocr_field('summary', 'subtotal')
824
 
825
+ # Tax Rate
826
  col_input, col_btn = st.columns([5, 1])
827
  with col_input:
828
+ summary['tax_rate'] = st.text_input(
829
+ "Tax Rate",
830
+ value=summary.get('tax_rate', ''),
831
+ key=f"tax_rate_{selected_file}"
832
  )
833
  with col_btn:
834
  st.markdown("<br>", unsafe_allow_html=True)
835
+ if st.button("πŸ”", key=f"ocr_tax_rate_{selected_file}",
836
+ type="primary" if is_ocr_active('summary', 'tax_rate') else "secondary"):
837
+ activate_ocr_field('summary', 'tax_rate')
838
 
839
+ # Tax Amount
840
  col_input, col_btn = st.columns([5, 1])
841
  with col_input:
842
+ summary['tax_amount'] = st.text_input(
843
+ "Tax Amount",
844
+ value=summary.get('tax_amount', ''),
845
+ key=f"tax_amount_{selected_file}"
846
  )
847
  with col_btn:
848
  st.markdown("<br>", unsafe_allow_html=True)
849
+ if st.button("πŸ”", key=f"ocr_tax_amount_{selected_file}",
850
+ type="primary" if is_ocr_active('summary', 'tax_amount') else "secondary"):
851
+ activate_ocr_field('summary', 'tax_amount')
852
 
853
+ # Total Amount
854
  col_input, col_btn = st.columns([5, 1])
855
  with col_input:
856
+ summary['total_amount'] = st.text_input(
857
+ "Total Amount",
858
+ value=summary.get('total_amount', ''),
859
+ key=f"total_amount_{selected_file}"
860
  )
861
  with col_btn:
862
  st.markdown("<br>", unsafe_allow_html=True)
863
+ if st.button("πŸ”", key=f"ocr_total_amount_{selected_file}",
864
+ type="primary" if is_ocr_active('summary', 'total_amount') else "secondary"):
865
+ activate_ocr_field('summary', 'total_amount')
866
 
867
+ # Currency
868
+ col_input, col_btn = st.columns([5, 1])
869
+ with col_input:
870
+ summary['currency'] = st.text_input(
871
+ "Currency",
872
+ value=summary.get('currency', ''),
873
+ key=f"currency_{selected_file}"
874
+ )
875
+ with col_btn:
876
+ st.markdown("<br>", unsafe_allow_html=True)
877
+ if st.button("πŸ”", key=f"ocr_currency_{selected_file}",
878
+ type="primary" if is_ocr_active('summary', 'currency') else "secondary"):
879
+ activate_ocr_field('summary', 'currency')
880
+
881
+ gt_parse['header'] = header
882
+ gt_parse['summary'] = summary
883
 
884
+ # TAB 2: Party Details (without bank details)
885
  with tab2:
886
+ # SWAP BUTTON
887
  col1, col2, col3 = st.columns([1, 2, 1])
888
  with col2:
889
+ if st.button("πŸ”„ Swap Sender ↔ Recipient", key=f"swap_btn_{selected_file}",
890
  type="primary", use_container_width=True):
891
  if not st.session_state.just_swapped:
892
  st.session_state.just_swapped = True
893
+ swap_sender_recipient_details(selected_file)
894
  st.rerun()
895
 
 
896
  if st.session_state.just_swapped:
897
  st.session_state.just_swapped = False
898
 
899
+ st.markdown("**Sender Details**")
900
+ header = gt_parse.get('header', {})
901
 
902
+ # Sender Name
903
  col_input, col_btn = st.columns([5, 1])
904
  with col_input:
905
+ header['sender_name'] = st.text_input(
906
+ "Sender Name",
907
+ value=header.get('sender_name', ''),
908
+ key=f"sender_name_{selected_file}"
909
  )
910
  with col_btn:
911
  st.markdown("<br>", unsafe_allow_html=True)
912
+ if st.button("πŸ”", key=f"ocr_sender_name_{selected_file}",
913
+ type="primary" if is_ocr_active('header', 'sender_name') else "secondary"):
914
+ activate_ocr_field('header', 'sender_name')
915
 
916
+ # Sender Address
917
  col_input, col_btn = st.columns([5, 1])
918
  with col_input:
919
+ header['sender_addr'] = st.text_area(
920
+ "Sender Address",
921
+ value=header.get('sender_addr', ''),
922
+ key=f"sender_addr_{selected_file}",
923
  height=60
924
  )
925
  with col_btn:
926
  st.markdown("<br>", unsafe_allow_html=True)
927
+ if st.button("πŸ”", key=f"ocr_sender_addr_{selected_file}",
928
+ type="primary" if is_ocr_active('header', 'sender_addr') else "secondary"):
929
+ activate_ocr_field('header', 'sender_addr')
930
 
931
+ st.markdown("**Recipient Details**")
 
 
 
 
 
 
 
 
 
 
 
 
 
932
 
933
+ # Recipient Name
934
  col_input, col_btn = st.columns([5, 1])
935
  with col_input:
936
+ header['rcpt_name'] = st.text_input(
937
+ "Recipient Name",
938
+ value=header.get('rcpt_name', ''),
939
+ key=f"rcpt_name_{selected_file}"
940
  )
941
  with col_btn:
942
  st.markdown("<br>", unsafe_allow_html=True)
943
+ if st.button("πŸ”", key=f"ocr_rcpt_name_{selected_file}",
944
+ type="primary" if is_ocr_active('header', 'rcpt_name') else "secondary"):
945
+ activate_ocr_field('header', 'rcpt_name')
946
 
947
+ # Recipient Address
948
  col_input, col_btn = st.columns([5, 1])
949
  with col_input:
950
+ header['rcpt_addr'] = st.text_area(
951
+ "Recipient Address",
952
+ value=header.get('rcpt_addr', ''),
953
+ key=f"rcpt_addr_{selected_file}",
954
  height=60
955
  )
956
  with col_btn:
957
  st.markdown("<br>", unsafe_allow_html=True)
958
+ if st.button("πŸ”", key=f"ocr_rcpt_addr_{selected_file}",
959
+ type="primary" if is_ocr_active('header', 'rcpt_addr') else "secondary"):
960
+ activate_ocr_field('header', 'rcpt_addr')
961
+
962
+ gt_parse['header'] = header
963
+
964
# TAB 3: Bank Details
# Renders one text input plus one OCR trigger button per bank-related header
# field, then writes the edited header dict back into gt_parse.
with tab3:
    header = gt_parse.get('header', {})

    # (header key, visible label) in display order. The header key is also
    # the stem of both widget keys, so widget identities stay the same as
    # when every field was written out by hand.
    bank_field_specs = (
        ('bank_iban', "Bank IBAN"),
        ('bank_name', "Bank Name"),
        ('bank_acc_no', "Bank Account No"),
        ('bank_routing', "Bank Routing"),
        ('bank_swift', "Bank SWIFT"),
        ('bank_acc_name', "Bank Account Name"),
        ('bank_branch', "Bank Branch"),
    )

    for bank_field, bank_label in bank_field_specs:
        col_input, col_btn = st.columns([5, 1])
        with col_input:
            header[bank_field] = st.text_input(
                bank_label,
                value=header.get(bank_field, ''),
                key=f"{bank_field}_{selected_file}"
            )
        with col_btn:
            # Spacer so the button lines up with the input box rather than
            # with the input's label.
            st.markdown("<br>", unsafe_allow_html=True)
            # The button is drawn as "primary" when is_ocr_active() reports
            # this field (presumably: it is the current OCR target).
            if st.button("🔍", key=f"ocr_{bank_field}_{selected_file}",
                         type="primary" if is_ocr_active('header', bank_field) else "secondary"):
                activate_ocr_field('header', bank_field)

    gt_parse['header'] = header
 
1068
+ # TAB 4: Items
1069
  with tab4:
1070
  current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
1071
+ items = current_gt_parse.get('items', [])
1072
 
1073
  # Add/Remove row buttons
1074
  col_add, col_remove = st.columns([1, 1])
1075
with col_add:
    if st.button("➕ Add New Item", key=f"add_item_{selected_file}", use_container_width=True):
        # button_clicked acts as a guard so one click is handled only once.
        if not st.session_state.button_clicked:
            st.session_state.button_clicked = True

            # Blank line item carrying every field the item editor renders.
            new_item = {
                "descriptions": "", "SKU": "", "quantity": "",
                "unit_price": "", "amount": "", "tax": "", "Line_total": ""
            }

            # Re-read the record from session state instead of relying on
            # values captured earlier in this script run.
            current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
            current_items = current_gt_parse.setdefault('items', [])
            current_items.append(new_item)
            st.session_state.edited_data[selected_file]['gt_parse'] = current_gt_parse
            st.session_state.modified_indices.add(selected_file)

            # Flag the new item's expander as open for the next run.
            new_idx = len(current_items) - 1
            expander_key_new = f"line_item_expander_{selected_file}_{new_idx}"
            st.session_state[expander_key_new] = True

            st.rerun()
 
1096
  with col_remove:
1097
+ if st.button("βž– Remove Last Item", key=f"remove_item_{selected_file}",
1098
+ disabled=(len(items) == 0), use_container_width=True):
1099
+ if not st.session_state.button_clicked and len(items) > 0:
1100
  st.session_state.button_clicked = True
1101
  current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
1102
+ current_items = current_gt_parse.get('items', [])
1103
+ N = len(current_items)
1104
+ current_items.pop()
1105
+ current_gt_parse['items'] = current_items
1106
  st.session_state.edited_data[selected_file]['gt_parse'] = current_gt_parse
1107
  st.session_state.modified_indices.add(selected_file)
1108
 
 
1109
  popped_idx = N - 1
1110
  expander_key_popped = f"line_item_expander_{selected_file}_{popped_idx}"
1111
  if expander_key_popped in st.session_state:
 
1116
  if st.session_state.button_clicked:
1117
  st.session_state.button_clicked = False
1118
 
 
1119
  current_gt_parse = st.session_state.edited_data[selected_file].get('gt_parse', {})
1120
+ items = current_gt_parse.get('items', [])
1121
 
1122
+ if items:
1123
# One expander per line item; inside it, one editable widget plus one OCR
# trigger button for each item field.
#
# (item key, visible label, widget-key stem) in display order. The stems are
# kept exactly as they were when each field was written out by hand, so the
# widget identities do not change.
item_field_specs = (
    ('descriptions', "Descriptions", 'desc'),
    ('SKU', "SKU", 'sku'),
    ('quantity', "Quantity", 'qty'),
    ('unit_price', "Unit Price", 'unit_price'),
    ('amount', "Amount", 'amount'),
    ('tax', "Tax", 'tax'),
    ('Line_total', "Line Total", 'line_total'),
)

for idx, item in enumerate(items):
    expander_key = f"line_item_expander_{selected_file}_{idx}"
    expanded_default = st.session_state.get(expander_key, False)

    # A stored description may be None (JSON null) or a non-string value;
    # slicing it directly would raise TypeError, so normalise it first.
    raw_desc = item.get('descriptions', 'N/A')
    title_desc = 'N/A' if raw_desc is None else str(raw_desc)[:30]

    with st.expander(f"**Item {idx + 1}** - {title_desc}", expanded=expanded_default):
        for item_field, item_label, item_key in item_field_specs:
            col_input, col_btn = st.columns([5, 1])
            with col_input:
                if item_field == 'descriptions':
                    # Descriptions is the only multi-line field.
                    item[item_field] = st.text_area(
                        item_label,
                        value=item.get(item_field, ''),
                        key=f"{item_key}_{selected_file}_{idx}",
                        height=60
                    )
                else:
                    item[item_field] = st.text_input(
                        item_label,
                        value=item.get(item_field, ''),
                        key=f"{item_key}_{selected_file}_{idx}"
                    )
            with col_btn:
                # Spacer so the button lines up with the input box.
                st.markdown("<br>", unsafe_allow_html=True)
                if st.button("🔍", key=f"ocr_{item_key}_{selected_file}_{idx}",
                             type="primary" if is_ocr_active('items', item_field, idx) else "secondary"):
                    # Record the expander as open before activating OCR so
                    # the state read at the top of this loop keeps it open.
                    st.session_state[expander_key] = True
                    activate_ocr_field('items', item_field, idx)
 
1234
+ current_gt_parse['items'] = items
 
1235
 
1236
# Heading for the read-only summary table of all items.
st.markdown("**📊 Items Summary Table**")
 
1238
+ df = pd.DataFrame(items)
1239
+ df.index = df.index + 1
 
1240
  df.index.name = 'SL No'
1241
 
1242
  st.dataframe(
 
1245
  height=300
1246
  )
1247
  else:
1248
# Empty-state hint; the quoted text matches the label of the add-item button.
st.info("No items. Click '➕ Add New Item' to add a new item.")
 
1250
  st.session_state.edited_data[selected_file]['gt_parse'] = gt_parse
1251