MikeMai committed on
Commit
dd620cd
·
verified ·
1 Parent(s): 73fb567

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -1
app.py CHANGED
@@ -602,6 +602,80 @@ Contract data in JSON format:""" + f"""
602
 
603
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
604
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
 
606
  def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
607
  """
@@ -971,6 +1045,9 @@ Do not force map 名称(英文) to 单价
971
 
972
  # Save to file if requested
973
  if save_json and transformed_data:
 
 
 
974
  with open(json_name, "w", encoding="utf-8") as f:
975
  json.dump(transformed_data, f, ensure_ascii=False, indent=4)
976
  print(f"✅ Saved to {json_name}")
@@ -1090,7 +1167,7 @@ def extract_po(docx_path):
1090
 
1091
  # Example Usage
1092
 
1093
- # extract_po("test-contract-converted.docx")
1094
  # extract_po("EPC简明合同格式-中英对照版.docx")
1095
 
1096
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
 
602
 
603
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
604
 
605
def handle_weight_conversion_edge_case(transformed_data):
    """
    Promote a converted-weight column found in a row's '其他' dict.

    For every row whose '其他' value is a dict containing a key that looks
    like a converted-weight column header, the row's "数量" is replaced by
    the weight (as ``str(float(value))``), "单位" is replaced by the unit
    taken from the brackets in that header, and the key is removed from
    '其他'. Rows are modified in place.

    Args:
        transformed_data: Iterable of row dicts. ``None`` is treated as
            "no rows".

    Returns:
        The same ``transformed_data`` object that was passed in.
    """
    # Header variants are matched as plain text. The bracketed variants
    # ("换算重量(吨)", "converted weight(t)") are covered by their bare
    # prefixes, so no bracket needs to appear in (or be escaped for) the regex.
    weight_header = re.compile(
        r"换算重量|重量换算|converted weight|weight conversion",
        re.IGNORECASE,
    )
    # Accept both ASCII "(...)" and full-width "(...)" brackets in the header.
    bracket = re.compile(r"[((]([^))]+)[))]")
    default_unit = "吨"

    for row in transformed_data or []:
        if not isinstance(row, dict):
            continue

        other_fields = row.get("其他")
        if not isinstance(other_fields, dict):
            continue

        # First key (in dict order) that looks like a weight-conversion column.
        weight_key = next(
            (
                key
                for key in other_fields
                if isinstance(key, str) and weight_header.search(key)
            ),
            None,
        )
        if weight_key is None:
            continue

        raw_value = other_fields[weight_key]
        if not raw_value:
            # Empty / zero / None cell: nothing to convert.
            continue

        try:
            weight_value = float(raw_value)
        except (ValueError, TypeError, OverflowError):
            print(f"Warning: Invalid weight value '{raw_value}' in row")
            continue

        # Only positive, finite weights are usable: NaN fails both
        # comparisons and infinity fails the upper bound.
        if not 0 < weight_value < float("inf"):
            continue

        # Kept only for the log line below.
        original_quantity = row.get("数量", "")
        original_unit = row.get("单位", "")

        unit = default_unit
        bracket_match = bracket.search(weight_key)
        if bracket_match:
            # Keep only Latin letters and CJK ideographs from the bracket text.
            cleaned = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', bracket_match.group(1))
            # A bracket such as "(1)" carries no unit; keep the default
            # instead of writing an empty string.
            if cleaned:
                unit = cleaned

        row["数量"] = str(weight_value)
        row["单位"] = unit

        print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")

        # The value now lives in 数量/单位, so drop it from the leftovers.
        del other_fields[weight_key]

    return transformed_data
668
+
669
def handle_edge_cases(transformed_data):
    """
    Run every edge-case fix-up over the transformed data and return the result.

    Fix-ups applied, in order:
    1. Weight conversion from the '其他' field
    """
    # Each fix-up takes the data and returns it; chain them in order.
    fixups = (handle_weight_conversion_edge_case,)
    for apply_fixup in fixups:
        transformed_data = apply_fixup(transformed_data)
    return transformed_data
679
 
680
  def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
681
  """
 
1045
 
1046
  # Save to file if requested
1047
  if save_json and transformed_data:
1048
+ # Handle edge cases before saving
1049
+ transformed_data = handle_edge_cases(transformed_data)
1050
+
1051
  with open(json_name, "w", encoding="utf-8") as f:
1052
  json.dump(transformed_data, f, ensure_ascii=False, indent=4)
1053
  print(f"✅ Saved to {json_name}")
 
1167
 
1168
  # Example Usage
1169
 
1170
+ # extract_po("GN-WARJS2504-282GC烟台嘉益钢材采购合同(1).docx")
1171
  # extract_po("EPC简明合同格式-中英对照版.docx")
1172
 
1173
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))