Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -602,6 +602,80 @@ Contract data in JSON format:""" + f"""
|
|
| 602 |
|
| 603 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 604 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
|
| 606 |
def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
|
| 607 |
"""
|
|
@@ -971,6 +1045,9 @@ Do not force map 名称(英文) to 单价
|
|
| 971 |
|
| 972 |
# Save to file if requested
|
| 973 |
if save_json and transformed_data:
|
|
|
|
|
|
|
|
|
|
| 974 |
with open(json_name, "w", encoding="utf-8") as f:
|
| 975 |
json.dump(transformed_data, f, ensure_ascii=False, indent=4)
|
| 976 |
print(f"✅ Saved to {json_name}")
|
|
@@ -1090,7 +1167,7 @@ def extract_po(docx_path):
|
|
| 1090 |
|
| 1091 |
# Example Usage
|
| 1092 |
|
| 1093 |
-
# extract_po("
|
| 1094 |
# extract_po("EPC简明合同格式-中英对照版.docx")
|
| 1095 |
|
| 1096 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
|
|
|
| 602 |
|
| 603 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 604 |
|
| 605 |
+
def handle_weight_conversion_edge_case(transformed_data):
    """
    Promote a converted-weight column found in a row's '其他' dict.

    For every row whose '其他' value is a dict, the first key containing one
    of the known converted-weight markers (case-insensitive) is looked up.
    When its value parses to a finite number greater than zero, the row's
    "数量" is replaced by that number (as ``str(float)``), "单位" is replaced
    by the unit named in the bracket of the column header (default "吨"),
    and the weight key is removed from '其他'.

    Rows are modified in place; each successful conversion and each
    unparsable weight value is reported with ``print``.

    Args:
        transformed_data: Iterable of row dicts.

    Returns:
        The same ``transformed_data`` object that was passed in.
    """
    default_unit = "吨"

    # Header fragments that identify a converted-weight column. They are
    # escaped so they are matched literally; variants carrying a bracketed
    # unit (e.g. "换算重量(吨)") already contain one of these fragments.
    weight_markers = ("换算重量", "converted weight", "重量换算", "weight conversion")
    weight_key_re = re.compile(
        "|".join(re.escape(marker) for marker in weight_markers),
        re.IGNORECASE,
    )

    # Unit inside the header bracket; accepts ASCII and full-width brackets.
    bracket_re = re.compile(r"[(（]([^)）]+)[)）]")

    for row in transformed_data:
        other_fields = row.get("其他")
        if not isinstance(other_fields, dict):
            continue

        # First key that looks like a converted-weight column, if any.
        weight_key = next(
            (key for key in other_fields if weight_key_re.search(key)),
            None,
        )
        if not weight_key:
            continue

        raw_value = other_fields[weight_key]
        if not raw_value:
            continue

        try:
            weight_value = float(raw_value)
        except (ValueError, TypeError, OverflowError):
            # Best effort: report the bad cell and leave the row untouched.
            print(f"Warning: Invalid weight value '{raw_value}' in row")
            continue

        # Rejects zero, negatives, NaN (all comparisons false) and infinity.
        if not 0 < weight_value < float("inf"):
            continue

        # Kept only for the log line below.
        original_quantity = row.get("数量", "")
        original_unit = row.get("单位", "")

        unit = default_unit
        bracket_match = bracket_re.search(weight_key)
        if bracket_match:
            # Keep only ASCII letters and CJK ideographs; if nothing is left
            # (e.g. a bracket holding just digits), keep the default unit
            # instead of writing an empty one.
            cleaned = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', bracket_match.group(1).strip())
            unit = cleaned or default_unit

        row["数量"] = str(weight_value)
        row["单位"] = unit

        print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")

        # The value now lives in 数量/单位, so drop the duplicate.
        del other_fields[weight_key]

    return transformed_data
|
| 668 |
+
|
| 669 |
+
def handle_edge_cases(transformed_data):
    """
    Run every registered edge-case fix over the transformed data.

    Fixes are applied in order, each receiving the result of the previous
    one. The pipeline currently contains:
    1. Weight conversion from the '其他' field

    Returns whatever the last fix returns.
    """
    # Ordered pipeline of edge-case handlers; append new handlers here.
    edge_case_fixes = (
        handle_weight_conversion_edge_case,
    )

    for apply_fix in edge_case_fixes:
        transformed_data = apply_fix(transformed_data)

    return transformed_data
|
| 679 |
|
| 680 |
def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
|
| 681 |
"""
|
|
|
|
| 1045 |
|
| 1046 |
# Save to file if requested
|
| 1047 |
if save_json and transformed_data:
|
| 1048 |
+
# Handle edge cases before saving
|
| 1049 |
+
transformed_data = handle_edge_cases(transformed_data)
|
| 1050 |
+
|
| 1051 |
with open(json_name, "w", encoding="utf-8") as f:
|
| 1052 |
json.dump(transformed_data, f, ensure_ascii=False, indent=4)
|
| 1053 |
print(f"✅ Saved to {json_name}")
|
|
|
|
| 1167 |
|
| 1168 |
# Example Usage
|
| 1169 |
|
| 1170 |
+
# extract_po("GN-WARJS2504-282GC烟台嘉益钢材采购合同(1).docx")
|
| 1171 |
# extract_po("EPC简明合同格式-中英对照版.docx")
|
| 1172 |
|
| 1173 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|