Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,6 +18,10 @@ from pydantic import BaseModel, Field, ValidationError, RootModel
|
|
| 18 |
from typing import List, Optional
|
| 19 |
|
| 20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
HF_API_KEY = os.getenv("HF_API_KEY")
|
| 22 |
|
| 23 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
|
@@ -71,7 +75,7 @@ def extract_text_from_cell(cell):
|
|
| 71 |
return lines # Return list of lines to preserve line breaks
|
| 72 |
|
| 73 |
def clean_spaces(text):
|
| 74 |
-
"""
|
| 75 |
Removes excessive spaces between Chinese characters while preserving spaces in English words.
|
| 76 |
Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
|
| 77 |
"""
|
|
@@ -268,6 +272,10 @@ def process_long_table(rows):
|
|
| 268 |
cells = row.findall('.//w:tc', NS)
|
| 269 |
running_index = 0
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
for cell in cells:
|
| 272 |
cell_text = " ".join(extract_text_from_cell(cell))
|
| 273 |
|
|
@@ -319,10 +327,16 @@ def process_long_table(rows):
|
|
| 319 |
filtered_table_data = []
|
| 320 |
for row in cleaned_table_data:
|
| 321 |
|
| 322 |
-
# Check if any cell contains "合计" (total)
|
|
|
|
| 323 |
contains_total = False
|
| 324 |
for key, value in row.items():
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
contains_total = True
|
| 327 |
break
|
| 328 |
|
|
@@ -332,7 +346,7 @@ def process_long_table(rows):
|
|
| 332 |
# Check potential serial number columns (use both Chinese and English variants)
|
| 333 |
serial_number = None
|
| 334 |
for column in row:
|
| 335 |
-
if any(term in column for term in ["序号"]):
|
| 336 |
serial_number = row[column]
|
| 337 |
break
|
| 338 |
|
|
@@ -351,6 +365,11 @@ def process_long_table(rows):
|
|
| 351 |
# If we couldn't find a serial number column, keep the row
|
| 352 |
filtered_table_data.append(row)
|
| 353 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
return filtered_table_data
|
| 355 |
|
| 356 |
def identify_table_type_and_header_row(rows):
|
|
@@ -416,6 +435,11 @@ def extract_tables(root):
|
|
| 416 |
if long_table_data:
|
| 417 |
table_data[f"long_table_{table_index}"] = long_table_data
|
| 418 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
|
| 420 |
return table_data, table_paragraphs
|
| 421 |
|
|
@@ -469,7 +493,7 @@ def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json
|
|
| 469 |
return json.dumps(extracted_data, ensure_ascii=False, indent=4)
|
| 470 |
|
| 471 |
|
| 472 |
-
def
|
| 473 |
"""Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
|
| 474 |
|
| 475 |
# Step 1: Convert JSON string to Python dictionary
|
|
@@ -498,7 +522,7 @@ def deepseek_extract_contract_summary(json_data, save_json=False, json_filename=
|
|
| 498 |
Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
|
| 499 |
|
| 500 |
{
|
| 501 |
-
"合同编号":
|
| 502 |
"接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
|
| 503 |
"Recipient":
|
| 504 |
"接收地": (注意:不是交货地点是目的港,只写中文,英文写在 place of receipt)
|
|
@@ -587,84 +611,11 @@ Contract data in JSON format:""" + f"""
|
|
| 587 |
|
| 588 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 589 |
|
| 590 |
-
def handle_weight_conversion_edge_case(transformed_data):
|
| 591 |
-
"""
|
| 592 |
-
Handles the edge case where converted weight is in '其他' field.
|
| 593 |
-
If found, replaces quantity and unit with the converted weight values.
|
| 594 |
-
Extracts unit from the bracket in the column header.
|
| 595 |
-
"""
|
| 596 |
-
for row in transformed_data:
|
| 597 |
-
if "其他" not in row or not isinstance(row["其他"], dict):
|
| 598 |
-
continue
|
| 599 |
-
|
| 600 |
-
other_fields = row["其他"]
|
| 601 |
-
|
| 602 |
-
# Look for weight conversion column with various possible names
|
| 603 |
-
weight_key = None
|
| 604 |
-
weight_patterns = [
|
| 605 |
-
r"换算重量(吨)",
|
| 606 |
-
r"converted weight(t)",
|
| 607 |
-
r"换算重量",
|
| 608 |
-
r"converted weight",
|
| 609 |
-
r"重量换算",
|
| 610 |
-
r"weight conversion"
|
| 611 |
-
]
|
| 612 |
-
|
| 613 |
-
for key in other_fields:
|
| 614 |
-
# Check if any pattern is contained within the key
|
| 615 |
-
if any(re.search(pattern, key, re.IGNORECASE) for pattern in weight_patterns):
|
| 616 |
-
weight_key = key
|
| 617 |
-
break
|
| 618 |
-
|
| 619 |
-
if weight_key and other_fields[weight_key]:
|
| 620 |
-
try:
|
| 621 |
-
# Try to convert to float to ensure it's a valid number
|
| 622 |
-
weight_value = float(other_fields[weight_key])
|
| 623 |
-
|
| 624 |
-
# Only replace if the weight value is valid
|
| 625 |
-
if weight_value > 0:
|
| 626 |
-
# Store original values in case we need to revert
|
| 627 |
-
original_quantity = row.get("数量", "")
|
| 628 |
-
original_unit = row.get("单位", "")
|
| 629 |
-
|
| 630 |
-
# Extract unit from the bracket in the column header
|
| 631 |
-
unit = "吨" # default unit
|
| 632 |
-
bracket_match = re.search(r'[((]([^))]+)[))]', weight_key)
|
| 633 |
-
if bracket_match:
|
| 634 |
-
unit = bracket_match.group(1).strip()
|
| 635 |
-
# Clean up the unit (remove any extra text)
|
| 636 |
-
unit = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', unit)
|
| 637 |
-
|
| 638 |
-
# Replace with converted weight
|
| 639 |
-
row["数量"] = str(weight_value)
|
| 640 |
-
row["单位"] = unit
|
| 641 |
-
|
| 642 |
-
# Log the conversion
|
| 643 |
-
print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")
|
| 644 |
-
|
| 645 |
-
# Remove the weight field from other_fields
|
| 646 |
-
del other_fields[weight_key]
|
| 647 |
-
except (ValueError, TypeError):
|
| 648 |
-
# If conversion fails, log and skip
|
| 649 |
-
print(f"Warning: Invalid weight value '{other_fields[weight_key]}' in row")
|
| 650 |
-
continue
|
| 651 |
-
|
| 652 |
-
return transformed_data
|
| 653 |
-
|
| 654 |
-
def handle_edge_cases(transformed_data):
|
| 655 |
-
"""
|
| 656 |
-
Main function to handle all edge cases in the transformed data.
|
| 657 |
-
Currently handles:
|
| 658 |
-
1. Weight conversion from '其他' field
|
| 659 |
-
"""
|
| 660 |
-
# Handle weight conversion edge case
|
| 661 |
-
transformed_data = handle_weight_conversion_edge_case(transformed_data)
|
| 662 |
-
|
| 663 |
-
return transformed_data
|
| 664 |
|
| 665 |
-
def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
|
| 666 |
"""
|
| 667 |
Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
|
|
|
|
| 668 |
"""
|
| 669 |
|
| 670 |
# If price_list is empty, return an empty list
|
|
@@ -718,10 +669,7 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
|
|
| 718 |
cleaned_headers.append(header.strip())
|
| 719 |
|
| 720 |
return cleaned_headers
|
| 721 |
-
|
| 722 |
-
# Apply the cleaning function to extracted headers
|
| 723 |
-
extracted_headers = clean_header_spaces(extracted_headers)
|
| 724 |
-
|
| 725 |
# Define our target fields from the Pydantic model
|
| 726 |
target_fields = [
|
| 727 |
"序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
|
|
@@ -732,9 +680,9 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
|
|
| 732 |
# Hardcoded mapping dictionary
|
| 733 |
hardcoded_mapping = {
|
| 734 |
# 序号 mappings
|
| 735 |
-
"序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号"],
|
| 736 |
# 名称 mappings
|
| 737 |
-
"名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
|
| 738 |
# 名称(英文) mappings
|
| 739 |
"名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
|
| 740 |
# 品牌 mappings
|
|
@@ -749,47 +697,83 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
|
|
| 749 |
# 单位 mappings
|
| 750 |
"���位": ["单位 unit", "单位unit", "unit", "单位"],
|
| 751 |
# 单价 mappings
|
| 752 |
-
"单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price",
|
| 753 |
-
"单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)"
|
|
|
|
|
|
|
| 754 |
# 总价 mappings
|
| 755 |
"总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
|
| 756 |
-
"总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)"
|
|
|
|
| 757 |
# 几郎单价 mappings
|
| 758 |
"几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
|
| 759 |
-
"几郎单价(元)", "单价(几郎)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)", "几郎单价 unit price(gnf)"],
|
| 760 |
# 几郎总价 mappings
|
| 761 |
"几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
|
| 762 |
-
"几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)", "几郎总价 amount(gnf)"],
|
| 763 |
# 备注 mappings
|
| 764 |
"备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
|
| 765 |
# 计划来源 mappings
|
| 766 |
"计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
|
| 767 |
-
"计划来源 planned source", "计划来源planned source", "planned source", "计划来源"]
|
| 768 |
}
|
| 769 |
|
| 770 |
-
# Try to map headers using hardcoded mapping
|
| 771 |
-
standard_field_mapping = {}
|
| 772 |
-
unmapped_headers = []
|
| 773 |
-
|
| 774 |
# Clean the extracted headers first
|
| 775 |
cleaned_extracted_headers = clean_header_spaces(extracted_headers)
|
| 776 |
-
|
| 777 |
# Clean all possible headers in the hardcoded mapping
|
| 778 |
cleaned_hardcoded_mapping = {
|
| 779 |
std_field: [clean_header_spaces([h])[0] for h in possible_headers]
|
| 780 |
for std_field, possible_headers in hardcoded_mapping.items()
|
| 781 |
}
|
| 782 |
|
| 783 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
print("-" * 50)
|
|
|
|
| 785 |
for header in cleaned_extracted_headers:
|
| 786 |
header_mapped = False
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 790 |
header_mapped = True
|
| 791 |
-
print(f"✅ {
|
| 792 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 793 |
if not header_mapped:
|
| 794 |
unmapped_headers.append(header)
|
| 795 |
print(f"❌ No match found for: {header}")
|
|
@@ -947,11 +931,43 @@ Do not force map 名称(英文) to 单价
|
|
| 947 |
# Find the last Chinese character position
|
| 948 |
last_chinese_pos = chinese_positions[-1]
|
| 949 |
|
| 950 |
-
#
|
| 951 |
-
|
| 952 |
|
| 953 |
-
#
|
| 954 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
|
| 956 |
# If English part doesn't actually contain English letters, treat it as empty
|
| 957 |
if not re.search(r'[a-zA-Z]', english_part):
|
|
@@ -984,7 +1000,7 @@ Do not force map 名称(英文) to 单价
|
|
| 984 |
new_row["名称"] = chinese
|
| 985 |
if english:
|
| 986 |
new_row["名称(英文)"] = english
|
| 987 |
-
print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
|
| 988 |
else:
|
| 989 |
# Just set the name directly
|
| 990 |
new_row["名称"] = str(value)
|
|
@@ -1003,17 +1019,21 @@ Do not force map 名称(英文) to 单价
|
|
| 1003 |
# Clean the header for comparison
|
| 1004 |
cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
|
| 1005 |
|
| 1006 |
-
# Check if this maps to a standard field
|
| 1007 |
matched_field = None
|
|
|
|
|
|
|
| 1008 |
for std_field, mapped_header in standard_field_mapping.items():
|
| 1009 |
# Skip if mapped_header is None
|
| 1010 |
if mapped_header is None:
|
| 1011 |
continue
|
| 1012 |
|
| 1013 |
-
#
|
| 1014 |
-
|
|
|
|
|
|
|
|
|
|
| 1015 |
matched_field = std_field
|
| 1016 |
-
break
|
| 1017 |
|
| 1018 |
# If we found a mapping, use it (but don't overwrite name fields)
|
| 1019 |
if matched_field:
|
|
@@ -1047,8 +1067,12 @@ Do not force map 名称(英文) to 单价
|
|
| 1047 |
json.dump(transformed_data, f, ensure_ascii=False, indent=4)
|
| 1048 |
print(f"✅ Saved to {json_name}")
|
| 1049 |
|
|
|
|
|
|
|
|
|
|
| 1050 |
return transformed_data
|
| 1051 |
|
|
|
|
| 1052 |
def json_to_excel(contract_summary, json_data, excel_path):
|
| 1053 |
"""Converts extracted JSON tables to an Excel file."""
|
| 1054 |
|
|
@@ -1073,24 +1097,144 @@ def find_price_list_table(extracted_data, min_matches=3):
|
|
| 1073 |
price_keywords = [
|
| 1074 |
"名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
|
| 1075 |
"单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
|
| 1076 |
-
"几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no"
|
|
|
|
| 1077 |
]
|
| 1078 |
-
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
for key, table in extracted_data.items()
|
| 1082 |
-
|
| 1083 |
-
|
| 1084 |
-
|
| 1085 |
-
|
| 1086 |
-
|
| 1087 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1088 |
match_count += 1
|
| 1089 |
-
|
| 1090 |
-
|
| 1091 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1092 |
|
| 1093 |
-
return best_table
|
| 1094 |
|
| 1095 |
#--- Extract PO ------------------------------
|
| 1096 |
|
|
@@ -1123,13 +1267,16 @@ def extract_po(docx_path):
|
|
| 1123 |
extracted_data_dict = json.loads(extracted_data)
|
| 1124 |
price_list_table = find_price_list_table(extracted_data_dict)
|
| 1125 |
|
| 1126 |
-
#
|
| 1127 |
if price_list_table:
|
| 1128 |
-
#
|
| 1129 |
-
for key in
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
|
|
|
|
|
|
|
|
|
|
| 1133 |
# Update the extracted_data string with proper formatting
|
| 1134 |
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1135 |
else:
|
|
@@ -1148,12 +1295,12 @@ def extract_po(docx_path):
|
|
| 1148 |
# Step 3: Process JSON with OpenAI to get structured output
|
| 1149 |
print("Processing Contract Summary data with AI...")
|
| 1150 |
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
| 1151 |
-
contract_summary =
|
| 1152 |
|
| 1153 |
# Process the price list
|
| 1154 |
print("Processing Price List data with AI...")
|
| 1155 |
price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
|
| 1156 |
-
price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
|
| 1157 |
|
| 1158 |
# Step 4: Combine contract summary and long table data into a single JSON object
|
| 1159 |
print("Combining AI Generated JSON with Extracted Data...")
|
|
@@ -1172,23 +1319,28 @@ def extract_po(docx_path):
|
|
| 1172 |
|
| 1173 |
# Example Usage
|
| 1174 |
|
| 1175 |
-
# extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
|
| 1176 |
-
#extract_po("UAT Contracts
|
| 1177 |
|
| 1178 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
| 1179 |
|
| 1180 |
# Gradio Interface ------------------------------
|
| 1181 |
|
| 1182 |
-
import gradio as gr
|
| 1183 |
-
from gradio.themes.base import Base
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1184 |
|
| 1185 |
-
interface = gr.Interface(
|
| 1186 |
-
fn=extract_po,
|
| 1187 |
-
title="PO Extractor 买卖合同数据提取",
|
| 1188 |
-
inputs=gr.File(label="买卖合同 (.docx)"),
|
| 1189 |
-
outputs=gr.Json(label="提取结果"),
|
| 1190 |
-
flagging_mode="never",
|
| 1191 |
-
theme=Base()
|
| 1192 |
-
)
|
| 1193 |
|
| 1194 |
-
interface.launch(show_error=True)
|
|
|
|
| 18 |
from typing import List, Optional
|
| 19 |
|
| 20 |
|
| 21 |
+
from fuzzywuzzy import fuzz
|
| 22 |
+
from fuzzywuzzy import process
|
| 23 |
+
|
| 24 |
+
|
| 25 |
HF_API_KEY = os.getenv("HF_API_KEY")
|
| 26 |
|
| 27 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
|
|
|
| 75 |
return lines # Return list of lines to preserve line breaks
|
| 76 |
|
| 77 |
def clean_spaces(text):
|
| 78 |
+
r"""
|
| 79 |
Removes excessive spaces between Chinese characters while preserving spaces in English words.
|
| 80 |
Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
|
| 81 |
"""
|
|
|
|
| 272 |
cells = row.findall('.//w:tc', NS)
|
| 273 |
running_index = 0
|
| 274 |
|
| 275 |
+
# Skip rows with only 1 or 2 columns (merged cells)
|
| 276 |
+
if len(cells) <= 2:
|
| 277 |
+
continue
|
| 278 |
+
|
| 279 |
for cell in cells:
|
| 280 |
cell_text = " ".join(extract_text_from_cell(cell))
|
| 281 |
|
|
|
|
| 327 |
filtered_table_data = []
|
| 328 |
for row in cleaned_table_data:
|
| 329 |
|
| 330 |
+
# Check if any cell contains "合计" (total), "折扣" (discount), or "明细见附件" (details in attachment)
|
| 331 |
+
# But exclude the remarks column from this check
|
| 332 |
contains_total = False
|
| 333 |
for key, value in row.items():
|
| 334 |
+
# Skip if this is a remarks column
|
| 335 |
+
key_lower = key.lower()
|
| 336 |
+
if any(term in key_lower for term in ["备注", "remarks", "note", "notes"]):
|
| 337 |
+
continue # Skip remarks column
|
| 338 |
+
|
| 339 |
+
if isinstance(value, str) and ("小计" in value or "总金额" in value or "合计" in value or "折扣" in value or "明细见附件" in value):
|
| 340 |
contains_total = True
|
| 341 |
break
|
| 342 |
|
|
|
|
| 346 |
# Check potential serial number columns (use both Chinese and English variants)
|
| 347 |
serial_number = None
|
| 348 |
for column in row:
|
| 349 |
+
if any(term in column.lower() for term in ["序号"]):
|
| 350 |
serial_number = row[column]
|
| 351 |
break
|
| 352 |
|
|
|
|
| 365 |
# If we couldn't find a serial number column, keep the row
|
| 366 |
filtered_table_data.append(row)
|
| 367 |
|
| 368 |
+
print(f"Table process_long_table output: {filtered_table_data}")
|
| 369 |
+
|
| 370 |
+
# Remove duplicate columns (ending with _2, _3, etc.)
|
| 371 |
+
filtered_table_data = merge_duplicate_columns(filtered_table_data)
|
| 372 |
+
|
| 373 |
return filtered_table_data
|
| 374 |
|
| 375 |
def identify_table_type_and_header_row(rows):
|
|
|
|
| 435 |
if long_table_data:
|
| 436 |
table_data[f"long_table_{table_index}"] = long_table_data
|
| 437 |
continue
|
| 438 |
+
|
| 439 |
+
# # Print the first row's cell texts for debugging
|
| 440 |
+
# header_cells = rows[0].findall('.//w:tc', NS)
|
| 441 |
+
# header_texts = ["|".join(extract_text_from_cell(cell)) for cell in header_cells]
|
| 442 |
+
# print(f"Table {table_index} header: {header_texts}")
|
| 443 |
|
| 444 |
return table_data, table_paragraphs
|
| 445 |
|
|
|
|
| 493 |
return json.dumps(extracted_data, ensure_ascii=False, indent=4)
|
| 494 |
|
| 495 |
|
| 496 |
+
def extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
|
| 497 |
"""Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
|
| 498 |
|
| 499 |
# Step 1: Convert JSON string to Python dictionary
|
|
|
|
| 522 |
Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
|
| 523 |
|
| 524 |
{
|
| 525 |
+
"合同编号": 如果合同编号出现多次,只需填一个,不要重复,优先填写有"-"的合同编号
|
| 526 |
"接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
|
| 527 |
"Recipient":
|
| 528 |
"接收地": (注意:不是交货地点是目的港,只写中文,英文写在 place of receipt)
|
|
|
|
| 611 |
|
| 612 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 613 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 614 |
|
| 615 |
+
def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
|
| 616 |
"""
|
| 617 |
Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
|
| 618 |
+
Set fuzzy=False to use direct string matching for mapping.
|
| 619 |
"""
|
| 620 |
|
| 621 |
# If price_list is empty, return an empty list
|
|
|
|
| 669 |
cleaned_headers.append(header.strip())
|
| 670 |
|
| 671 |
return cleaned_headers
|
| 672 |
+
|
|
|
|
|
|
|
|
|
|
| 673 |
# Define our target fields from the Pydantic model
|
| 674 |
target_fields = [
|
| 675 |
"序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
|
|
|
|
| 680 |
# Hardcoded mapping dictionary
|
| 681 |
hardcoded_mapping = {
|
| 682 |
# 序号 mappings
|
| 683 |
+
"序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号", "序号 no.:"],
|
| 684 |
# 名称 mappings
|
| 685 |
+
"名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name", "货描", "commodity",],
|
| 686 |
# 名称(英文) mappings
|
| 687 |
"名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
|
| 688 |
# 品牌 mappings
|
|
|
|
| 697 |
# 单位 mappings
|
| 698 |
"���位": ["单位 unit", "单位unit", "unit", "单位"],
|
| 699 |
# 单价 mappings
|
| 700 |
+
"单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price(cny)",
|
| 701 |
+
"单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)",
|
| 702 |
+
"价格 price", "价格price", "价格",
|
| 703 |
+
"美元单价"],
|
| 704 |
# 总价 mappings
|
| 705 |
"总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
|
| 706 |
+
"总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)",
|
| 707 |
+
"总额 total amount (cny)", "总额", "总额 total amount","美元总价"],
|
| 708 |
# 几郎单价 mappings
|
| 709 |
"几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
|
| 710 |
+
"几郎单价(元)", "单价(几郎)","单价 unit price (gnf)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)", "几郎单价 unit price(gnf)"],
|
| 711 |
# 几郎总价 mappings
|
| 712 |
"几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
|
| 713 |
+
"几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)", "几郎总价 amount(gnf)","总额 total amount (gnf)"],
|
| 714 |
# 备注 mappings
|
| 715 |
"备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
|
| 716 |
# 计划来源 mappings
|
| 717 |
"计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
|
| 718 |
+
"计划来源 planned source", "计划来源planned source", "planned source", "计划来源","计划号 plan no."]
|
| 719 |
}
|
| 720 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 721 |
# Clean the extracted headers first
|
| 722 |
cleaned_extracted_headers = clean_header_spaces(extracted_headers)
|
|
|
|
| 723 |
# Clean all possible headers in the hardcoded mapping
|
| 724 |
cleaned_hardcoded_mapping = {
|
| 725 |
std_field: [clean_header_spaces([h])[0] for h in possible_headers]
|
| 726 |
for std_field, possible_headers in hardcoded_mapping.items()
|
| 727 |
}
|
| 728 |
|
| 729 |
+
# Fuzzy matching function
|
| 730 |
+
def fuzzy_match_header(header, possible_headers, threshold=70):
|
| 731 |
+
if not possible_headers:
|
| 732 |
+
return None, 0
|
| 733 |
+
best_match = process.extractOne(header, possible_headers, scorer=fuzz.ratio)
|
| 734 |
+
if best_match and best_match[1] >= threshold:
|
| 735 |
+
return best_match[0], best_match[1]
|
| 736 |
+
else:
|
| 737 |
+
return None, 0
|
| 738 |
+
|
| 739 |
+
# Try to map headers using hardcoded mapping (fuzzy or direct)
|
| 740 |
+
standard_field_mapping = {}
|
| 741 |
+
unmapped_headers = []
|
| 742 |
+
|
| 743 |
+
if fuzzy:
|
| 744 |
+
print("\n🔍 Fuzzy Hardcoded Mapping Results:")
|
| 745 |
+
else:
|
| 746 |
+
print("\n🔍 Direct Hardcoded Mapping Results:")
|
| 747 |
+
|
| 748 |
print("-" * 50)
|
| 749 |
+
|
| 750 |
for header in cleaned_extracted_headers:
|
| 751 |
header_mapped = False
|
| 752 |
+
if fuzzy:
|
| 753 |
+
best_match_score = 0
|
| 754 |
+
best_match_field = None
|
| 755 |
+
best_match_header = None
|
| 756 |
+
for std_field, possible_headers in cleaned_hardcoded_mapping.items():
|
| 757 |
+
if std_field in standard_field_mapping:
|
| 758 |
+
continue
|
| 759 |
+
matched_header, score = fuzzy_match_header(header, possible_headers, threshold=70)
|
| 760 |
+
if matched_header and score > best_match_score:
|
| 761 |
+
best_match_score = score
|
| 762 |
+
best_match_field = std_field
|
| 763 |
+
best_match_header = matched_header
|
| 764 |
+
if best_match_field and best_match_score >= 70:
|
| 765 |
+
standard_field_mapping[best_match_field] = header
|
| 766 |
header_mapped = True
|
| 767 |
+
print(f"✅ {best_match_field} -> {header} (score: {best_match_score})")
|
| 768 |
+
else:
|
| 769 |
+
for std_field, possible_headers in cleaned_hardcoded_mapping.items():
|
| 770 |
+
if std_field in standard_field_mapping:
|
| 771 |
+
continue
|
| 772 |
+
if header in possible_headers:
|
| 773 |
+
standard_field_mapping[std_field] = header
|
| 774 |
+
header_mapped = True
|
| 775 |
+
print(f"✅ {std_field} -> {header}")
|
| 776 |
+
break
|
| 777 |
if not header_mapped:
|
| 778 |
unmapped_headers.append(header)
|
| 779 |
print(f"❌ No match found for: {header}")
|
|
|
|
| 931 |
# Find the last Chinese character position
|
| 932 |
last_chinese_pos = chinese_positions[-1]
|
| 933 |
|
| 934 |
+
# Look for the best split point that preserves brackets and punctuation
|
| 935 |
+
split_pos = last_chinese_pos + 1
|
| 936 |
|
| 937 |
+
# Check if there are brackets or parentheses that should be kept together
|
| 938 |
+
# Look ahead to see if there are closing brackets that belong to the Chinese part
|
| 939 |
+
remaining_text = text[split_pos:]
|
| 940 |
+
|
| 941 |
+
# If the remaining text starts with closing brackets/parentheses, include them in the Chinese part
|
| 942 |
+
# This handles both Chinese brackets () and English brackets () that belong to Chinese text
|
| 943 |
+
if remaining_text:
|
| 944 |
+
# Check for closing brackets that should stay with Chinese
|
| 945 |
+
# Use raw string to avoid escape sequence warning
|
| 946 |
+
closing_brackets = ')】」』》〉""''()]'
|
| 947 |
+
if remaining_text[0] in closing_brackets:
|
| 948 |
+
# Find how many closing brackets we have
|
| 949 |
+
bracket_count = 0
|
| 950 |
+
for char in remaining_text:
|
| 951 |
+
if char in closing_brackets:
|
| 952 |
+
bracket_count += 1
|
| 953 |
+
else:
|
| 954 |
+
break
|
| 955 |
+
split_pos += bracket_count
|
| 956 |
+
|
| 957 |
+
# Everything up to the split point is Chinese
|
| 958 |
+
chinese_part = text[:split_pos].strip()
|
| 959 |
+
|
| 960 |
+
# Everything after the split point is English
|
| 961 |
+
english_part = text[split_pos:].strip()
|
| 962 |
+
|
| 963 |
+
# Clean up the parts
|
| 964 |
+
# Remove any trailing Chinese punctuation from English part if it doesn't make sense
|
| 965 |
+
if english_part:
|
| 966 |
+
# If English part starts with Chinese punctuation that doesn't belong, move it to Chinese
|
| 967 |
+
chinese_punct_start = re.match(r'^[、,。;:!?]+', english_part)
|
| 968 |
+
if chinese_punct_start:
|
| 969 |
+
chinese_part += chinese_punct_start.group()
|
| 970 |
+
english_part = english_part[len(chinese_punct_start.group()):].strip()
|
| 971 |
|
| 972 |
# If English part doesn't actually contain English letters, treat it as empty
|
| 973 |
if not re.search(r'[a-zA-Z]', english_part):
|
|
|
|
| 1000 |
new_row["名称"] = chinese
|
| 1001 |
if english:
|
| 1002 |
new_row["名称(英文)"] = english
|
| 1003 |
+
# print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
|
| 1004 |
else:
|
| 1005 |
# Just set the name directly
|
| 1006 |
new_row["名称"] = str(value)
|
|
|
|
| 1019 |
# Clean the header for comparison
|
| 1020 |
cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
|
| 1021 |
|
| 1022 |
+
# Check if this maps to a standard field using fuzzy matching
|
| 1023 |
matched_field = None
|
| 1024 |
+
best_match_score = 0
|
| 1025 |
+
|
| 1026 |
for std_field, mapped_header in standard_field_mapping.items():
|
| 1027 |
# Skip if mapped_header is None
|
| 1028 |
if mapped_header is None:
|
| 1029 |
continue
|
| 1030 |
|
| 1031 |
+
# Use fuzzy matching for more flexible comparison
|
| 1032 |
+
score = fuzz.ratio(cleaned_header.lower().strip(), mapped_header.lower().strip())
|
| 1033 |
+
|
| 1034 |
+
if score > best_match_score and score >= 80: # High threshold for data processing
|
| 1035 |
+
best_match_score = score
|
| 1036 |
matched_field = std_field
|
|
|
|
| 1037 |
|
| 1038 |
# If we found a mapping, use it (but don't overwrite name fields)
|
| 1039 |
if matched_field:
|
|
|
|
| 1067 |
json.dump(transformed_data, f, ensure_ascii=False, indent=4)
|
| 1068 |
print(f"✅ Saved to {json_name}")
|
| 1069 |
|
| 1070 |
+
# Handle edge cases (including duplicate column merging) before returning
|
| 1071 |
+
transformed_data = handle_edge_cases(transformed_data)
|
| 1072 |
+
|
| 1073 |
return transformed_data
|
| 1074 |
|
| 1075 |
+
|
| 1076 |
def json_to_excel(contract_summary, json_data, excel_path):
|
| 1077 |
"""Converts extracted JSON tables to an Excel file."""
|
| 1078 |
|
|
|
|
| 1097 |
price_keywords = [
|
| 1098 |
"名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
|
| 1099 |
"单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
|
| 1100 |
+
"几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
|
| 1101 |
+
"货描", "commodity",
|
| 1102 |
]
|
| 1103 |
+
last_price_list_table = None
|
| 1104 |
+
|
| 1105 |
+
# Get all long tables and sort them by key to ensure we process them in order
|
| 1106 |
+
long_tables = [(key, table) for key, table in extracted_data.items()
|
| 1107 |
+
if "long_table" in key and isinstance(table, list) and table]
|
| 1108 |
+
long_tables.sort(key=lambda x: x[0]) # Sort by key to maintain order
|
| 1109 |
+
|
| 1110 |
+
for key, table in long_tables:
|
| 1111 |
+
|
| 1112 |
+
headers = list(table[0].keys())
|
| 1113 |
+
|
| 1114 |
+
match_count = 0
|
| 1115 |
+
for header in headers:
|
| 1116 |
+
header_lower = header.lower()
|
| 1117 |
+
# Use fuzzy matching for keyword detection
|
| 1118 |
+
for keyword in price_keywords:
|
| 1119 |
+
if fuzz.partial_ratio(header_lower, keyword.lower()) >= 70:
|
| 1120 |
match_count += 1
|
| 1121 |
+
break # Found a match for this header, move to next
|
| 1122 |
+
|
| 1123 |
+
if match_count >= min_matches:
|
| 1124 |
+
last_price_list_table = table # Keep the last table that meets criteria
|
| 1125 |
+
|
| 1126 |
+
return last_price_list_table
|
| 1127 |
+
|
| 1128 |
+
|
| 1129 |
+
#--- Handle Edge Cases ------------------------------
|
| 1130 |
+
|
| 1131 |
+
def handle_weight_conversion_edge_case(transformed_data):
    """
    Handle the edge case where a converted weight is stored in the '其他' field.

    If a weight-conversion column (e.g. "换算重量(吨)" / "converted weight(t)") is
    found inside a row's "其他" dict, the row's quantity ("数量") and unit ("单位")
    are replaced with the converted weight and the unit extracted from the
    brackets of the column header. The consumed weight field is then removed
    from "其他".

    Args:
        transformed_data: List of row dicts; each row may carry an optional
            "其他" dict of uncategorized columns.

    Returns:
        The same list, with matching rows updated in place.
    """
    # Literal column names to look for. re.escape is required: the original
    # unescaped patterns like r"换算重量(吨)" treated "(吨)" as a regex capture
    # group and could never match the actual bracketed header text.
    weight_names = [
        "换算重量(吨)",
        "converted weight(t)",
        "换算重量",
        "converted weight",
        "重量换算",
        "weight conversion",
    ]
    weight_patterns = [re.compile(re.escape(name), re.IGNORECASE) for name in weight_names]

    for row in transformed_data:
        if "其他" not in row or not isinstance(row["其他"], dict):
            continue

        other_fields = row["其他"]

        # Find the first key that contains any of the weight-conversion names.
        weight_key = None
        for key in other_fields:
            if any(pattern.search(key) for pattern in weight_patterns):
                weight_key = key
                break

        if weight_key and other_fields[weight_key]:
            try:
                # Strip thousands separators so values like "1,234.5" convert cleanly.
                weight_value = float(str(other_fields[weight_key]).replace(",", ""))

                # Only replace if the weight value is valid (positive).
                if weight_value > 0:
                    # Originals are kept only for the log message below.
                    original_quantity = row.get("数量", "")
                    original_unit = row.get("单位", "")

                    # Extract the unit from the (full- or half-width) bracket
                    # in the column header; default to metric tons.
                    unit = "吨"
                    bracket_match = re.search(r'[((]([^))]+)[))]', weight_key)
                    if bracket_match:
                        unit = bracket_match.group(1).strip()
                        # Keep only Latin letters / CJK characters in the unit.
                        unit = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', unit)

                    # Replace quantity/unit with the converted weight.
                    row["数量"] = str(weight_value)
                    row["单位"] = unit

                    print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")

                    # The weight has been promoted into 数量/单位; drop the raw field.
                    del other_fields[weight_key]
            except (ValueError, TypeError):
                # Non-numeric weight: log and leave the row untouched.
                print(f"Warning: Invalid weight value '{other_fields[weight_key]}' in row")
                continue

    return transformed_data
|
| 1194 |
+
|
| 1195 |
+
def handle_edge_cases(transformed_data):
    """
    Apply all known edge-case fixes to the transformed table data.

    The pipeline currently:
      1. Promotes a converted weight found in the '其他' field into 数量/单位.
      2. Drops duplicate columns created by header column spanning.
    """
    # Run each fixer in order; every fixer returns the (possibly mutated) data.
    for fixer in (handle_weight_conversion_edge_case, merge_duplicate_columns):
        transformed_data = fixer(transformed_data)
    return transformed_data
|
| 1209 |
+
|
| 1210 |
+
def merge_duplicate_columns(transformed_data):
    """
    Remove duplicate columns created by column spanning in table headers.

    Columns whose names end in a ``_<number>`` suffix (``_2``, ``_3``, ...) are
    treated as duplicates and deleted from every row.
    """
    if not transformed_data:
        return transformed_data

    # Collect every column name carrying a numeric-suffix marker across all rows.
    suffix_pattern = re.compile(r'^.+_\d+$')
    duplicate_columns = {
        column
        for row in transformed_data
        for column in row
        if suffix_pattern.match(column)
    }

    if duplicate_columns:
        print(f"🗑️ Removing duplicate columns: {sorted(duplicate_columns)}")
        # Strip the duplicates in place so row identity is preserved.
        for row in transformed_data:
            for dup_col in duplicate_columns:
                row.pop(dup_col, None)

    return transformed_data
|
| 1237 |
|
|
|
|
| 1238 |
|
| 1239 |
#--- Extract PO ------------------------------
|
| 1240 |
|
|
|
|
| 1267 |
extracted_data_dict = json.loads(extracted_data)
|
| 1268 |
price_list_table = find_price_list_table(extracted_data_dict)
|
| 1269 |
|
| 1270 |
+
# Add the combined price list table to the extracted data
|
| 1271 |
if price_list_table:
|
| 1272 |
+
# Remove all long_table keys that were used to create the price list
|
| 1273 |
+
keys_to_remove = [key for key in extracted_data_dict.keys() if "long_table" in key]
|
| 1274 |
+
for key in keys_to_remove:
|
| 1275 |
+
del extracted_data_dict[key]
|
| 1276 |
+
|
| 1277 |
+
# Add the combined price list table
|
| 1278 |
+
extracted_data_dict["price_list"] = price_list_table
|
| 1279 |
+
|
| 1280 |
# Update the extracted_data string with proper formatting
|
| 1281 |
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1282 |
else:
|
|
|
|
| 1295 |
# Step 3: Process JSON with OpenAI to get structured output
|
| 1296 |
print("Processing Contract Summary data with AI...")
|
| 1297 |
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
| 1298 |
+
contract_summary = extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
|
| 1299 |
|
| 1300 |
# Process the price list
|
| 1301 |
print("Processing Price List data with AI...")
|
| 1302 |
price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
|
| 1303 |
+
price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename, fuzzy=True)
|
| 1304 |
|
| 1305 |
# Step 4: Combine contract summary and long table data into a single JSON object
|
| 1306 |
print("Combining AI Generated JSON with Extracted Data...")
|
|
|
|
| 1319 |
|
| 1320 |
# Example Usage
|
| 1321 |
|
| 1322 |
+
# print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
|
| 1323 |
+
# print(extract_po(r"UAT Contracts\修改后合同\GN-CGS202410-AMC-169BJ 柳工设备配件采购合同-广西柳工.docx"))
|
| 1324 |
|
| 1325 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
| 1326 |
|
| 1327 |
# Gradio Interface ------------------------------
|
| 1328 |
|
| 1329 |
+
# import gradio as gr
|
| 1330 |
+
# from gradio.themes.base import Base
|
| 1331 |
+
|
| 1332 |
+
# interface = gr.Interface(
|
| 1333 |
+
# fn=extract_po,
|
| 1334 |
+
# title="PO Extractor 买卖合同数据提取",
|
| 1335 |
+
# inputs=gr.File(label="买卖合同 (.docx)"),
|
| 1336 |
+
# outputs=gr.Json(label="提取结果"),
|
| 1337 |
+
# flagging_mode="never",
|
| 1338 |
+
# theme=Base()
|
| 1339 |
+
# )
|
| 1340 |
+
|
| 1341 |
+
# interface.launch(show_error=True)
|
| 1342 |
+
|
| 1343 |
+
|
| 1344 |
+
|
| 1345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1346 |
|
|
|