MikeMai commited on
Commit
2720901
·
verified ·
1 Parent(s): a4dc2f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +301 -149
app.py CHANGED
@@ -18,6 +18,10 @@ from pydantic import BaseModel, Field, ValidationError, RootModel
18
  from typing import List, Optional
19
 
20
 
 
 
 
 
21
  HF_API_KEY = os.getenv("HF_API_KEY")
22
 
23
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
@@ -71,7 +75,7 @@ def extract_text_from_cell(cell):
71
  return lines # Return list of lines to preserve line breaks
72
 
73
  def clean_spaces(text):
74
- """
75
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
76
  Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
77
  """
@@ -268,6 +272,10 @@ def process_long_table(rows):
268
  cells = row.findall('.//w:tc', NS)
269
  running_index = 0
270
 
 
 
 
 
271
  for cell in cells:
272
  cell_text = " ".join(extract_text_from_cell(cell))
273
 
@@ -319,10 +327,16 @@ def process_long_table(rows):
319
  filtered_table_data = []
320
  for row in cleaned_table_data:
321
 
322
- # Check if any cell contains "合计" (total) or "折扣" (discount)
 
323
  contains_total = False
324
  for key, value in row.items():
325
- if isinstance(value, str) and ("合计" in value or "折扣" in value):
 
 
 
 
 
326
  contains_total = True
327
  break
328
 
@@ -332,7 +346,7 @@ def process_long_table(rows):
332
  # Check potential serial number columns (use both Chinese and English variants)
333
  serial_number = None
334
  for column in row:
335
- if any(term in column for term in ["序号"]):
336
  serial_number = row[column]
337
  break
338
 
@@ -351,6 +365,11 @@ def process_long_table(rows):
351
  # If we couldn't find a serial number column, keep the row
352
  filtered_table_data.append(row)
353
 
 
 
 
 
 
354
  return filtered_table_data
355
 
356
  def identify_table_type_and_header_row(rows):
@@ -416,6 +435,11 @@ def extract_tables(root):
416
  if long_table_data:
417
  table_data[f"long_table_{table_index}"] = long_table_data
418
  continue
 
 
 
 
 
419
 
420
  return table_data, table_paragraphs
421
 
@@ -469,7 +493,7 @@ def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json
469
  return json.dumps(extracted_data, ensure_ascii=False, indent=4)
470
 
471
 
472
- def deepseek_extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
473
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
474
 
475
  # Step 1: Convert JSON string to Python dictionary
@@ -498,7 +522,7 @@ def deepseek_extract_contract_summary(json_data, save_json=False, json_filename=
498
  Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
499
 
500
  {
501
- "合同编号":
502
  "接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
503
  "Recipient":
504
  "接收地": (注意:不是交货地点是目的港,只写中文,英文写在 place of receipt)
@@ -587,84 +611,11 @@ Contract data in JSON format:""" + f"""
587
 
588
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
589
 
590
- def handle_weight_conversion_edge_case(transformed_data):
591
- """
592
- Handles the edge case where converted weight is in '其他' field.
593
- If found, replaces quantity and unit with the converted weight values.
594
- Extracts unit from the bracket in the column header.
595
- """
596
- for row in transformed_data:
597
- if "其他" not in row or not isinstance(row["其他"], dict):
598
- continue
599
-
600
- other_fields = row["其他"]
601
-
602
- # Look for weight conversion column with various possible names
603
- weight_key = None
604
- weight_patterns = [
605
- r"换算重量(吨)",
606
- r"converted weight(t)",
607
- r"换算重量",
608
- r"converted weight",
609
- r"重量换算",
610
- r"weight conversion"
611
- ]
612
-
613
- for key in other_fields:
614
- # Check if any pattern is contained within the key
615
- if any(re.search(pattern, key, re.IGNORECASE) for pattern in weight_patterns):
616
- weight_key = key
617
- break
618
-
619
- if weight_key and other_fields[weight_key]:
620
- try:
621
- # Try to convert to float to ensure it's a valid number
622
- weight_value = float(other_fields[weight_key])
623
-
624
- # Only replace if the weight value is valid
625
- if weight_value > 0:
626
- # Store original values in case we need to revert
627
- original_quantity = row.get("数量", "")
628
- original_unit = row.get("单位", "")
629
-
630
- # Extract unit from the bracket in the column header
631
- unit = "吨" # default unit
632
- bracket_match = re.search(r'[((]([^))]+)[))]', weight_key)
633
- if bracket_match:
634
- unit = bracket_match.group(1).strip()
635
- # Clean up the unit (remove any extra text)
636
- unit = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', unit)
637
-
638
- # Replace with converted weight
639
- row["数量"] = str(weight_value)
640
- row["单位"] = unit
641
-
642
- # Log the conversion
643
- print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")
644
-
645
- # Remove the weight field from other_fields
646
- del other_fields[weight_key]
647
- except (ValueError, TypeError):
648
- # If conversion fails, log and skip
649
- print(f"Warning: Invalid weight value '{other_fields[weight_key]}' in row")
650
- continue
651
-
652
- return transformed_data
653
-
654
- def handle_edge_cases(transformed_data):
655
- """
656
- Main function to handle all edge cases in the transformed data.
657
- Currently handles:
658
- 1. Weight conversion from '其他' field
659
- """
660
- # Handle weight conversion edge case
661
- transformed_data = handle_weight_conversion_edge_case(transformed_data)
662
-
663
- return transformed_data
664
 
665
- def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
666
  """
667
  Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
 
668
  """
669
 
670
  # If price_list is empty, return an empty list
@@ -718,10 +669,7 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
718
  cleaned_headers.append(header.strip())
719
 
720
  return cleaned_headers
721
-
722
- # Apply the cleaning function to extracted headers
723
- extracted_headers = clean_header_spaces(extracted_headers)
724
-
725
  # Define our target fields from the Pydantic model
726
  target_fields = [
727
  "序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
@@ -732,9 +680,9 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
732
  # Hardcoded mapping dictionary
733
  hardcoded_mapping = {
734
  # 序号 mappings
735
- "序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号"],
736
  # 名称 mappings
737
- "名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
738
  # 名称(英文) mappings
739
  "名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
740
  # 品牌 mappings
@@ -749,47 +697,83 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
749
  # 单位 mappings
750
  "���位": ["单位 unit", "单位unit", "unit", "单位"],
751
  # 单价 mappings
752
- "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price",
753
- "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)"],
 
 
754
  # 总价 mappings
755
  "总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
756
- "总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)"],
 
757
  # 几郎单价 mappings
758
  "几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
759
- "几郎单价(元)", "单价(几郎)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)", "几郎单价 unit price(gnf)"],
760
  # 几郎总价 mappings
761
  "几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
762
- "几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)", "几郎总价 amount(gnf)"],
763
  # 备注 mappings
764
  "备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
765
  # 计划来源 mappings
766
  "计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
767
- "计划来源 planned source", "计划来源planned source", "planned source", "计划来源"]
768
  }
769
 
770
- # Try to map headers using hardcoded mapping
771
- standard_field_mapping = {}
772
- unmapped_headers = []
773
-
774
  # Clean the extracted headers first
775
  cleaned_extracted_headers = clean_header_spaces(extracted_headers)
776
-
777
  # Clean all possible headers in the hardcoded mapping
778
  cleaned_hardcoded_mapping = {
779
  std_field: [clean_header_spaces([h])[0] for h in possible_headers]
780
  for std_field, possible_headers in hardcoded_mapping.items()
781
  }
782
 
783
- print("\n🔍 Hardcoded Mapping Results:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
  print("-" * 50)
 
785
  for header in cleaned_extracted_headers:
786
  header_mapped = False
787
- for std_field, possible_headers in cleaned_hardcoded_mapping.items():
788
- if header in possible_headers:
789
- standard_field_mapping[std_field] = header
 
 
 
 
 
 
 
 
 
 
 
790
  header_mapped = True
791
- print(f"✅ {std_field} -> {header}")
792
- break
 
 
 
 
 
 
 
 
793
  if not header_mapped:
794
  unmapped_headers.append(header)
795
  print(f"❌ No match found for: {header}")
@@ -947,11 +931,43 @@ Do not force map 名称(英文) to 单价
947
  # Find the last Chinese character position
948
  last_chinese_pos = chinese_positions[-1]
949
 
950
- # Everything up to and including the last Chinese character is Chinese
951
- chinese_part = text[:last_chinese_pos + 1].strip()
952
 
953
- # Everything after the last Chinese character is English
954
- english_part = text[last_chinese_pos + 1:].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
 
956
  # If English part doesn't actually contain English letters, treat it as empty
957
  if not re.search(r'[a-zA-Z]', english_part):
@@ -984,7 +1000,7 @@ Do not force map 名称(英文) to 单价
984
  new_row["名称"] = chinese
985
  if english:
986
  new_row["名称(英文)"] = english
987
- print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
988
  else:
989
  # Just set the name directly
990
  new_row["名称"] = str(value)
@@ -1003,17 +1019,21 @@ Do not force map 名称(英文) to 单价
1003
  # Clean the header for comparison
1004
  cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
1005
 
1006
- # Check if this maps to a standard field
1007
  matched_field = None
 
 
1008
  for std_field, mapped_header in standard_field_mapping.items():
1009
  # Skip if mapped_header is None
1010
  if mapped_header is None:
1011
  continue
1012
 
1013
- # Make comparison more flexible by lowercasing and stripping spaces
1014
- if mapped_header.lower().strip() == cleaned_header.lower().strip():
 
 
 
1015
  matched_field = std_field
1016
- break
1017
 
1018
  # If we found a mapping, use it (but don't overwrite name fields)
1019
  if matched_field:
@@ -1047,8 +1067,12 @@ Do not force map 名称(英文) to 单价
1047
  json.dump(transformed_data, f, ensure_ascii=False, indent=4)
1048
  print(f"✅ Saved to {json_name}")
1049
 
 
 
 
1050
  return transformed_data
1051
 
 
1052
  def json_to_excel(contract_summary, json_data, excel_path):
1053
  """Converts extracted JSON tables to an Excel file."""
1054
 
@@ -1073,24 +1097,144 @@ def find_price_list_table(extracted_data, min_matches=3):
1073
  price_keywords = [
1074
  "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
1075
  "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
1076
- "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no"
 
1077
  ]
1078
- best_table = None
1079
- best_match_count = 0
1080
-
1081
- for key, table in extracted_data.items():
1082
- if "long_table" in key and isinstance(table, list) and table:
1083
- headers = list(table[0].keys())
1084
- match_count = 0
1085
- for header in headers:
1086
- header_lower = header.lower()
1087
- if any(kw in header_lower for kw in price_keywords):
 
 
 
 
 
 
 
1088
  match_count += 1
1089
- if match_count > best_match_count and match_count >= min_matches:
1090
- best_match_count = match_count
1091
- best_table = table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
 
1093
- return best_table
1094
 
1095
  #--- Extract PO ------------------------------
1096
 
@@ -1123,13 +1267,16 @@ def extract_po(docx_path):
1123
  extracted_data_dict = json.loads(extracted_data)
1124
  price_list_table = find_price_list_table(extracted_data_dict)
1125
 
1126
- # Rename the price list table key
1127
  if price_list_table:
1128
- # Find and rename the key containing the price list table
1129
- for key in list(extracted_data_dict.keys()):
1130
- if "long_table" in key and extracted_data_dict[key] == price_list_table:
1131
- extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
1132
- break
 
 
 
1133
  # Update the extracted_data string with proper formatting
1134
  extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1135
  else:
@@ -1148,12 +1295,12 @@ def extract_po(docx_path):
1148
  # Step 3: Process JSON with OpenAI to get structured output
1149
  print("Processing Contract Summary data with AI...")
1150
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1151
- contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1152
 
1153
  # Process the price list
1154
  print("Processing Price List data with AI...")
1155
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1156
- price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
1157
 
1158
  # Step 4: Combine contract summary and long table data into a single JSON object
1159
  print("Combining AI Generated JSON with Extracted Data...")
@@ -1172,23 +1319,28 @@ def extract_po(docx_path):
1172
 
1173
  # Example Usage
1174
 
1175
- # extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
1176
- #extract_po("UAT Contracts\GN-WCIE2025-WCSP-276BJ-稳定土拌合机配件-合同.docx")
1177
 
1178
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1179
 
1180
  # Gradio Interface ------------------------------
1181
 
1182
- import gradio as gr
1183
- from gradio.themes.base import Base
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1184
 
1185
- interface = gr.Interface(
1186
- fn=extract_po,
1187
- title="PO Extractor 买卖合同数据提取",
1188
- inputs=gr.File(label="买卖合同 (.docx)"),
1189
- outputs=gr.Json(label="提取结果"),
1190
- flagging_mode="never",
1191
- theme=Base()
1192
- )
1193
 
1194
- interface.launch(show_error=True)
 
18
  from typing import List, Optional
19
 
20
 
21
+ from fuzzywuzzy import fuzz
22
+ from fuzzywuzzy import process
23
+
24
+
25
  HF_API_KEY = os.getenv("HF_API_KEY")
26
 
27
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
 
75
  return lines # Return list of lines to preserve line breaks
76
 
77
  def clean_spaces(text):
78
+ r"""
79
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
80
  Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
81
  """
 
272
  cells = row.findall('.//w:tc', NS)
273
  running_index = 0
274
 
275
+ # Skip rows with only 1 or 2 columns (merged cells)
276
+ if len(cells) <= 2:
277
+ continue
278
+
279
  for cell in cells:
280
  cell_text = " ".join(extract_text_from_cell(cell))
281
 
 
327
  filtered_table_data = []
328
  for row in cleaned_table_data:
329
 
330
+ # Check if any cell contains "合计" (total), "折扣" (discount), or "明细见附件" (details in attachment)
331
+ # But exclude the remarks column from this check
332
  contains_total = False
333
  for key, value in row.items():
334
+ # Skip if this is a remarks column
335
+ key_lower = key.lower()
336
+ if any(term in key_lower for term in ["备注", "remarks", "note", "notes"]):
337
+ continue # Skip remarks column
338
+
339
+ if isinstance(value, str) and ("小计" in value or "总金额" in value or "合计" in value or "折扣" in value or "明细见附件" in value):
340
  contains_total = True
341
  break
342
 
 
346
  # Check potential serial number columns (use both Chinese and English variants)
347
  serial_number = None
348
  for column in row:
349
+ if any(term in column.lower() for term in ["序号"]):
350
  serial_number = row[column]
351
  break
352
 
 
365
  # If we couldn't find a serial number column, keep the row
366
  filtered_table_data.append(row)
367
 
368
+ print(f"Table process_long_table output: {filtered_table_data}")
369
+
370
+ # Remove duplicate columns (ending with _2, _3, etc.)
371
+ filtered_table_data = merge_duplicate_columns(filtered_table_data)
372
+
373
  return filtered_table_data
374
 
375
  def identify_table_type_and_header_row(rows):
 
435
  if long_table_data:
436
  table_data[f"long_table_{table_index}"] = long_table_data
437
  continue
438
+
439
+ # # Print the first row's cell texts for debugging
440
+ # header_cells = rows[0].findall('.//w:tc', NS)
441
+ # header_texts = ["|".join(extract_text_from_cell(cell)) for cell in header_cells]
442
+ # print(f"Table {table_index} header: {header_texts}")
443
 
444
  return table_data, table_paragraphs
445
 
 
493
  return json.dumps(extracted_data, ensure_ascii=False, indent=4)
494
 
495
 
496
+ def extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
497
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
498
 
499
  # Step 1: Convert JSON string to Python dictionary
 
522
  Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
523
 
524
  {
525
+ "合同编号": 如果合同编号出现多次,只需填一个,不要重复,优先填写有"-"的合同编号
526
  "接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
527
  "Recipient":
528
  "接收地": (注意:不是交货地点是目的港,只写中文,英文写在 place of receipt)
 
611
 
612
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
+ def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
616
  """
617
  Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
618
+ Set fuzzy=False to use direct string matching for mapping.
619
  """
620
 
621
  # If price_list is empty, return an empty list
 
669
  cleaned_headers.append(header.strip())
670
 
671
  return cleaned_headers
672
+
 
 
 
673
  # Define our target fields from the Pydantic model
674
  target_fields = [
675
  "序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
 
680
  # Hardcoded mapping dictionary
681
  hardcoded_mapping = {
682
  # 序号 mappings
683
+ "序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号", "序号 no.:"],
684
  # 名称 mappings
685
+ "名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name", "货描", "commodity",],
686
  # 名称(英文) mappings
687
  "名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
688
  # 品牌 mappings
 
697
  # 单位 mappings
698
  "���位": ["单位 unit", "单位unit", "unit", "单位"],
699
  # 单价 mappings
700
+ "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price(cny)",
701
+ "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)",
702
+ "价格 price", "价格price", "价格",
703
+ "美元单价"],
704
  # 总价 mappings
705
  "总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
706
+ "总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)",
707
+ "总额 total amount (cny)", "总额", "总额 total amount","美元总价"],
708
  # 几郎单价 mappings
709
  "几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
710
+ "几郎单价(元)", "单价(几郎)","单价 unit price (gnf)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)", "几郎单价 unit price(gnf)"],
711
  # 几郎总价 mappings
712
  "几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
713
+ "几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)", "几郎总价 amount(gnf)","总额 total amount (gnf)"],
714
  # 备注 mappings
715
  "备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
716
  # 计划来源 mappings
717
  "计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
718
+ "计划来源 planned source", "计划来源planned source", "planned source", "计划来源","计划号 plan no."]
719
  }
720
 
 
 
 
 
721
  # Clean the extracted headers first
722
  cleaned_extracted_headers = clean_header_spaces(extracted_headers)
 
723
  # Clean all possible headers in the hardcoded mapping
724
  cleaned_hardcoded_mapping = {
725
  std_field: [clean_header_spaces([h])[0] for h in possible_headers]
726
  for std_field, possible_headers in hardcoded_mapping.items()
727
  }
728
 
729
+ # Fuzzy matching function
730
+ def fuzzy_match_header(header, possible_headers, threshold=70):
731
+ if not possible_headers:
732
+ return None, 0
733
+ best_match = process.extractOne(header, possible_headers, scorer=fuzz.ratio)
734
+ if best_match and best_match[1] >= threshold:
735
+ return best_match[0], best_match[1]
736
+ else:
737
+ return None, 0
738
+
739
+ # Try to map headers using hardcoded mapping (fuzzy or direct)
740
+ standard_field_mapping = {}
741
+ unmapped_headers = []
742
+
743
+ if fuzzy:
744
+ print("\n🔍 Fuzzy Hardcoded Mapping Results:")
745
+ else:
746
+ print("\n🔍 Direct Hardcoded Mapping Results:")
747
+
748
  print("-" * 50)
749
+
750
  for header in cleaned_extracted_headers:
751
  header_mapped = False
752
+ if fuzzy:
753
+ best_match_score = 0
754
+ best_match_field = None
755
+ best_match_header = None
756
+ for std_field, possible_headers in cleaned_hardcoded_mapping.items():
757
+ if std_field in standard_field_mapping:
758
+ continue
759
+ matched_header, score = fuzzy_match_header(header, possible_headers, threshold=70)
760
+ if matched_header and score > best_match_score:
761
+ best_match_score = score
762
+ best_match_field = std_field
763
+ best_match_header = matched_header
764
+ if best_match_field and best_match_score >= 70:
765
+ standard_field_mapping[best_match_field] = header
766
  header_mapped = True
767
+ print(f"✅ {best_match_field} -> {header} (score: {best_match_score})")
768
+ else:
769
+ for std_field, possible_headers in cleaned_hardcoded_mapping.items():
770
+ if std_field in standard_field_mapping:
771
+ continue
772
+ if header in possible_headers:
773
+ standard_field_mapping[std_field] = header
774
+ header_mapped = True
775
+ print(f"✅ {std_field} -> {header}")
776
+ break
777
  if not header_mapped:
778
  unmapped_headers.append(header)
779
  print(f"❌ No match found for: {header}")
 
931
  # Find the last Chinese character position
932
  last_chinese_pos = chinese_positions[-1]
933
 
934
+ # Look for the best split point that preserves brackets and punctuation
935
+ split_pos = last_chinese_pos + 1
936
 
937
+ # Check if there are brackets or parentheses that should be kept together
938
+ # Look ahead to see if there are closing brackets that belong to the Chinese part
939
+ remaining_text = text[split_pos:]
940
+
941
+ # If the remaining text starts with closing brackets/parentheses, include them in the Chinese part
942
+ # This handles both Chinese brackets () and English brackets () that belong to Chinese text
943
+ if remaining_text:
944
+ # Check for closing brackets that should stay with Chinese
945
+ # Use raw string to avoid escape sequence warning
946
+ closing_brackets = ')】」』》〉""''()]'
947
+ if remaining_text[0] in closing_brackets:
948
+ # Find how many closing brackets we have
949
+ bracket_count = 0
950
+ for char in remaining_text:
951
+ if char in closing_brackets:
952
+ bracket_count += 1
953
+ else:
954
+ break
955
+ split_pos += bracket_count
956
+
957
+ # Everything up to the split point is Chinese
958
+ chinese_part = text[:split_pos].strip()
959
+
960
+ # Everything after the split point is English
961
+ english_part = text[split_pos:].strip()
962
+
963
+ # Clean up the parts
964
+ # Remove any trailing Chinese punctuation from English part if it doesn't make sense
965
+ if english_part:
966
+ # If English part starts with Chinese punctuation that doesn't belong, move it to Chinese
967
+ chinese_punct_start = re.match(r'^[、,。;:!?]+', english_part)
968
+ if chinese_punct_start:
969
+ chinese_part += chinese_punct_start.group()
970
+ english_part = english_part[len(chinese_punct_start.group()):].strip()
971
 
972
  # If English part doesn't actually contain English letters, treat it as empty
973
  if not re.search(r'[a-zA-Z]', english_part):
 
1000
  new_row["名称"] = chinese
1001
  if english:
1002
  new_row["名称(英文)"] = english
1003
+ # print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
1004
  else:
1005
  # Just set the name directly
1006
  new_row["名称"] = str(value)
 
1019
  # Clean the header for comparison
1020
  cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
1021
 
1022
+ # Check if this maps to a standard field using fuzzy matching
1023
  matched_field = None
1024
+ best_match_score = 0
1025
+
1026
  for std_field, mapped_header in standard_field_mapping.items():
1027
  # Skip if mapped_header is None
1028
  if mapped_header is None:
1029
  continue
1030
 
1031
+ # Use fuzzy matching for more flexible comparison
1032
+ score = fuzz.ratio(cleaned_header.lower().strip(), mapped_header.lower().strip())
1033
+
1034
+ if score > best_match_score and score >= 80: # High threshold for data processing
1035
+ best_match_score = score
1036
  matched_field = std_field
 
1037
 
1038
  # If we found a mapping, use it (but don't overwrite name fields)
1039
  if matched_field:
 
1067
  json.dump(transformed_data, f, ensure_ascii=False, indent=4)
1068
  print(f"✅ Saved to {json_name}")
1069
 
1070
+ # Handle edge cases (including duplicate column merging) before returning
1071
+ transformed_data = handle_edge_cases(transformed_data)
1072
+
1073
  return transformed_data
1074
 
1075
+
1076
  def json_to_excel(contract_summary, json_data, excel_path):
1077
  """Converts extracted JSON tables to an Excel file."""
1078
 
 
1097
  price_keywords = [
1098
  "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
1099
  "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
1100
+ "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
1101
+ "货描", "commodity",
1102
  ]
1103
+ last_price_list_table = None
1104
+
1105
+ # Get all long tables and sort them by key to ensure we process them in order
1106
+ long_tables = [(key, table) for key, table in extracted_data.items()
1107
+ if "long_table" in key and isinstance(table, list) and table]
1108
+ long_tables.sort(key=lambda x: x[0]) # Sort by key to maintain order
1109
+
1110
+ for key, table in long_tables:
1111
+
1112
+ headers = list(table[0].keys())
1113
+
1114
+ match_count = 0
1115
+ for header in headers:
1116
+ header_lower = header.lower()
1117
+ # Use fuzzy matching for keyword detection
1118
+ for keyword in price_keywords:
1119
+ if fuzz.partial_ratio(header_lower, keyword.lower()) >= 70:
1120
  match_count += 1
1121
+ break # Found a match for this header, move to next
1122
+
1123
+ if match_count >= min_matches:
1124
+ last_price_list_table = table # Keep the last table that meets criteria
1125
+
1126
+ return last_price_list_table
1127
+
1128
+
1129
+ #--- Handle Edge Cases ------------------------------
1130
+
1131
+ def handle_weight_conversion_edge_case(transformed_data):
1132
+ """
1133
+ Handles the edge case where converted weight is in '其他' field.
1134
+ If found, replaces quantity and unit with the converted weight values.
1135
+ Extracts unit from the bracket in the column header.
1136
+ """
1137
+ for row in transformed_data:
1138
+ if "其他" not in row or not isinstance(row["其他"], dict):
1139
+ continue
1140
+
1141
+ other_fields = row["其他"]
1142
+
1143
+ # Look for weight conversion column with various possible names
1144
+ weight_key = None
1145
+ weight_patterns = [
1146
+ r"换算重量(吨)",
1147
+ r"converted weight(t)",
1148
+ r"换算重量",
1149
+ r"converted weight",
1150
+ r"重量换算",
1151
+ r"weight conversion"
1152
+ ]
1153
+
1154
+ for key in other_fields:
1155
+ # Check if any pattern is contained within the key
1156
+ if any(re.search(pattern, key, re.IGNORECASE) for pattern in weight_patterns):
1157
+ weight_key = key
1158
+ break
1159
+
1160
+ if weight_key and other_fields[weight_key]:
1161
+ try:
1162
+ # Try to convert to float to ensure it's a valid number
1163
+ weight_value = float(other_fields[weight_key])
1164
+
1165
+ # Only replace if the weight value is valid
1166
+ if weight_value > 0:
1167
+ # Store original values in case we need to revert
1168
+ original_quantity = row.get("数量", "")
1169
+ original_unit = row.get("单位", "")
1170
+
1171
+ # Extract unit from the bracket in the column header
1172
+ unit = "吨" # default unit
1173
+ bracket_match = re.search(r'[((]([^))]+)[))]', weight_key)
1174
+ if bracket_match:
1175
+ unit = bracket_match.group(1).strip()
1176
+ # Clean up the unit (remove any extra text)
1177
+ unit = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', unit)
1178
+
1179
+ # Replace with converted weight
1180
+ row["数量"] = str(weight_value)
1181
+ row["单位"] = unit
1182
+
1183
+ # Log the conversion
1184
+ print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")
1185
+
1186
+ # Remove the weight field from other_fields
1187
+ del other_fields[weight_key]
1188
+ except (ValueError, TypeError):
1189
+ # If conversion fails, log and skip
1190
+ print(f"Warning: Invalid weight value '{other_fields[weight_key]}' in row")
1191
+ continue
1192
+
1193
+ return transformed_data
1194
+
1195
def handle_edge_cases(transformed_data):
    """
    Apply every known edge-case fix to the transformed table data.

    Currently covers:
    1. Weight conversion pulled out of the '其他' (other) field.
    2. Removal of duplicate columns produced by spanning header cells.

    Returns the (possibly mutated) list of row dicts.
    """
    # Run each handler in turn, threading the data through the pipeline.
    for fix in (handle_weight_conversion_edge_case, merge_duplicate_columns):
        transformed_data = fix(transformed_data)

    return transformed_data
+
1210
def merge_duplicate_columns(transformed_data):
    """
    Drop duplicate columns created by column-spanning header cells.

    Any column whose name ends in an ``_<number>`` suffix (``_2``, ``_3``, …)
    is treated as a duplicate and removed from every row. The input list is
    mutated in place and also returned.
    """
    if not transformed_data:
        return transformed_data

    # Collect every duplicate-suffixed column name seen across all rows.
    dup_pattern = re.compile(r'^.+_\d+$')
    duplicate_columns = {
        column
        for row in transformed_data
        for column in row
        if dup_pattern.match(column)
    }

    # Strip the duplicates out of each row (if there were any at all).
    if duplicate_columns:
        print(f"🗑️ Removing duplicate columns: {sorted(duplicate_columns)}")
        for row in transformed_data:
            for dup_col in duplicate_columns:
                row.pop(dup_col, None)

    return transformed_data
1237
 
 
1238
 
1239
  #--- Extract PO ------------------------------
1240
 
 
1267
  extracted_data_dict = json.loads(extracted_data)
1268
  price_list_table = find_price_list_table(extracted_data_dict)
1269
 
1270
+ # Add the combined price list table to the extracted data
1271
  if price_list_table:
1272
+ # Remove all long_table keys that were used to create the price list
1273
+ keys_to_remove = [key for key in extracted_data_dict.keys() if "long_table" in key]
1274
+ for key in keys_to_remove:
1275
+ del extracted_data_dict[key]
1276
+
1277
+ # Add the combined price list table
1278
+ extracted_data_dict["price_list"] = price_list_table
1279
+
1280
  # Update the extracted_data string with proper formatting
1281
  extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1282
  else:
 
1295
  # Step 3: Process JSON with OpenAI to get structured output
1296
  print("Processing Contract Summary data with AI...")
1297
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1298
+ contract_summary = extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1299
 
1300
  # Process the price list
1301
  print("Processing Price List data with AI...")
1302
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1303
+ price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename, fuzzy=True)
1304
 
1305
  # Step 4: Combine contract summary and long table data into a single JSON object
1306
  print("Combining AI Generated JSON with Extracted Data...")
 
1319
 
1320
  # Example Usage
1321
 
1322
+ # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
1323
+ # print(extract_po(r"UAT Contracts\修改后合同\GN-CGS202410-AMC-169BJ 柳工设备配件采购合同-广西柳工.docx"))
1324
 
1325
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1326
 
1327
  # Gradio Interface ------------------------------
1328
 
1329
+ # import gradio as gr
1330
+ # from gradio.themes.base import Base
1331
+
1332
+ # interface = gr.Interface(
1333
+ # fn=extract_po,
1334
+ # title="PO Extractor 买卖合同数据提取",
1335
+ # inputs=gr.File(label="买卖合同 (.docx)"),
1336
+ # outputs=gr.Json(label="提取结果"),
1337
+ # flagging_mode="never",
1338
+ # theme=Base()
1339
+ # )
1340
+
1341
+ # interface.launch(show_error=True)
1342
+
1343
+
1344
+
1345
 
 
 
 
 
 
 
 
 
1346