MikeMai commited on
Commit
2720901
·
verified ·
1 Parent(s): a4dc2f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +301 -149
app.py CHANGED
@@ -18,6 +18,10 @@ from pydantic import BaseModel, Field, ValidationError, RootModel
18
  from typing import List, Optional
19
 
20
 
 
 
 
 
21
  HF_API_KEY = os.getenv("HF_API_KEY")
22
 
23
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
@@ -71,7 +75,7 @@ def extract_text_from_cell(cell):
71
  return lines # Return list of lines to preserve line breaks
72
 
73
  def clean_spaces(text):
74
- """
75
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
76
  Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
77
  """
@@ -268,6 +272,10 @@ def process_long_table(rows):
268
  cells = row.findall('.//w:tc', NS)
269
  running_index = 0
270
 
 
 
 
 
271
  for cell in cells:
272
  cell_text = " ".join(extract_text_from_cell(cell))
273
 
@@ -319,10 +327,16 @@ def process_long_table(rows):
319
  filtered_table_data = []
320
  for row in cleaned_table_data:
321
 
322
- # Check if any cell contains "合计" (total) or "折扣" (discount)
 
323
  contains_total = False
324
  for key, value in row.items():
325
- if isinstance(value, str) and ("合计" in value or "折扣" in value):
 
 
 
 
 
326
  contains_total = True
327
  break
328
 
@@ -332,7 +346,7 @@ def process_long_table(rows):
332
  # Check potential serial number columns (use both Chinese and English variants)
333
  serial_number = None
334
  for column in row:
335
- if any(term in column for term in ["序号"]):
336
  serial_number = row[column]
337
  break
338
 
@@ -351,6 +365,11 @@ def process_long_table(rows):
351
  # If we couldn't find a serial number column, keep the row
352
  filtered_table_data.append(row)
353
 
 
 
 
 
 
354
  return filtered_table_data
355
 
356
  def identify_table_type_and_header_row(rows):
@@ -416,6 +435,11 @@ def extract_tables(root):
416
  if long_table_data:
417
  table_data[f"long_table_{table_index}"] = long_table_data
418
  continue
 
 
 
 
 
419
 
420
  return table_data, table_paragraphs
421
 
@@ -469,7 +493,7 @@ def xml_to_json(xml_content, save_json=False, json_filename="extracted_data.json
469
  return json.dumps(extracted_data, ensure_ascii=False, indent=4)
470
 
471
 
472
- def deepseek_extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
473
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
474
 
475
  # Step 1: Convert JSON string to Python dictionary
@@ -498,7 +522,7 @@ def deepseek_extract_contract_summary(json_data, save_json=False, json_filename=
498
  Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
499
 
500
  {
501
- "合同编号":
502
  "接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
503
  "Recipient":
504
  "接收地": (注意:不是交货地点是目的港,只写中文,英文写在 place of receipt)
@@ -587,84 +611,11 @@ Contract data in JSON format:""" + f"""
587
 
588
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
589
 
590
- def handle_weight_conversion_edge_case(transformed_data):
591
- """
592
- Handles the edge case where converted weight is in '其他' field.
593
- If found, replaces quantity and unit with the converted weight values.
594
- Extracts unit from the bracket in the column header.
595
- """
596
- for row in transformed_data:
597
- if "其他" not in row or not isinstance(row["其他"], dict):
598
- continue
599
-
600
- other_fields = row["其他"]
601
-
602
- # Look for weight conversion column with various possible names
603
- weight_key = None
604
- weight_patterns = [
605
- r"换算重量(吨)",
606
- r"converted weight(t)",
607
- r"换算重量",
608
- r"converted weight",
609
- r"重量换算",
610
- r"weight conversion"
611
- ]
612
-
613
- for key in other_fields:
614
- # Check if any pattern is contained within the key
615
- if any(re.search(pattern, key, re.IGNORECASE) for pattern in weight_patterns):
616
- weight_key = key
617
- break
618
-
619
- if weight_key and other_fields[weight_key]:
620
- try:
621
- # Try to convert to float to ensure it's a valid number
622
- weight_value = float(other_fields[weight_key])
623
-
624
- # Only replace if the weight value is valid
625
- if weight_value > 0:
626
- # Store original values in case we need to revert
627
- original_quantity = row.get("数量", "")
628
- original_unit = row.get("单位", "")
629
-
630
- # Extract unit from the bracket in the column header
631
- unit = "吨" # default unit
632
- bracket_match = re.search(r'[((]([^))]+)[))]', weight_key)
633
- if bracket_match:
634
- unit = bracket_match.group(1).strip()
635
- # Clean up the unit (remove any extra text)
636
- unit = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', unit)
637
-
638
- # Replace with converted weight
639
- row["数量"] = str(weight_value)
640
- row["单位"] = unit
641
-
642
- # Log the conversion
643
- print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")
644
-
645
- # Remove the weight field from other_fields
646
- del other_fields[weight_key]
647
- except (ValueError, TypeError):
648
- # If conversion fails, log and skip
649
- print(f"Warning: Invalid weight value '{other_fields[weight_key]}' in row")
650
- continue
651
-
652
- return transformed_data
653
-
654
- def handle_edge_cases(transformed_data):
655
- """
656
- Main function to handle all edge cases in the transformed data.
657
- Currently handles:
658
- 1. Weight conversion from '其他' field
659
- """
660
- # Handle weight conversion edge case
661
- transformed_data = handle_weight_conversion_edge_case(transformed_data)
662
-
663
- return transformed_data
664
 
665
- def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
666
  """
667
  Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
 
668
  """
669
 
670
  # If price_list is empty, return an empty list
@@ -718,10 +669,7 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
718
  cleaned_headers.append(header.strip())
719
 
720
  return cleaned_headers
721
-
722
- # Apply the cleaning function to extracted headers
723
- extracted_headers = clean_header_spaces(extracted_headers)
724
-
725
  # Define our target fields from the Pydantic model
726
  target_fields = [
727
  "序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
@@ -732,9 +680,9 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
732
  # Hardcoded mapping dictionary
733
  hardcoded_mapping = {
734
  # 序号 mappings
735
- "序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号"],
736
  # 名称 mappings
737
- "名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
738
  # 名称(英文) mappings
739
  "名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
740
  # 品牌 mappings
@@ -749,47 +697,83 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
749
  # 单位 mappings
750
  "���位": ["单位 unit", "单位unit", "unit", "单位"],
751
  # 单价 mappings
752
- "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price",
753
- "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)"],
 
 
754
  # 总价 mappings
755
  "总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
756
- "总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)"],
 
757
  # 几郎单价 mappings
758
  "几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
759
- "几郎单价(元)", "单价(几郎)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)", "几郎单价 unit price(gnf)"],
760
  # 几郎总价 mappings
761
  "几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
762
- "几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)", "几郎总价 amount(gnf)"],
763
  # 备注 mappings
764
  "备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
765
  # 计划来源 mappings
766
  "计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
767
- "计划来源 planned source", "计划来源planned source", "planned source", "计划来源"]
768
  }
769
 
770
- # Try to map headers using hardcoded mapping
771
- standard_field_mapping = {}
772
- unmapped_headers = []
773
-
774
  # Clean the extracted headers first
775
  cleaned_extracted_headers = clean_header_spaces(extracted_headers)
776
-
777
  # Clean all possible headers in the hardcoded mapping
778
  cleaned_hardcoded_mapping = {
779
  std_field: [clean_header_spaces([h])[0] for h in possible_headers]
780
  for std_field, possible_headers in hardcoded_mapping.items()
781
  }
782
 
783
- print("\n🔍 Hardcoded Mapping Results:")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
784
  print("-" * 50)
 
785
  for header in cleaned_extracted_headers:
786
  header_mapped = False
787
- for std_field, possible_headers in cleaned_hardcoded_mapping.items():
788
- if header in possible_headers:
789
- standard_field_mapping[std_field] = header
 
 
 
 
 
 
 
 
 
 
 
790
  header_mapped = True
791
- print(f"✅ {std_field} -> {header}")
792
- break
 
 
 
 
 
 
 
 
793
  if not header_mapped:
794
  unmapped_headers.append(header)
795
  print(f"❌ No match found for: {header}")
@@ -947,11 +931,43 @@ Do not force map 名称(英文) to 单价
947
  # Find the last Chinese character position
948
  last_chinese_pos = chinese_positions[-1]
949
 
950
- # Everything up to and including the last Chinese character is Chinese
951
- chinese_part = text[:last_chinese_pos + 1].strip()
952
 
953
- # Everything after the last Chinese character is English
954
- english_part = text[last_chinese_pos + 1:].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
 
956
  # If English part doesn't actually contain English letters, treat it as empty
957
  if not re.search(r'[a-zA-Z]', english_part):
@@ -984,7 +1000,7 @@ Do not force map 名称(英文) to 单价
984
  new_row["名称"] = chinese
985
  if english:
986
  new_row["名称(英文)"] = english
987
- print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
988
  else:
989
  # Just set the name directly
990
  new_row["名称"] = str(value)
@@ -1003,17 +1019,21 @@ Do not force map 名称(英文) to 单价
1003
  # Clean the header for comparison
1004
  cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
1005
 
1006
- # Check if this maps to a standard field
1007
  matched_field = None
 
 
1008
  for std_field, mapped_header in standard_field_mapping.items():
1009
  # Skip if mapped_header is None
1010
  if mapped_header is None:
1011
  continue
1012
 
1013
- # Make comparison more flexible by lowercasing and stripping spaces
1014
- if mapped_header.lower().strip() == cleaned_header.lower().strip():
 
 
 
1015
  matched_field = std_field
1016
- break
1017
 
1018
  # If we found a mapping, use it (but don't overwrite name fields)
1019
  if matched_field:
@@ -1047,8 +1067,12 @@ Do not force map 名称(英文) to 单价
1047
  json.dump(transformed_data, f, ensure_ascii=False, indent=4)
1048
  print(f"✅ Saved to {json_name}")
1049
 
 
 
 
1050
  return transformed_data
1051
 
 
1052
  def json_to_excel(contract_summary, json_data, excel_path):
1053
  """Converts extracted JSON tables to an Excel file."""
1054
 
@@ -1073,24 +1097,144 @@ def find_price_list_table(extracted_data, min_matches=3):
1073
  price_keywords = [
1074
  "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
1075
  "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
1076
- "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no"
 
1077
  ]
1078
- best_table = None
1079
- best_match_count = 0
1080
-
1081
- for key, table in extracted_data.items():
1082
- if "long_table" in key and isinstance(table, list) and table:
1083
- headers = list(table[0].keys())
1084
- match_count = 0
1085
- for header in headers:
1086
- header_lower = header.lower()
1087
- if any(kw in header_lower for kw in price_keywords):
 
 
 
 
 
 
 
1088
  match_count += 1
1089
- if match_count > best_match_count and match_count >= min_matches:
1090
- best_match_count = match_count
1091
- best_table = table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1092
 
1093
- return best_table
1094
 
1095
  #--- Extract PO ------------------------------
1096
 
@@ -1123,13 +1267,16 @@ def extract_po(docx_path):
1123
  extracted_data_dict = json.loads(extracted_data)
1124
  price_list_table = find_price_list_table(extracted_data_dict)
1125
 
1126
- # Rename the price list table key
1127
  if price_list_table:
1128
- # Find and rename the key containing the price list table
1129
- for key in list(extracted_data_dict.keys()):
1130
- if "long_table" in key and extracted_data_dict[key] == price_list_table:
1131
- extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
1132
- break
 
 
 
1133
  # Update the extracted_data string with proper formatting
1134
  extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1135
  else:
@@ -1148,12 +1295,12 @@ def extract_po(docx_path):
1148
  # Step 3: Process JSON with OpenAI to get structured output
1149
  print("Processing Contract Summary data with AI...")
1150
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1151
- contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1152
 
1153
  # Process the price list
1154
  print("Processing Price List data with AI...")
1155
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1156
- price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
1157
 
1158
  # Step 4: Combine contract summary and long table data into a single JSON object
1159
  print("Combining AI Generated JSON with Extracted Data...")
@@ -1172,23 +1319,28 @@ def extract_po(docx_path):
1172
 
1173
  # Example Usage
1174
 
1175
- # extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
1176
- #extract_po("UAT Contracts\GN-WCIE2025-WCSP-276BJ-稳定土拌合机配件-合同.docx")
1177
 
1178
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1179
 
1180
  # Gradio Interface ------------------------------
1181
 
1182
- import gradio as gr
1183
- from gradio.themes.base import Base
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1184
 
1185
- interface = gr.Interface(
1186
- fn=extract_po,
1187
- title="PO Extractor 买卖合同数据提取",
1188
- inputs=gr.File(label="买卖合同 (.docx)"),
1189
- outputs=gr.Json(label="提取结果"),
1190
- flagging_mode="never",
1191
- theme=Base()
1192
- )
1193
 
1194
- interface.launch(show_error=True)
 
18
  from typing import List, Optional
19
 
20
 
21
+ from fuzzywuzzy import fuzz
22
+ from fuzzywuzzy import process
23
+
24
+
25
  HF_API_KEY = os.getenv("HF_API_KEY")
26
 
27
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
 
75
  return lines # Return list of lines to preserve line breaks
76
 
77
  def clean_spaces(text):
78
+ r"""
79
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
80
  Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
81
  """
 
272
  cells = row.findall('.//w:tc', NS)
273
  running_index = 0
274
 
275
+ # Skip rows with only 1 or 2 columns (merged cells)
276
+ if len(cells) <= 2:
277
+ continue
278
+
279
  for cell in cells:
280
  cell_text = " ".join(extract_text_from_cell(cell))
281
 
 
327
  filtered_table_data = []
328
  for row in cleaned_table_data:
329
 
330
+ # Check if any cell contains "合计" (total), "折扣" (discount), or "明细见附件" (details in attachment)
331
+ # But exclude the remarks column from this check
332
  contains_total = False
333
  for key, value in row.items():
334
+ # Skip if this is a remarks column
335
+ key_lower = key.lower()
336
+ if any(term in key_lower for term in ["备注", "remarks", "note", "notes"]):
337
+ continue # Skip remarks column
338
+
339
+ if isinstance(value, str) and ("小计" in value or "总金额" in value or "合计" in value or "折扣" in value or "明细见附件" in value):
340
  contains_total = True
341
  break
342
 
 
346
  # Check potential serial number columns (use both Chinese and English variants)
347
  serial_number = None
348
  for column in row:
349
+ if any(term in column.lower() for term in ["序号"]):
350
  serial_number = row[column]
351
  break
352
 
 
365
  # If we couldn't find a serial number column, keep the row
366
  filtered_table_data.append(row)
367
 
368
+ print(f"Table process_long_table output: {filtered_table_data}")
369
+
370
+ # Remove duplicate columns (ending with _2, _3, etc.)
371
+ filtered_table_data = merge_duplicate_columns(filtered_table_data)
372
+
373
  return filtered_table_data
374
 
375
  def identify_table_type_and_header_row(rows):
 
435
  if long_table_data:
436
  table_data[f"long_table_{table_index}"] = long_table_data
437
  continue
438
+
439
+ # # Print the first row's cell texts for debugging
440
+ # header_cells = rows[0].findall('.//w:tc', NS)
441
+ # header_texts = ["|".join(extract_text_from_cell(cell)) for cell in header_cells]
442
+ # print(f"Table {table_index} header: {header_texts}")
443
 
444
  return table_data, table_paragraphs
445
 
 
493
  return json.dumps(extracted_data, ensure_ascii=False, indent=4)
494
 
495
 
496
+ def extract_contract_summary(json_data, save_json=False, json_filename="contract_summary.json"):
497
  """Sends extracted JSON data to OpenAI and returns formatted structured JSON."""
498
 
499
  # Step 1: Convert JSON string to Python dictionary
 
522
  Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
523
 
524
  {
525
+ "合同编号": 如果合同编号出现多次,只需填一个,不要重复,优先填写有"-"的合同编号
526
  "接收人": (注意:不是买家必须是接收人,不是一个公司而是一个人)
527
  "Recipient":
528
  "接收地": (注意:不是交货地点是目的港,只写中文,英文写在 place of receipt)
 
611
 
612
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
613
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
614
 
615
+ def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
616
  """
617
  Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
618
+ Set fuzzy=False to use direct string matching for mapping.
619
  """
620
 
621
  # If price_list is empty, return an empty list
 
669
  cleaned_headers.append(header.strip())
670
 
671
  return cleaned_headers
672
+
 
 
 
673
  # Define our target fields from the Pydantic model
674
  target_fields = [
675
  "序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
 
680
  # Hardcoded mapping dictionary
681
  hardcoded_mapping = {
682
  # 序号 mappings
683
+ "序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号", "序号 no.:"],
684
  # 名称 mappings
685
+ "名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name", "货描", "commodity",],
686
  # 名称(英文) mappings
687
  "名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
688
  # 品牌 mappings
 
697
  # 单位 mappings
698
  "���位": ["单位 unit", "单位unit", "unit", "单位"],
699
  # 单价 mappings
700
+ "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price(cny)",
701
+ "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)",
702
+ "价格 price", "价格price", "价格",
703
+ "美元单价"],
704
  # 总价 mappings
705
  "总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
706
+ "总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)",
707
+ "总额 total amount (cny)", "总额", "总额 total amount","美元总价"],
708
  # 几郎单价 mappings
709
  "几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
710
+ "几郎单价(元)", "单价(几郎)","单价 unit price (gnf)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)", "几郎单价 unit price(gnf)"],
711
  # 几郎总价 mappings
712
  "几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
713
+ "几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)", "几郎总价 amount(gnf)","总额 total amount (gnf)"],
714
  # 备注 mappings
715
  "备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
716
  # 计划来源 mappings
717
  "计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
718
+ "计划来源 planned source", "计划来源planned source", "planned source", "计划来源","计划号 plan no."]
719
  }
720
 
 
 
 
 
721
  # Clean the extracted headers first
722
  cleaned_extracted_headers = clean_header_spaces(extracted_headers)
 
723
  # Clean all possible headers in the hardcoded mapping
724
  cleaned_hardcoded_mapping = {
725
  std_field: [clean_header_spaces([h])[0] for h in possible_headers]
726
  for std_field, possible_headers in hardcoded_mapping.items()
727
  }
728
 
729
+ # Fuzzy matching function
730
+ def fuzzy_match_header(header, possible_headers, threshold=70):
731
+ if not possible_headers:
732
+ return None, 0
733
+ best_match = process.extractOne(header, possible_headers, scorer=fuzz.ratio)
734
+ if best_match and best_match[1] >= threshold:
735
+ return best_match[0], best_match[1]
736
+ else:
737
+ return None, 0
738
+
739
+ # Try to map headers using hardcoded mapping (fuzzy or direct)
740
+ standard_field_mapping = {}
741
+ unmapped_headers = []
742
+
743
+ if fuzzy:
744
+ print("\n🔍 Fuzzy Hardcoded Mapping Results:")
745
+ else:
746
+ print("\n🔍 Direct Hardcoded Mapping Results:")
747
+
748
  print("-" * 50)
749
+
750
  for header in cleaned_extracted_headers:
751
  header_mapped = False
752
+ if fuzzy:
753
+ best_match_score = 0
754
+ best_match_field = None
755
+ best_match_header = None
756
+ for std_field, possible_headers in cleaned_hardcoded_mapping.items():
757
+ if std_field in standard_field_mapping:
758
+ continue
759
+ matched_header, score = fuzzy_match_header(header, possible_headers, threshold=70)
760
+ if matched_header and score > best_match_score:
761
+ best_match_score = score
762
+ best_match_field = std_field
763
+ best_match_header = matched_header
764
+ if best_match_field and best_match_score >= 70:
765
+ standard_field_mapping[best_match_field] = header
766
  header_mapped = True
767
+ print(f"✅ {best_match_field} -> {header} (score: {best_match_score})")
768
+ else:
769
+ for std_field, possible_headers in cleaned_hardcoded_mapping.items():
770
+ if std_field in standard_field_mapping:
771
+ continue
772
+ if header in possible_headers:
773
+ standard_field_mapping[std_field] = header
774
+ header_mapped = True
775
+ print(f"✅ {std_field} -> {header}")
776
+ break
777
  if not header_mapped:
778
  unmapped_headers.append(header)
779
  print(f"❌ No match found for: {header}")
 
931
  # Find the last Chinese character position
932
  last_chinese_pos = chinese_positions[-1]
933
 
934
+ # Look for the best split point that preserves brackets and punctuation
935
+ split_pos = last_chinese_pos + 1
936
 
937
+ # Check if there are brackets or parentheses that should be kept together
938
+ # Look ahead to see if there are closing brackets that belong to the Chinese part
939
+ remaining_text = text[split_pos:]
940
+
941
+ # If the remaining text starts with closing brackets/parentheses, include them in the Chinese part
942
+ # This handles both Chinese brackets () and English brackets () that belong to Chinese text
943
+ if remaining_text:
944
+ # Check for closing brackets that should stay with Chinese
945
+ # Use raw string to avoid escape sequence warning
946
+ closing_brackets = ')】」』》〉""''()]'
947
+ if remaining_text[0] in closing_brackets:
948
+ # Find how many closing brackets we have
949
+ bracket_count = 0
950
+ for char in remaining_text:
951
+ if char in closing_brackets:
952
+ bracket_count += 1
953
+ else:
954
+ break
955
+ split_pos += bracket_count
956
+
957
+ # Everything up to the split point is Chinese
958
+ chinese_part = text[:split_pos].strip()
959
+
960
+ # Everything after the split point is English
961
+ english_part = text[split_pos:].strip()
962
+
963
+ # Clean up the parts
964
+ # Remove any trailing Chinese punctuation from English part if it doesn't make sense
965
+ if english_part:
966
+ # If English part starts with Chinese punctuation that doesn't belong, move it to Chinese
967
+ chinese_punct_start = re.match(r'^[、,。;:!?]+', english_part)
968
+ if chinese_punct_start:
969
+ chinese_part += chinese_punct_start.group()
970
+ english_part = english_part[len(chinese_punct_start.group()):].strip()
971
 
972
  # If English part doesn't actually contain English letters, treat it as empty
973
  if not re.search(r'[a-zA-Z]', english_part):
 
1000
  new_row["名称"] = chinese
1001
  if english:
1002
  new_row["名称(英文)"] = english
1003
+ # print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
1004
  else:
1005
  # Just set the name directly
1006
  new_row["名称"] = str(value)
 
1019
  # Clean the header for comparison
1020
  cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
1021
 
1022
+ # Check if this maps to a standard field using fuzzy matching
1023
  matched_field = None
1024
+ best_match_score = 0
1025
+
1026
  for std_field, mapped_header in standard_field_mapping.items():
1027
  # Skip if mapped_header is None
1028
  if mapped_header is None:
1029
  continue
1030
 
1031
+ # Use fuzzy matching for more flexible comparison
1032
+ score = fuzz.ratio(cleaned_header.lower().strip(), mapped_header.lower().strip())
1033
+
1034
+ if score > best_match_score and score >= 80: # High threshold for data processing
1035
+ best_match_score = score
1036
  matched_field = std_field
 
1037
 
1038
  # If we found a mapping, use it (but don't overwrite name fields)
1039
  if matched_field:
 
1067
  json.dump(transformed_data, f, ensure_ascii=False, indent=4)
1068
  print(f"✅ Saved to {json_name}")
1069
 
1070
+ # Handle edge cases (including duplicate column merging) before returning
1071
+ transformed_data = handle_edge_cases(transformed_data)
1072
+
1073
  return transformed_data
1074
 
1075
+
1076
  def json_to_excel(contract_summary, json_data, excel_path):
1077
  """Converts extracted JSON tables to an Excel file."""
1078
 
 
1097
  price_keywords = [
1098
  "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
1099
  "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
1100
+ "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
1101
+ "货描", "commodity",
1102
  ]
1103
+ last_price_list_table = None
1104
+
1105
+ # Get all long tables and sort them by key to ensure we process them in order
1106
+ long_tables = [(key, table) for key, table in extracted_data.items()
1107
+ if "long_table" in key and isinstance(table, list) and table]
1108
+ long_tables.sort(key=lambda x: x[0]) # Sort by key to maintain order
1109
+
1110
+ for key, table in long_tables:
1111
+
1112
+ headers = list(table[0].keys())
1113
+
1114
+ match_count = 0
1115
+ for header in headers:
1116
+ header_lower = header.lower()
1117
+ # Use fuzzy matching for keyword detection
1118
+ for keyword in price_keywords:
1119
+ if fuzz.partial_ratio(header_lower, keyword.lower()) >= 70:
1120
  match_count += 1
1121
+ break # Found a match for this header, move to next
1122
+
1123
+ if match_count >= min_matches:
1124
+ last_price_list_table = table # Keep the last table that meets criteria
1125
+
1126
+ return last_price_list_table
1127
+
1128
+
1129
+ #--- Handle Edge Cases ------------------------------
1130
+
1131
+ def handle_weight_conversion_edge_case(transformed_data):
1132
+ """
1133
+ Handles the edge case where converted weight is in '其他' field.
1134
+ If found, replaces quantity and unit with the converted weight values.
1135
+ Extracts unit from the bracket in the column header.
1136
+ """
1137
+ for row in transformed_data:
1138
+ if "其他" not in row or not isinstance(row["其他"], dict):
1139
+ continue
1140
+
1141
+ other_fields = row["其他"]
1142
+
1143
+ # Look for weight conversion column with various possible names
1144
+ weight_key = None
1145
+ weight_patterns = [
1146
+ r"换算重量(吨)",
1147
+ r"converted weight(t)",
1148
+ r"换算重量",
1149
+ r"converted weight",
1150
+ r"重量换算",
1151
+ r"weight conversion"
1152
+ ]
1153
+
1154
+ for key in other_fields:
1155
+ # Check if any pattern is contained within the key
1156
+ if any(re.search(pattern, key, re.IGNORECASE) for pattern in weight_patterns):
1157
+ weight_key = key
1158
+ break
1159
+
1160
+ if weight_key and other_fields[weight_key]:
1161
+ try:
1162
+ # Try to convert to float to ensure it's a valid number
1163
+ weight_value = float(other_fields[weight_key])
1164
+
1165
+ # Only replace if the weight value is valid
1166
+ if weight_value > 0:
1167
+ # Store original values in case we need to revert
1168
+ original_quantity = row.get("数量", "")
1169
+ original_unit = row.get("单位", "")
1170
+
1171
+ # Extract unit from the bracket in the column header
1172
+ unit = "吨" # default unit
1173
+ bracket_match = re.search(r'[((]([^))]+)[))]', weight_key)
1174
+ if bracket_match:
1175
+ unit = bracket_match.group(1).strip()
1176
+ # Clean up the unit (remove any extra text)
1177
+ unit = re.sub(r'[^a-zA-Z\u4e00-\u9fff]', '', unit)
1178
+
1179
+ # Replace with converted weight
1180
+ row["数量"] = str(weight_value)
1181
+ row["单位"] = unit
1182
+
1183
+ # Log the conversion
1184
+ print(f"Converted weight: {weight_value}{unit} (original: {original_quantity} {original_unit})")
1185
+
1186
+ # Remove the weight field from other_fields
1187
+ del other_fields[weight_key]
1188
+ except (ValueError, TypeError):
1189
+ # If conversion fails, log and skip
1190
+ print(f"Warning: Invalid weight value '{other_fields[weight_key]}' in row")
1191
+ continue
1192
+
1193
+ return transformed_data
1194
+
1195
def handle_edge_cases(transformed_data):
    """
    Apply every known edge-case fix to the transformed table data.

    Currently covers:
    1. Weight conversion pulled out of the '其他' (other) field.
    2. Removal of duplicate columns produced by spanning header cells.

    Returns the (possibly mutated) list of row dicts.
    """
    # Run each handler in turn, threading the data through the pipeline.
    for fix in (handle_weight_conversion_edge_case, merge_duplicate_columns):
        transformed_data = fix(transformed_data)

    return transformed_data
+
1210
def merge_duplicate_columns(transformed_data):
    """
    Drop duplicate columns created by column-spanning header cells.

    Any column whose name ends in an ``_<number>`` suffix (``_2``, ``_3``, …)
    is treated as a duplicate and removed from every row. The input list is
    mutated in place and also returned.
    """
    if not transformed_data:
        return transformed_data

    # Collect every duplicate-suffixed column name seen across all rows.
    dup_pattern = re.compile(r'^.+_\d+$')
    duplicate_columns = {
        column
        for row in transformed_data
        for column in row
        if dup_pattern.match(column)
    }

    # Strip the duplicates out of each row (if there were any at all).
    if duplicate_columns:
        print(f"🗑️ Removing duplicate columns: {sorted(duplicate_columns)}")
        for row in transformed_data:
            for dup_col in duplicate_columns:
                row.pop(dup_col, None)

    return transformed_data
1237
 
 
1238
 
1239
  #--- Extract PO ------------------------------
1240
 
 
1267
  extracted_data_dict = json.loads(extracted_data)
1268
  price_list_table = find_price_list_table(extracted_data_dict)
1269
 
1270
+ # Add the combined price list table to the extracted data
1271
  if price_list_table:
1272
+ # Remove all long_table keys that were used to create the price list
1273
+ keys_to_remove = [key for key in extracted_data_dict.keys() if "long_table" in key]
1274
+ for key in keys_to_remove:
1275
+ del extracted_data_dict[key]
1276
+
1277
+ # Add the combined price list table
1278
+ extracted_data_dict["price_list"] = price_list_table
1279
+
1280
  # Update the extracted_data string with proper formatting
1281
  extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1282
  else:
 
1295
  # Step 3: Process JSON with OpenAI to get structured output
1296
  print("Processing Contract Summary data with AI...")
1297
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1298
+ contract_summary = extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1299
 
1300
  # Process the price list
1301
  print("Processing Price List data with AI...")
1302
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1303
+ price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename, fuzzy=True)
1304
 
1305
  # Step 4: Combine contract summary and long table data into a single JSON object
1306
  print("Combining AI Generated JSON with Extracted Data...")
 
1319
 
1320
  # Example Usage
1321
 
1322
+ # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
1323
+ # print(extract_po(r"UAT Contracts\修改后合同\GN-CGS202410-AMC-169BJ 柳工设备配件采购合同-广西柳工.docx"))
1324
 
1325
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1326
 
1327
  # Gradio Interface ------------------------------
1328
 
1329
+ # import gradio as gr
1330
+ # from gradio.themes.base import Base
1331
+
1332
+ # interface = gr.Interface(
1333
+ # fn=extract_po,
1334
+ # title="PO Extractor 买卖合同数据提取",
1335
+ # inputs=gr.File(label="买卖合同 (.docx)"),
1336
+ # outputs=gr.Json(label="提取结果"),
1337
+ # flagging_mode="never",
1338
+ # theme=Base()
1339
+ # )
1340
+
1341
+ # interface.launch(show_error=True)
1342
+
1343
+
1344
+
1345
 
 
 
 
 
 
 
 
 
1346