Spaces:

MikeMai
/

PO_Extractor_API

Running

App Files Files Community

MikeMai committed on May 30, 2025

Commit

fb7be42

verified ·

1 Parent(s): dd620cd

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -12

app.py CHANGED Viewed

@@ -985,14 +985,18 @@ Do not force map 名称(英文) to 单价
         # Step 1: Handle name fields first - look for any field with "名称" or "name"
         for header, value in row.items():
             # Clean the header for comparison
-            cleaned_header = re.sub(r'\s+', ' ', header).strip()
             header_lower = cleaned_header.lower()
             if ("名称" in header_lower or "name" in header_lower) and value:
                 # If field contains both Chinese and English, separate them
-                if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
-                    chinese, english = separate_chinese_english(value)
                     if chinese:
                         new_row["名称"] = chinese
                     if english:
@@ -1000,21 +1004,29 @@ Do not force map 名称(英文) to 单价
                     print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
                 else:
                     # Just set the name directly
-                    new_row["名称"] = value
                 break  # Stop after finding first name field
         # Step 2: Fill in all other fields using standard mapping
         for header, value in row.items():
             # Skip empty values
             if not value:
                 continue
             # Clean the header for comparison
-            cleaned_header = re.sub(r'\s+', ' ', header).strip()
             # Check if this maps to a standard field
             matched_field = None
             for std_field, mapped_header in standard_field_mapping.items():
                 # Make comparison more flexible by lowercasing and stripping spaces
                 if mapped_header.lower().strip() == cleaned_header.lower().strip():
                     matched_field = std_field
@@ -1023,13 +1035,13 @@ Do not force map 名称(英文) to 单价
             # If we found a mapping, use it (but don't overwrite name fields)
             if matched_field:
                 if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
-                    new_row[matched_field] = value
             # If no mapping found, add to other_fields
             else:
                 # Skip name fields we already processed
                 header_lower = cleaned_header.lower()
                 if not ("名称" in header_lower or "name" in header_lower):
-                    other_fields[header] = value
         # Add remaining fields to "其他"
         if other_fields:
@@ -1167,7 +1179,7 @@ def extract_po(docx_path):
 # Example Usage
-# extract_po("GN-WARJS2504-282GC烟台嘉益钢材采购合同(1).docx")
 # extract_po("EPC简明合同格式-中英对照版.docx")
 # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管） PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根，SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价（元） Unit Price (CNY)': '106.00', '总额（元） Total Amount (CNY)': '1080.00', '几郎单价（元） Unit Price (GNF)': '16.21', '几郎总额（元） Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
@@ -1193,7 +1205,4 @@ interface = gr.Interface(
     theme=Base()
 )
-interface.launch(show_error=True)

         # Step 1: Handle name fields first - look for any field with "名称" or "name"
         for header, value in row.items():
+            # Skip if header is None
+            if header is None:
+                continue
             # Clean the header for comparison
+            cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
             header_lower = cleaned_header.lower()
             if ("名称" in header_lower or "name" in header_lower) and value:
                 # If field contains both Chinese and English, separate them
+                if re.search(r'[\u4e00-\u9fff]', str(value)) and re.search(r'[a-zA-Z]', str(value)):
+                    chinese, english = separate_chinese_english(str(value))
                     if chinese:
                         new_row["名称"] = chinese
                     if english:
                     print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
                 else:
                     # Just set the name directly
+                    new_row["名称"] = str(value)
                 break  # Stop after finding first name field
         # Step 2: Fill in all other fields using standard mapping
         for header, value in row.items():
+            # Skip if header is None
+            if header is None:
+                continue
             # Skip empty values
             if not value:
                 continue
             # Clean the header for comparison
+            cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
             # Check if this maps to a standard field
             matched_field = None
             for std_field, mapped_header in standard_field_mapping.items():
+                # Skip if mapped_header is None
+                if mapped_header is None:
+                    continue
                 # Make comparison more flexible by lowercasing and stripping spaces
                 if mapped_header.lower().strip() == cleaned_header.lower().strip():
                     matched_field = std_field
             # If we found a mapping, use it (but don't overwrite name fields)
             if matched_field:
                 if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
+                    new_row[matched_field] = str(value)
             # If no mapping found, add to other_fields
             else:
                 # Skip name fields we already processed
                 header_lower = cleaned_header.lower()
                 if not ("名称" in header_lower or "name" in header_lower):
+                    other_fields[header] = str(value)
         # Add remaining fields to "其他"
         if other_fields:
 # Example Usage
+# extract_po("test-contracts/GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
 # extract_po("EPC简明合同格式-中英对照版.docx")
 # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管） PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根，SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价（元） Unit Price (CNY)': '106.00', '总额（元） Total Amount (CNY)': '1080.00', '几郎单价（元） Unit Price (GNF)': '16.21', '几郎总额（元） Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
     theme=Base()
 )
+interface.launch(show_error=True)