MikeMai committed on
Commit
fb7be42
·
verified ·
1 Parent(s): dd620cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -12
app.py CHANGED
@@ -985,14 +985,18 @@ Do not force map 名称(英文) to 单价
985
 
986
  # Step 1: Handle name fields first - look for any field with "名称" or "name"
987
  for header, value in row.items():
 
 
 
 
988
  # Clean the header for comparison
989
- cleaned_header = re.sub(r'\s+', ' ', header).strip()
990
  header_lower = cleaned_header.lower()
991
 
992
  if ("名称" in header_lower or "name" in header_lower) and value:
993
  # If field contains both Chinese and English, separate them
994
- if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
995
- chinese, english = separate_chinese_english(value)
996
  if chinese:
997
  new_row["名称"] = chinese
998
  if english:
@@ -1000,21 +1004,29 @@ Do not force map 名称(英文) to 单价
1000
  print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
1001
  else:
1002
  # Just set the name directly
1003
- new_row["名称"] = value
1004
  break # Stop after finding first name field
1005
 
1006
  # Step 2: Fill in all other fields using standard mapping
1007
  for header, value in row.items():
 
 
 
 
1008
  # Skip empty values
1009
  if not value:
1010
  continue
1011
 
1012
  # Clean the header for comparison
1013
- cleaned_header = re.sub(r'\s+', ' ', header).strip()
1014
 
1015
  # Check if this maps to a standard field
1016
  matched_field = None
1017
  for std_field, mapped_header in standard_field_mapping.items():
 
 
 
 
1018
  # Make comparison more flexible by lowercasing and stripping spaces
1019
  if mapped_header.lower().strip() == cleaned_header.lower().strip():
1020
  matched_field = std_field
@@ -1023,13 +1035,13 @@ Do not force map 名称(英文) to 单价
1023
  # If we found a mapping, use it (but don't overwrite name fields)
1024
  if matched_field:
1025
  if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
1026
- new_row[matched_field] = value
1027
  # If no mapping found, add to other_fields
1028
  else:
1029
  # Skip name fields we already processed
1030
  header_lower = cleaned_header.lower()
1031
  if not ("名称" in header_lower or "name" in header_lower):
1032
- other_fields[header] = value
1033
 
1034
  # Add remaining fields to "其他"
1035
  if other_fields:
@@ -1167,7 +1179,7 @@ def extract_po(docx_path):
1167
 
1168
  # Example Usage
1169
 
1170
- # extract_po("GN-WARJS2504-282GC烟台嘉益钢材采购合同(1).docx")
1171
  # extract_po("EPC简明合同格式-中英对照版.docx")
1172
 
1173
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
@@ -1193,7 +1205,4 @@ interface = gr.Interface(
1193
  theme=Base()
1194
  )
1195
 
1196
- interface.launch(show_error=True)
1197
-
1198
-
1199
-
 
985
 
986
  # Step 1: Handle name fields first - look for any field with "名称" or "name"
987
  for header, value in row.items():
988
+ # Skip if header is None
989
+ if header is None:
990
+ continue
991
+
992
  # Clean the header for comparison
993
+ cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
994
  header_lower = cleaned_header.lower()
995
 
996
  if ("名称" in header_lower or "name" in header_lower) and value:
997
  # If field contains both Chinese and English, separate them
998
+ if re.search(r'[\u4e00-\u9fff]', str(value)) and re.search(r'[a-zA-Z]', str(value)):
999
+ chinese, english = separate_chinese_english(str(value))
1000
  if chinese:
1001
  new_row["名称"] = chinese
1002
  if english:
 
1004
  print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
1005
  else:
1006
  # Just set the name directly
1007
+ new_row["名称"] = str(value)
1008
  break # Stop after finding first name field
1009
 
1010
  # Step 2: Fill in all other fields using standard mapping
1011
  for header, value in row.items():
1012
+ # Skip if header is None
1013
+ if header is None:
1014
+ continue
1015
+
1016
  # Skip empty values
1017
  if not value:
1018
  continue
1019
 
1020
  # Clean the header for comparison
1021
+ cleaned_header = re.sub(r'\s+', ' ', str(header)).strip()
1022
 
1023
  # Check if this maps to a standard field
1024
  matched_field = None
1025
  for std_field, mapped_header in standard_field_mapping.items():
1026
+ # Skip if mapped_header is None
1027
+ if mapped_header is None:
1028
+ continue
1029
+
1030
  # Make comparison more flexible by lowercasing and stripping spaces
1031
  if mapped_header.lower().strip() == cleaned_header.lower().strip():
1032
  matched_field = std_field
 
1035
  # If we found a mapping, use it (but don't overwrite name fields)
1036
  if matched_field:
1037
  if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
1038
+ new_row[matched_field] = str(value)
1039
  # If no mapping found, add to other_fields
1040
  else:
1041
  # Skip name fields we already processed
1042
  header_lower = cleaned_header.lower()
1043
  if not ("名称" in header_lower or "name" in header_lower):
1044
+ other_fields[header] = str(value)
1045
 
1046
  # Add remaining fields to "其他"
1047
  if other_fields:
 
1179
 
1180
  # Example Usage
1181
 
1182
+ # extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
1183
  # extract_po("EPC简明合同格式-中英对照版.docx")
1184
 
1185
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
 
1205
  theme=Base()
1206
  )
1207
 
1208
+ interface.launch(show_error=True)