MikeMai committed on
Commit
951ce76
·
verified ·
1 Parent(s): eec7e13

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -40
app.py CHANGED
@@ -139,9 +139,9 @@ def extract_key_value_pairs(text, target_dict=None):
139
 
140
  # --- Table Processing Functions ---
141
 
142
- def process_single_column_table(rows):
143
- """Processes a single-column table and returns the extracted lines as a list."""
144
- single_column_data = []
145
 
146
  for row in rows:
147
  cells = row.findall('.//w:tc', NS)
@@ -149,9 +149,9 @@ def process_single_column_table(rows):
149
  cell_lines = extract_text_from_cell(cells[0]) # Extract all lines from the cell
150
 
151
  # Append each line directly to the list without splitting
152
- single_column_data.extend(cell_lines)
153
 
154
- return single_column_data # Return the list of extracted lines
155
 
156
  def process_buyer_seller_table(rows):
157
  """Processes a two-column buyer-seller table into a structured dictionary using the first row as keys."""
@@ -365,8 +365,6 @@ def process_long_table(rows):
365
  # If we couldn't find a serial number column, keep the row
366
  filtered_table_data.append(row)
367
 
368
- print(f"Table process_long_table output: {filtered_table_data}")
369
-
370
  # Remove duplicate columns (ending with _2, _3, etc.)
371
  filtered_table_data = merge_duplicate_columns(filtered_table_data)
372
 
@@ -374,23 +372,25 @@ def process_long_table(rows):
374
 
375
  def identify_table_type_and_header_row(rows):
376
  """Identify table type and the index of the header row."""
377
- header_keywords = ["名称", "Name", "规格", "Unit", "Quantity", "单价", "总价", "Remarks"]
378
  for i, row in enumerate(rows):
379
  num_cells = len(row.findall('.//w:tc', NS))
380
  if num_cells > 1:
381
- cell_texts = " ".join([" ".join(extract_text_from_cell(cell)) for cell in row.findall('.//w:tc', NS)])
382
- if any(keyword in cell_texts for keyword in header_keywords):
383
- # Check for buyer-seller or summary table
384
- if num_cells == 2:
385
- if all(len(r.findall('.//w:tc', NS)) == 2 for r in rows):
 
 
386
  return "buyer_seller", i
387
  else:
388
- return "summary", i
389
  else:
390
- return "long_table", i
 
 
 
391
  # Fallbacks
392
- if all(len(row.findall('.//w:tc', NS)) == 1 for row in rows):
393
- return "single_column", 0
394
  return "unknown", 0
395
 
396
  def extract_tables(root):
@@ -431,10 +431,10 @@ def extract_tables(root):
431
  for paragraph in elem.findall('.//w:p', NS):
432
  table_paragraphs.add(paragraph)
433
  table_type, header_row_index = identify_table_type_and_header_row(rows)
434
- if table_type == "single_column":
435
- single_column_data = process_single_column_table(rows)
436
- if single_column_data:
437
- table_data[f"table_{table_index}_single_column"] = single_column_data
438
  table_index += 1
439
  continue
440
  elif table_type == "buyer_seller":
@@ -455,13 +455,7 @@ def extract_tables(root):
455
  table_data[f"long_table_{table_index}"] = long_table_data
456
  table_index += 1
457
  continue
458
- else:
459
- # fallback: try to process as long table from first multi-column row
460
- long_table_data = process_long_table(rows[header_row_index:])
461
- if long_table_data:
462
- table_data[f"long_table_{table_index}"] = long_table_data
463
- table_index += 1
464
- continue
465
  return table_data, table_paragraphs
466
 
467
  # --- Non-Table Processing Functions ---
@@ -593,6 +587,8 @@ Contract data in JSON format:""" + f"""
593
  # Clean 合同编号 by removing all contents in brackets including the brackets themselves
594
  if "合同编号" in contract_json and contract_json["合同编号"]:
595
  contract_json["合同编号"] = re.sub(r'[\((].*?[\))]', '', contract_json["合同编号"]).strip()
 
 
596
 
597
  validated_data = ContractSummary.model_validate(contract_json)
598
 
@@ -630,10 +626,6 @@ Contract data in JSON format:""" + f"""
630
  print("⚠️ All attempts failed, returning empty model")
631
  empty_data = ContractSummary().model_dump(by_alias=True)
632
 
633
- # Clean 合同编号 by removing all contents in brackets including the brackets themselves
634
- if "合同编号" in empty_data and empty_data["合同编号"]:
635
- empty_data["合同编号"] = re.sub(r'[\((].*?[\))]', '', empty_data["合同编号"]).strip()
636
-
637
  empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
638
 
639
  if save_json:
@@ -651,6 +643,7 @@ def find_price_list_table(extracted_data, min_matches=3):
651
  "货描", "commodity",
652
  ]
653
  last_price_list_table = None
 
654
 
655
  # Get all long tables and sort them by key to ensure we process them in order
656
  long_tables = [(key, table) for key, table in extracted_data.items()
@@ -672,8 +665,9 @@ def find_price_list_table(extracted_data, min_matches=3):
672
 
673
  if match_count >= min_matches:
674
  last_price_list_table = table # Keep the last table that meets criteria
 
675
 
676
- return last_price_list_table
677
 
678
 
679
  def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
@@ -1295,14 +1289,13 @@ def extract_po(docx_path):
1295
  # Find and rename the price list table before contract summary processing
1296
  print("Identifying Price List table...")
1297
  extracted_data_dict = json.loads(extracted_data)
1298
- price_list_table = find_price_list_table(extracted_data_dict)
1299
 
1300
  # Add the combined price list table to the extracted data
1301
  if price_list_table:
1302
- # Remove all long_table keys that were used to create the price list
1303
- keys_to_remove = [key for key in extracted_data_dict.keys() if "long_table" in key]
1304
- for key in keys_to_remove:
1305
- del extracted_data_dict[key]
1306
 
1307
  # Add the combined price list table
1308
  extracted_data_dict["price_list"] = price_list_table
@@ -1314,7 +1307,7 @@ def extract_po(docx_path):
1314
  extracted_data_dict["price_list"] = []
1315
  extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1316
 
1317
- print(f"✅ Extracted Data: {extracted_data}")
1318
 
1319
  # Create a copy of the data with only first row of price list for contract summary
1320
  contract_summary_dict = json.loads(extracted_data)
@@ -1322,6 +1315,8 @@ def extract_po(docx_path):
1322
  contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
1323
  contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
1324
 
 
 
1325
  # Step 3: Process JSON with OpenAI to get structured output
1326
  print("Processing Contract Summary data with AI...")
1327
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
@@ -1346,7 +1341,7 @@ def extract_po(docx_path):
1346
  # Ensure BytesIO is properly closed
1347
  if 'docx_bytes' in locals():
1348
  docx_bytes.close()
1349
-
1350
  # Example Usage
1351
 
1352
  # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
 
139
 
140
  # --- Table Processing Functions ---
141
 
142
+ def process_unknown_table(rows):
143
+ """Processes unknown tables and returns the extracted lines as a list."""
144
+ unknown_table_data = []
145
 
146
  for row in rows:
147
  cells = row.findall('.//w:tc', NS)
 
149
  cell_lines = extract_text_from_cell(cells[0]) # Extract all lines from the cell
150
 
151
  # Append each line directly to the list without splitting
152
+ unknown_table_data.extend(cell_lines)
153
 
154
+ return unknown_table_data # Return the list of extracted lines
155
 
156
  def process_buyer_seller_table(rows):
157
  """Processes a two-column buyer-seller table into a structured dictionary using the first row as keys."""
 
365
  # If we couldn't find a serial number column, keep the row
366
  filtered_table_data.append(row)
367
 
 
 
368
  # Remove duplicate columns (ending with _2, _3, etc.)
369
  filtered_table_data = merge_duplicate_columns(filtered_table_data)
370
 
 
372
 
373
  def identify_table_type_and_header_row(rows):
374
  """Identify table type and the index of the header row."""
 
375
  for i, row in enumerate(rows):
376
  num_cells = len(row.findall('.//w:tc', NS))
377
  if num_cells > 1:
378
+ # Check for buyer-seller or summary table based on structure only
379
+ if num_cells == 2:
380
+ if all(len(r.findall('.//w:tc', NS)) == 2 for r in rows):
381
+ # Check if it contains buyer/seller keywords
382
+ cell_texts = " ".join([" ".join(extract_text_from_cell(cell)) for cell in row.findall('.//w:tc', NS)])
383
+ buyer_seller_keywords = ["买方", "buyer", "卖方", "seller"]
384
+ if any(keyword.lower() in cell_texts.lower() for keyword in buyer_seller_keywords):
385
  return "buyer_seller", i
386
  else:
387
+ return "unknown", i
388
  else:
389
+ return "summary", i
390
+ else:
391
+ # For tables with more than 2 columns, process as long table
392
+ return "long_table", i
393
  # Fallbacks
 
 
394
  return "unknown", 0
395
 
396
  def extract_tables(root):
 
431
  for paragraph in elem.findall('.//w:p', NS):
432
  table_paragraphs.add(paragraph)
433
  table_type, header_row_index = identify_table_type_and_header_row(rows)
434
+ if table_type == "unknown":
435
+ unknown_table_data = process_unknown_table(rows)
436
+ if unknown_table_data:
437
+ table_data[f"table_{table_index}_unknown"] = unknown_table_data
438
  table_index += 1
439
  continue
440
  elif table_type == "buyer_seller":
 
455
  table_data[f"long_table_{table_index}"] = long_table_data
456
  table_index += 1
457
  continue
458
+
 
 
 
 
 
 
459
  return table_data, table_paragraphs
460
 
461
  # --- Non-Table Processing Functions ---
 
587
  # Clean 合同编号 by removing all contents in brackets including the brackets themselves
588
  if "合同编号" in contract_json and contract_json["合同编号"]:
589
  contract_json["合同编号"] = re.sub(r'[\((].*?[\))]', '', contract_json["合同编号"]).strip()
590
+ # Remove anything after "/" (including the "/" itself)
591
+ contract_json["合同编号"] = re.sub(r'/\s*.*$', '', contract_json["合同编号"]).strip()
592
 
593
  validated_data = ContractSummary.model_validate(contract_json)
594
 
 
626
  print("⚠️ All attempts failed, returning empty model")
627
  empty_data = ContractSummary().model_dump(by_alias=True)
628
 
 
 
 
 
629
  empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
630
 
631
  if save_json:
 
643
  "货描", "commodity",
644
  ]
645
  last_price_list_table = None
646
+ last_price_list_key = None
647
 
648
  # Get all long tables and sort them by key to ensure we process them in order
649
  long_tables = [(key, table) for key, table in extracted_data.items()
 
665
 
666
  if match_count >= min_matches:
667
  last_price_list_table = table # Keep the last table that meets criteria
668
+ last_price_list_key = key # Keep the key as well
669
 
670
+ return last_price_list_table, last_price_list_key
671
 
672
 
673
  def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
 
1289
  # Find and rename the price list table before contract summary processing
1290
  print("Identifying Price List table...")
1291
  extracted_data_dict = json.loads(extracted_data)
1292
+ price_list_table, price_list_key = find_price_list_table(extracted_data_dict)
1293
 
1294
  # Add the combined price list table to the extracted data
1295
  if price_list_table:
1296
+ # Remove only the specific long_table that was used to create the price list
1297
+ if price_list_key:
1298
+ del extracted_data_dict[price_list_key]
 
1299
 
1300
  # Add the combined price list table
1301
  extracted_data_dict["price_list"] = price_list_table
 
1307
  extracted_data_dict["price_list"] = []
1308
  extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1309
 
1310
+ # print(f"✅ Extracted Data: {extracted_data}")
1311
 
1312
  # Create a copy of the data with only first row of price list for contract summary
1313
  contract_summary_dict = json.loads(extracted_data)
 
1315
  contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
1316
  contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
1317
 
1318
+ print(f"✅ Contract Summary Data: {contract_summary_data}")
1319
+
1320
  # Step 3: Process JSON with OpenAI to get structured output
1321
  print("Processing Contract Summary data with AI...")
1322
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
 
1341
  # Ensure BytesIO is properly closed
1342
  if 'docx_bytes' in locals():
1343
  docx_bytes.close()
1344
+
1345
  # Example Usage
1346
 
1347
  # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))