Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -139,9 +139,9 @@ def extract_key_value_pairs(text, target_dict=None):
|
|
| 139 |
|
| 140 |
# --- Table Processing Functions ---
|
| 141 |
|
| 142 |
-
def
|
| 143 |
-
"""Processes
|
| 144 |
-
|
| 145 |
|
| 146 |
for row in rows:
|
| 147 |
cells = row.findall('.//w:tc', NS)
|
|
@@ -149,9 +149,9 @@ def process_single_column_table(rows):
|
|
| 149 |
cell_lines = extract_text_from_cell(cells[0]) # Extract all lines from the cell
|
| 150 |
|
| 151 |
# Append each line directly to the list without splitting
|
| 152 |
-
|
| 153 |
|
| 154 |
-
return
|
| 155 |
|
| 156 |
def process_buyer_seller_table(rows):
|
| 157 |
"""Processes a two-column buyer-seller table into a structured dictionary using the first row as keys."""
|
|
@@ -365,8 +365,6 @@ def process_long_table(rows):
|
|
| 365 |
# If we couldn't find a serial number column, keep the row
|
| 366 |
filtered_table_data.append(row)
|
| 367 |
|
| 368 |
-
print(f"Table process_long_table output: {filtered_table_data}")
|
| 369 |
-
|
| 370 |
# Remove duplicate columns (ending with _2, _3, etc.)
|
| 371 |
filtered_table_data = merge_duplicate_columns(filtered_table_data)
|
| 372 |
|
|
@@ -374,23 +372,25 @@ def process_long_table(rows):
|
|
| 374 |
|
| 375 |
def identify_table_type_and_header_row(rows):
|
| 376 |
"""Identify table type and the index of the header row."""
|
| 377 |
-
header_keywords = ["名称", "Name", "规格", "Unit", "Quantity", "单价", "总价", "Remarks"]
|
| 378 |
for i, row in enumerate(rows):
|
| 379 |
num_cells = len(row.findall('.//w:tc', NS))
|
| 380 |
if num_cells > 1:
|
| 381 |
-
|
| 382 |
-
if
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
|
|
|
|
|
|
| 386 |
return "buyer_seller", i
|
| 387 |
else:
|
| 388 |
-
return "
|
| 389 |
else:
|
| 390 |
-
return "
|
|
|
|
|
|
|
|
|
|
| 391 |
# Fallbacks
|
| 392 |
-
if all(len(row.findall('.//w:tc', NS)) == 1 for row in rows):
|
| 393 |
-
return "single_column", 0
|
| 394 |
return "unknown", 0
|
| 395 |
|
| 396 |
def extract_tables(root):
|
|
@@ -431,10 +431,10 @@ def extract_tables(root):
|
|
| 431 |
for paragraph in elem.findall('.//w:p', NS):
|
| 432 |
table_paragraphs.add(paragraph)
|
| 433 |
table_type, header_row_index = identify_table_type_and_header_row(rows)
|
| 434 |
-
if table_type == "
|
| 435 |
-
|
| 436 |
-
if
|
| 437 |
-
table_data[f"table_{table_index}
|
| 438 |
table_index += 1
|
| 439 |
continue
|
| 440 |
elif table_type == "buyer_seller":
|
|
@@ -455,13 +455,7 @@ def extract_tables(root):
|
|
| 455 |
table_data[f"long_table_{table_index}"] = long_table_data
|
| 456 |
table_index += 1
|
| 457 |
continue
|
| 458 |
-
|
| 459 |
-
# fallback: try to process as long table from first multi-column row
|
| 460 |
-
long_table_data = process_long_table(rows[header_row_index:])
|
| 461 |
-
if long_table_data:
|
| 462 |
-
table_data[f"long_table_{table_index}"] = long_table_data
|
| 463 |
-
table_index += 1
|
| 464 |
-
continue
|
| 465 |
return table_data, table_paragraphs
|
| 466 |
|
| 467 |
# --- Non-Table Processing Functions ---
|
|
@@ -593,6 +587,8 @@ Contract data in JSON format:""" + f"""
|
|
| 593 |
# Clean 合同编号 by removing all contents in brackets including the brackets themselves
|
| 594 |
if "合同编号" in contract_json and contract_json["合同编号"]:
|
| 595 |
contract_json["合同编号"] = re.sub(r'[\((].*?[\))]', '', contract_json["合同编号"]).strip()
|
|
|
|
|
|
|
| 596 |
|
| 597 |
validated_data = ContractSummary.model_validate(contract_json)
|
| 598 |
|
|
@@ -630,10 +626,6 @@ Contract data in JSON format:""" + f"""
|
|
| 630 |
print("⚠️ All attempts failed, returning empty model")
|
| 631 |
empty_data = ContractSummary().model_dump(by_alias=True)
|
| 632 |
|
| 633 |
-
# Clean 合同编号 by removing all contents in brackets including the brackets themselves
|
| 634 |
-
if "合同编号" in empty_data and empty_data["合同编号"]:
|
| 635 |
-
empty_data["合同编号"] = re.sub(r'[\((].*?[\))]', '', empty_data["合同编号"]).strip()
|
| 636 |
-
|
| 637 |
empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
|
| 638 |
|
| 639 |
if save_json:
|
|
@@ -651,6 +643,7 @@ def find_price_list_table(extracted_data, min_matches=3):
|
|
| 651 |
"货描", "commodity",
|
| 652 |
]
|
| 653 |
last_price_list_table = None
|
|
|
|
| 654 |
|
| 655 |
# Get all long tables and sort them by key to ensure we process them in order
|
| 656 |
long_tables = [(key, table) for key, table in extracted_data.items()
|
|
@@ -672,8 +665,9 @@ def find_price_list_table(extracted_data, min_matches=3):
|
|
| 672 |
|
| 673 |
if match_count >= min_matches:
|
| 674 |
last_price_list_table = table # Keep the last table that meets criteria
|
|
|
|
| 675 |
|
| 676 |
-
return last_price_list_table
|
| 677 |
|
| 678 |
|
| 679 |
def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
|
|
@@ -1295,14 +1289,13 @@ def extract_po(docx_path):
|
|
| 1295 |
# Find and rename the price list table before contract summary processing
|
| 1296 |
print("Identifying Price List table...")
|
| 1297 |
extracted_data_dict = json.loads(extracted_data)
|
| 1298 |
-
price_list_table = find_price_list_table(extracted_data_dict)
|
| 1299 |
|
| 1300 |
# Add the combined price list table to the extracted data
|
| 1301 |
if price_list_table:
|
| 1302 |
-
# Remove
|
| 1303 |
-
|
| 1304 |
-
|
| 1305 |
-
del extracted_data_dict[key]
|
| 1306 |
|
| 1307 |
# Add the combined price list table
|
| 1308 |
extracted_data_dict["price_list"] = price_list_table
|
|
@@ -1314,7 +1307,7 @@ def extract_po(docx_path):
|
|
| 1314 |
extracted_data_dict["price_list"] = []
|
| 1315 |
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1316 |
|
| 1317 |
-
print(f"✅ Extracted Data: {extracted_data}")
|
| 1318 |
|
| 1319 |
# Create a copy of the data with only first row of price list for contract summary
|
| 1320 |
contract_summary_dict = json.loads(extracted_data)
|
|
@@ -1322,6 +1315,8 @@ def extract_po(docx_path):
|
|
| 1322 |
contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
|
| 1323 |
contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
|
| 1324 |
|
|
|
|
|
|
|
| 1325 |
# Step 3: Process JSON with OpenAI to get structured output
|
| 1326 |
print("Processing Contract Summary data with AI...")
|
| 1327 |
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
|
@@ -1346,7 +1341,7 @@ def extract_po(docx_path):
|
|
| 1346 |
# Ensure BytesIO is properly closed
|
| 1347 |
if 'docx_bytes' in locals():
|
| 1348 |
docx_bytes.close()
|
| 1349 |
-
|
| 1350 |
# Example Usage
|
| 1351 |
|
| 1352 |
# print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
|
|
|
|
| 139 |
|
| 140 |
# --- Table Processing Functions ---
|
| 141 |
|
| 142 |
+
def process_unknown_table(rows):
|
| 143 |
+
"""Processes unknown tables and returns the extracted lines as a list."""
|
| 144 |
+
unknown_table_data = []
|
| 145 |
|
| 146 |
for row in rows:
|
| 147 |
cells = row.findall('.//w:tc', NS)
|
|
|
|
| 149 |
cell_lines = extract_text_from_cell(cells[0]) # Extract all lines from the cell
|
| 150 |
|
| 151 |
# Append each line directly to the list without splitting
|
| 152 |
+
unknown_table_data.extend(cell_lines)
|
| 153 |
|
| 154 |
+
return unknown_table_data # Return the list of extracted lines
|
| 155 |
|
| 156 |
def process_buyer_seller_table(rows):
|
| 157 |
"""Processes a two-column buyer-seller table into a structured dictionary using the first row as keys."""
|
|
|
|
| 365 |
# If we couldn't find a serial number column, keep the row
|
| 366 |
filtered_table_data.append(row)
|
| 367 |
|
|
|
|
|
|
|
| 368 |
# Remove duplicate columns (ending with _2, _3, etc.)
|
| 369 |
filtered_table_data = merge_duplicate_columns(filtered_table_data)
|
| 370 |
|
|
|
|
| 372 |
|
| 373 |
def identify_table_type_and_header_row(rows):
    """Identify table type and the index of the header row.

    The first row containing more than one ``w:tc`` cell is treated as the
    header row, and the table is classified from that row alone:

    * ``"long_table"``   - the header row has more than two cells.
    * ``"summary"``      - the header row has two cells, but at least one
      row in the table does not have exactly two cells.
    * ``"buyer_seller"`` - every row has exactly two cells and the header
      row text mentions a buyer/seller keyword (case-insensitive).
    * ``"unknown"``      - every row has exactly two cells but no keyword
      matched, or no row has more than one cell (including empty input).

    Args:
        rows: Table row elements; each must support
            ``findall('.//w:tc', NS)``.
            NOTE(review): presumably ``w:tr`` elements of a .docx table -
            confirm against the caller.

    Returns:
        A ``(table_type, header_row_index)`` tuple. The index is 0 when no
        row has more than one cell.
    """
    buyer_seller_keywords = ["买方", "buyer", "卖方", "seller"]

    for i, row in enumerate(rows):
        cells = row.findall('.//w:tc', NS)
        num_cells = len(cells)

        # Rows with at most one cell cannot be a header; keep scanning.
        if num_cells <= 1:
            continue

        # For tables with more than 2 columns, process as long table
        if num_cells != 2:
            return "long_table", i

        # Two-cell header, but the table is not uniformly two columns wide.
        if not all(len(r.findall('.//w:tc', NS)) == 2 for r in rows):
            return "summary", i

        # Uniform two-column table: decide by keywords in the header row.
        # Lowercase the joined text once instead of once per keyword.
        header_text = " ".join(
            " ".join(extract_text_from_cell(cell)) for cell in cells
        ).lower()
        if any(keyword.lower() in header_text for keyword in buyer_seller_keywords):
            return "buyer_seller", i
        return "unknown", i

    # Fallbacks
    return "unknown", 0
|
| 395 |
|
| 396 |
def extract_tables(root):
|
|
|
|
| 431 |
for paragraph in elem.findall('.//w:p', NS):
|
| 432 |
table_paragraphs.add(paragraph)
|
| 433 |
table_type, header_row_index = identify_table_type_and_header_row(rows)
|
| 434 |
+
if table_type == "unknown":
|
| 435 |
+
unknown_table_data = process_unknown_table(rows)
|
| 436 |
+
if unknown_table_data:
|
| 437 |
+
table_data[f"table_{table_index}_unknown"] = unknown_table_data
|
| 438 |
table_index += 1
|
| 439 |
continue
|
| 440 |
elif table_type == "buyer_seller":
|
|
|
|
| 455 |
table_data[f"long_table_{table_index}"] = long_table_data
|
| 456 |
table_index += 1
|
| 457 |
continue
|
| 458 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
return table_data, table_paragraphs
|
| 460 |
|
| 461 |
# --- Non-Table Processing Functions ---
|
|
|
|
| 587 |
# Clean 合同编号 by removing all contents in brackets including the brackets themselves
|
| 588 |
if "合同编号" in contract_json and contract_json["合同编号"]:
|
| 589 |
contract_json["合同编号"] = re.sub(r'[\((].*?[\))]', '', contract_json["合同编号"]).strip()
|
| 590 |
+
# Remove anything after "/" (including the "/" itself)
|
| 591 |
+
contract_json["合同编号"] = re.sub(r'/\s*.*$', '', contract_json["合同编号"]).strip()
|
| 592 |
|
| 593 |
validated_data = ContractSummary.model_validate(contract_json)
|
| 594 |
|
|
|
|
| 626 |
print("⚠️ All attempts failed, returning empty model")
|
| 627 |
empty_data = ContractSummary().model_dump(by_alias=True)
|
| 628 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
|
| 630 |
|
| 631 |
if save_json:
|
|
|
|
| 643 |
"货描", "commodity",
|
| 644 |
]
|
| 645 |
last_price_list_table = None
|
| 646 |
+
last_price_list_key = None
|
| 647 |
|
| 648 |
# Get all long tables and sort them by key to ensure we process them in order
|
| 649 |
long_tables = [(key, table) for key, table in extracted_data.items()
|
|
|
|
| 665 |
|
| 666 |
if match_count >= min_matches:
|
| 667 |
last_price_list_table = table # Keep the last table that meets criteria
|
| 668 |
+
last_price_list_key = key # Keep the key as well
|
| 669 |
|
| 670 |
+
return last_price_list_table, last_price_list_key
|
| 671 |
|
| 672 |
|
| 673 |
def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
|
|
|
|
| 1289 |
# Find and rename the price list table before contract summary processing
|
| 1290 |
print("Identifying Price List table...")
|
| 1291 |
extracted_data_dict = json.loads(extracted_data)
|
| 1292 |
+
price_list_table, price_list_key = find_price_list_table(extracted_data_dict)
|
| 1293 |
|
| 1294 |
# Add the combined price list table to the extracted data
|
| 1295 |
if price_list_table:
|
| 1296 |
+
# Remove only the specific long_table that was used to create the price list
|
| 1297 |
+
if price_list_key:
|
| 1298 |
+
del extracted_data_dict[price_list_key]
|
|
|
|
| 1299 |
|
| 1300 |
# Add the combined price list table
|
| 1301 |
extracted_data_dict["price_list"] = price_list_table
|
|
|
|
| 1307 |
extracted_data_dict["price_list"] = []
|
| 1308 |
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1309 |
|
| 1310 |
+
# print(f"✅ Extracted Data: {extracted_data}")
|
| 1311 |
|
| 1312 |
# Create a copy of the data with only first row of price list for contract summary
|
| 1313 |
contract_summary_dict = json.loads(extracted_data)
|
|
|
|
| 1315 |
contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
|
| 1316 |
contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
|
| 1317 |
|
| 1318 |
+
print(f"✅ Contract Summary Data: {contract_summary_data}")
|
| 1319 |
+
|
| 1320 |
# Step 3: Process JSON with OpenAI to get structured output
|
| 1321 |
print("Processing Contract Summary data with AI...")
|
| 1322 |
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
|
|
|
| 1341 |
# Ensure BytesIO is properly closed
|
| 1342 |
if 'docx_bytes' in locals():
|
| 1343 |
docx_bytes.close()
|
| 1344 |
+
|
| 1345 |
# Example Usage
|
| 1346 |
|
| 1347 |
# print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
|