Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,7 +22,7 @@ from fuzzywuzzy import fuzz
|
|
| 22 |
from fuzzywuzzy import process
|
| 23 |
|
| 24 |
|
| 25 |
-
HF_API_KEY = os.getenv("
|
| 26 |
|
| 27 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
| 28 |
# base_url = "https://router.huggingface.co/novita"
|
|
@@ -33,8 +33,8 @@ HF_API_KEY = os.getenv("HF_API_KEY")
|
|
| 33 |
# model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
|
| 34 |
|
| 35 |
# Qwen 2.5 7B --------------------------------------------------------
|
| 36 |
-
base_url =
|
| 37 |
-
model=
|
| 38 |
|
| 39 |
# Qwen 2.5 32B --------------------------------------------------------
|
| 40 |
# base_url = "https://router.huggingface.co/novita/v3/openai"
|
|
@@ -394,53 +394,74 @@ def identify_table_type_and_header_row(rows):
|
|
| 394 |
return "unknown", 0
|
| 395 |
|
| 396 |
def extract_tables(root):
|
| 397 |
-
"""Extracts tables from the DOCX document and returns structured data.
|
| 398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
table_data = {}
|
| 400 |
table_paragraphs = set()
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
if
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
if
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 444 |
return table_data, table_paragraphs
|
| 445 |
|
| 446 |
# --- Non-Table Processing Functions ---
|
|
@@ -568,6 +589,11 @@ Contract data in JSON format:""" + f"""
|
|
| 568 |
|
| 569 |
# Clean up JSON before validation
|
| 570 |
contract_json = json.loads(contract_summary.strip())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
validated_data = ContractSummary.model_validate(contract_json)
|
| 572 |
|
| 573 |
# Success! Return validated data
|
|
@@ -603,6 +629,11 @@ Contract data in JSON format:""" + f"""
|
|
| 603 |
# If we get here, all attempts failed - return empty but valid model
|
| 604 |
print("⚠️ All attempts failed, returning empty model")
|
| 605 |
empty_data = ContractSummary().model_dump(by_alias=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 606 |
empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
|
| 607 |
|
| 608 |
if save_json:
|
|
@@ -612,6 +643,39 @@ Contract data in JSON format:""" + f"""
|
|
| 612 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 613 |
|
| 614 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
|
| 616 |
"""
|
| 617 |
Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
|
|
@@ -697,7 +761,7 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json",
|
|
| 697 |
# 单位 mappings
|
| 698 |
"单位": ["单位 unit", "单位unit", "unit", "单位"],
|
| 699 |
# 单价 mappings
|
| 700 |
-
"单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price(cny)",
|
| 701 |
"单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)",
|
| 702 |
"价格 price", "价格price", "价格",
|
| 703 |
"美元单价"],
|
|
@@ -1092,40 +1156,6 @@ def json_to_excel(contract_summary, json_data, excel_path):
|
|
| 1092 |
contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
|
| 1093 |
long_table.to_excel(writer, sheet_name="Price List", index=False)
|
| 1094 |
|
| 1095 |
-
# Add this helper function near your other helpers
|
| 1096 |
-
def find_price_list_table(extracted_data, min_matches=3):
|
| 1097 |
-
price_keywords = [
|
| 1098 |
-
"名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
|
| 1099 |
-
"单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
|
| 1100 |
-
"几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
|
| 1101 |
-
"货描", "commodity",
|
| 1102 |
-
]
|
| 1103 |
-
last_price_list_table = None
|
| 1104 |
-
|
| 1105 |
-
# Get all long tables and sort them by key to ensure we process them in order
|
| 1106 |
-
long_tables = [(key, table) for key, table in extracted_data.items()
|
| 1107 |
-
if "long_table" in key and isinstance(table, list) and table]
|
| 1108 |
-
long_tables.sort(key=lambda x: x[0]) # Sort by key to maintain order
|
| 1109 |
-
|
| 1110 |
-
for key, table in long_tables:
|
| 1111 |
-
|
| 1112 |
-
headers = list(table[0].keys())
|
| 1113 |
-
|
| 1114 |
-
match_count = 0
|
| 1115 |
-
for header in headers:
|
| 1116 |
-
header_lower = header.lower()
|
| 1117 |
-
# Use fuzzy matching for keyword detection
|
| 1118 |
-
for keyword in price_keywords:
|
| 1119 |
-
if fuzz.partial_ratio(header_lower, keyword.lower()) >= 70:
|
| 1120 |
-
match_count += 1
|
| 1121 |
-
break # Found a match for this header, move to next
|
| 1122 |
-
|
| 1123 |
-
if match_count >= min_matches:
|
| 1124 |
-
last_price_list_table = table # Keep the last table that meets criteria
|
| 1125 |
-
|
| 1126 |
-
return last_price_list_table
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
#--- Handle Edge Cases ------------------------------
|
| 1130 |
|
| 1131 |
def handle_weight_conversion_edge_case(transformed_data):
|
|
@@ -1320,7 +1350,7 @@ def extract_po(docx_path):
|
|
| 1320 |
# Example Usage
|
| 1321 |
|
| 1322 |
# print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
|
| 1323 |
-
# print(extract_po(r"UAT Contracts
|
| 1324 |
|
| 1325 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
| 1326 |
|
|
|
|
| 22 |
from fuzzywuzzy import process
|
| 23 |
|
| 24 |
|
| 25 |
+
HF_API_KEY = os.getenv("API_KEY")
|
| 26 |
|
| 27 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
| 28 |
# base_url = "https://router.huggingface.co/novita"
|
|
|
|
| 33 |
# model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
|
| 34 |
|
| 35 |
# Qwen 2.5 7B --------------------------------------------------------
|
| 36 |
+
base_url = os.getenv("LLM_URL")
|
| 37 |
+
model= os.getenv("MODEL_NAME")
|
| 38 |
|
| 39 |
# Qwen 2.5 32B --------------------------------------------------------
|
| 40 |
# base_url = "https://router.huggingface.co/novita/v3/openai"
|
|
|
|
| 394 |
return "unknown", 0
|
| 395 |
|
| 396 |
def extract_tables(root):
    """Collect structured data for every table in the document body.

    A table is ignored when the nearest non-empty paragraph among the (up to)
    three paragraphs seen before it contains the word 'template'
    (case-insensitive); blank paragraphs between that title and the table are
    looked through. Tables without rows are ignored as well.

    Args:
        root: Parsed document element; it is searched for ``w:body`` and is
            itself used as the body when none is found.
            (presumably an ElementTree element of the DOCX XML; confirm
            against the caller)

    Returns:
        A ``(table_data, table_paragraphs)`` tuple. ``table_data`` maps a
        generated key (``table_<n>_single_column``, ``table_<n>_buyer_seller``,
        ``table_<n>_summary`` or ``long_table_<n>``) to whatever the matching
        ``process_*`` helper returned. ``table_paragraphs`` is the set of all
        ``w:p`` elements found inside processed tables.
    """
    word_ns = NS["w"]
    paragraph_tag = f'{{{word_ns}}}p'
    table_tag = f'{{{word_ns}}}tbl'

    # Fall back to the root itself when the document has no w:body element.
    body = root.find('.//w:body', NS)
    if body is None:
        body = root

    table_data = {}
    table_paragraphs = set()
    table_index = 1
    recent_paragraphs = []  # rolling window of the last three body paragraphs

    # Only direct children of the body are visited.
    for child in body:
        if child.tag == paragraph_tag:
            recent_paragraphs = (recent_paragraphs + [child])[-3:]
            continue
        if child.tag != table_tag:
            continue

        # Title = text of the closest preceding paragraph that is not blank.
        paragraph_texts = (
            ' '.join(node.text for node in para.findall('.//w:t', NS) if node.text).strip()
            for para in reversed(recent_paragraphs)
        )
        title = next((text for text in paragraph_texts if text), "")
        if 'template' in title.lower():
            continue

        rows = child.findall('.//w:tr', NS)
        if not rows:
            continue  # nothing to extract from a table without rows

        # Remember the table's paragraphs for the caller.
        table_paragraphs.update(child.findall('.//w:p', NS))

        table_type, header_row_index = identify_table_type_and_header_row(rows)
        if table_type == "single_column":
            # Single-column tables are processed from the very first row.
            extracted = process_single_column_table(rows)
            key = f"table_{table_index}_single_column"
        elif table_type == "buyer_seller":
            extracted = process_buyer_seller_table(rows[header_row_index:])
            key = f"table_{table_index}_buyer_seller"
        elif table_type == "summary":
            extracted = process_summary_table(rows[header_row_index:])
            key = f"table_{table_index}_summary"
        else:
            # "long_table" and any unrecognised type share the long-table path.
            extracted = process_long_table(rows[header_row_index:])
            key = f"long_table_{table_index}"

        # The index only advances when a table actually produced data.
        if extracted:
            table_data[key] = extracted
            table_index += 1

    return table_data, table_paragraphs
|
| 466 |
|
| 467 |
# --- Non-Table Processing Functions ---
|
|
|
|
| 589 |
|
| 590 |
# Clean up JSON before validation
|
| 591 |
contract_json = json.loads(contract_summary.strip())
|
| 592 |
+
|
| 593 |
+
# Clean 合同编号 by removing all contents in brackets including the brackets themselves
|
| 594 |
+
if "合同编号" in contract_json and contract_json["合同编号"]:
|
| 595 |
+
contract_json["合同编号"] = re.sub(r'[\((].*?[\))]', '', contract_json["合同编号"]).strip()
|
| 596 |
+
|
| 597 |
validated_data = ContractSummary.model_validate(contract_json)
|
| 598 |
|
| 599 |
# Success! Return validated data
|
|
|
|
| 629 |
# If we get here, all attempts failed - return empty but valid model
|
| 630 |
print("⚠️ All attempts failed, returning empty model")
|
| 631 |
empty_data = ContractSummary().model_dump(by_alias=True)
|
| 632 |
+
|
| 633 |
+
# Clean 合同编号 by removing all contents in brackets including the brackets themselves
|
| 634 |
+
if "合同编号" in empty_data and empty_data["合同编号"]:
|
| 635 |
+
empty_data["合同编号"] = re.sub(r'[\((].*?[\))]', '', empty_data["合同编号"]).strip()
|
| 636 |
+
|
| 637 |
empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
|
| 638 |
|
| 639 |
if save_json:
|
|
|
|
| 643 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 644 |
|
| 645 |
|
| 646 |
+
def find_price_list_table(extracted_data, min_matches=3):
    """Return the last long table whose headers look like a price list.

    Only entries whose key contains ``"long_table"`` and whose value is a
    non-empty list are considered. The headers are the keys of the first row
    of each table. A header counts as a match when its lower-cased text has a
    fuzzy ``partial_ratio`` of at least 70 against any price keyword.

    Tables are examined in the numeric order of the index embedded in their
    key (``long_table_2`` before ``long_table_10``). A plain string sort would
    put ``long_table_10`` first and could return the wrong "last" table once
    a document holds ten or more tables.

    Args:
        extracted_data: Mapping of table key to extracted table data.
        min_matches: Minimum number of matching headers required for a table
            to qualify as a price list.

    Returns:
        The qualifying table with the highest index, or ``None`` when no
        table qualifies.
    """
    import re  # local import keeps the block self-contained

    # All keywords are already lower-case, so they are compared as-is.
    price_keywords = [
        "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
        "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
        "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
        "货描", "commodity",
    ]

    def table_order(item):
        # Sort on the last number in the key; keys without a number go first,
        # with the key itself as a deterministic tie-breaker.
        number = re.search(r"(\d+)\D*$", item[0])
        return (int(number.group(1)) if number else -1, item[0])

    long_tables = sorted(
        (
            (key, table)
            for key, table in extracted_data.items()
            if "long_table" in key and isinstance(table, list) and table
        ),
        key=table_order,
    )

    last_price_list_table = None
    for _key, table in long_tables:
        # Each header contributes at most one match, however many keywords it resembles.
        match_count = sum(
            1
            for header in table[0]
            if any(
                fuzz.partial_ratio(header.lower(), keyword) >= 70
                for keyword in price_keywords
            )
        )
        if match_count >= min_matches:
            last_price_list_table = table  # later tables override earlier ones

    return last_price_list_table
|
| 677 |
+
|
| 678 |
+
|
| 679 |
def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
|
| 680 |
"""
|
| 681 |
Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
|
|
|
|
| 761 |
# 单位 mappings
|
| 762 |
"单位": ["单位 unit", "单位unit", "unit", "单位"],
|
| 763 |
# 单价 mappings
|
| 764 |
+
"单价": ["单价 unit price (cny)", "单价unit price (cny)", "单价(元)Unit Price (CNY)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price(cny)",
|
| 765 |
"单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)",
|
| 766 |
"价格 price", "价格price", "价格",
|
| 767 |
"美元单价"],
|
|
|
|
| 1156 |
contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
|
| 1157 |
long_table.to_excel(writer, sheet_name="Price List", index=False)
|
| 1158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1159 |
#--- Handle Edge Cases ------------------------------
|
| 1160 |
|
| 1161 |
def handle_weight_conversion_edge_case(transformed_data):
|
|
|
|
| 1350 |
# Example Usage
|
| 1351 |
|
| 1352 |
# print(extract_po(r"test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
|
| 1353 |
+
# print(extract_po(r"UAT Contracts\20250703\GN-WAPJS202405-297HG 1200R20轮胎采购合同-威海君乐-法务审批0515.docx"))
|
| 1354 |
|
| 1355 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
| 1356 |
|