MikeMai committed on
Commit
1f01c66
·
verified ·
1 Parent(s): 83f48f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -84
app.py CHANGED
@@ -22,7 +22,7 @@ from fuzzywuzzy import fuzz
22
  from fuzzywuzzy import process
23
 
24
 
25
- HF_API_KEY = os.getenv("HF_API_KEY")
26
 
27
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
28
  # base_url = "https://router.huggingface.co/novita"
@@ -33,8 +33,8 @@ HF_API_KEY = os.getenv("HF_API_KEY")
33
  # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
34
 
35
  # Qwen 2.5 7B --------------------------------------------------------
36
- base_url = "https://router.huggingface.co/together/v1"
37
- model="Qwen/Qwen2.5-7B-Instruct-Turbo"
38
 
39
  # Qwen 2.5 32B --------------------------------------------------------
40
  # base_url = "https://router.huggingface.co/novita/v3/openai"
@@ -394,53 +394,74 @@ def identify_table_type_and_header_row(rows):
394
  return "unknown", 0
395
 
396
  def extract_tables(root):
397
- """Extracts tables from the DOCX document and returns structured data."""
398
- tables = root.findall('.//w:tbl', NS)
 
 
 
 
 
399
  table_data = {}
400
  table_paragraphs = set()
401
-
402
- for table_index, table in enumerate(tables, start=1):
403
- rows = table.findall('.//w:tr', NS)
404
- if not rows:
405
- continue # Skip empty tables
406
-
407
- for paragraph in table.findall('.//w:p', NS):
408
- table_paragraphs.add(paragraph)
409
-
410
- table_type, header_row_index = identify_table_type_and_header_row(rows)
411
-
412
- if table_type == "single_column":
413
- single_column_data = process_single_column_table(rows)
414
- if single_column_data:
415
- table_data[f"table_{table_index}_single_column"] = single_column_data
416
- continue
417
- elif table_type == "buyer_seller":
418
- buyer_seller_data = process_buyer_seller_table(rows[header_row_index:])
419
- if buyer_seller_data:
420
- table_data[f"table_{table_index}_buyer_seller"] = buyer_seller_data
421
- continue
422
- elif table_type == "summary":
423
- summary_data = process_summary_table(rows[header_row_index:])
424
- if summary_data:
425
- table_data[f"table_{table_index}_summary"] = summary_data
426
- continue
427
- elif table_type == "long_table":
428
- long_table_data = process_long_table(rows[header_row_index:])
429
- if long_table_data:
430
- table_data[f"long_table_{table_index}"] = long_table_data
431
- continue
432
- else:
433
- # fallback: try to process as long table from first multi-column row
434
- long_table_data = process_long_table(rows[header_row_index:])
435
- if long_table_data:
436
- table_data[f"long_table_{table_index}"] = long_table_data
437
- continue
438
-
439
- # # Print the first row's cell texts for debugging
440
- # header_cells = rows[0].findall('.//w:tc', NS)
441
- # header_texts = ["|".join(extract_text_from_cell(cell)) for cell in header_cells]
442
- # print(f"Table {table_index} header: {header_texts}")
443
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
  return table_data, table_paragraphs
445
 
446
  # --- Non-Table Processing Functions ---
@@ -568,6 +589,11 @@ Contract data in JSON format:""" + f"""
568
 
569
  # Clean up JSON before validation
570
  contract_json = json.loads(contract_summary.strip())
 
 
 
 
 
571
  validated_data = ContractSummary.model_validate(contract_json)
572
 
573
  # Success! Return validated data
@@ -603,6 +629,11 @@ Contract data in JSON format:""" + f"""
603
  # If we get here, all attempts failed - return empty but valid model
604
  print("⚠️ All attempts failed, returning empty model")
605
  empty_data = ContractSummary().model_dump(by_alias=True)
 
 
 
 
 
606
  empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
607
 
608
  if save_json:
@@ -612,6 +643,39 @@ Contract data in JSON format:""" + f"""
612
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
613
 
614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
616
  """
617
  Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
@@ -697,7 +761,7 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json",
697
  # 单位 mappings
698
  "单位": ["单位 unit", "单位unit", "unit", "单位"],
699
  # 单价 mappings
700
- "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price(cny)",
701
  "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)",
702
  "价格 price", "价格price", "价格",
703
  "美元单价"],
@@ -1092,40 +1156,6 @@ def json_to_excel(contract_summary, json_data, excel_path):
1092
  contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
1093
  long_table.to_excel(writer, sheet_name="Price List", index=False)
1094
 
1095
- # Add this helper function near your other helpers
1096
def find_price_list_table(extracted_data, min_matches=3):
    """Return the last long table in ``extracted_data`` that looks like a price list.

    Only entries whose key contains ``"long_table"`` and whose value is a
    non-empty list are considered.  The keys of the first row of each table
    are treated as its headers; a header counts as a match when its fuzzy
    ``partial_ratio`` against any price keyword is at least 70.

    Args:
        extracted_data: Mapping of table keys to extracted table data.
        min_matches: Minimum number of matching headers a table needs in
            order to qualify as a price list.

    Returns:
        The qualifying table with the highest table number, or ``None`` when
        no table qualifies.
    """
    # Keywords are stored in lowercase; headers are lowercased before matching.
    price_keywords = [
        "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
        "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
        "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
        "货描", "commodity",
    ]

    def _table_order(item):
        # A plain string sort would put "long_table_10" before "long_table_2",
        # so order by the numeric suffix.  Keys without a numeric suffix are
        # placed after the numbered ones, ordered alphabetically.
        key = item[0]
        suffix = key.rsplit("_", 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), key)
        return (1, 0, key)

    long_tables = sorted(
        (
            (key, table)
            for key, table in extracted_data.items()
            if "long_table" in key and isinstance(table, list) and table
        ),
        key=_table_order,
    )

    last_price_list_table = None
    for _key, table in long_tables:
        # Count each header at most once, however many keywords it resembles.
        match_count = sum(
            1
            for header in table[0].keys()
            if any(
                fuzz.partial_ratio(header.lower(), keyword.lower()) >= 70
                for keyword in price_keywords
            )
        )
        if match_count >= min_matches:
            last_price_list_table = table  # Keep the last table that meets criteria

    return last_price_list_table
1127
-
1128
-
1129
  #--- Handle Edge Cases ------------------------------
1130
 
1131
  def handle_weight_conversion_edge_case(transformed_data):
@@ -1320,7 +1350,7 @@ def extract_po(docx_path):
1320
  # Example Usage
1321
 
1322
  # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
1323
- # print(extract_po(r"UAT Contracts\修改后合同\GN-CGS202410-AMC-169BJ 柳工设备配件采购合同-广西柳工.docx"))
1324
 
1325
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1326
 
 
22
  from fuzzywuzzy import process
23
 
24
 
25
+ HF_API_KEY = os.getenv("API_KEY")
26
 
27
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
28
  # base_url = "https://router.huggingface.co/novita"
 
33
  # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
34
 
35
  # Qwen 2.5 7B --------------------------------------------------------
36
+ base_url = os.getenv("LLM_URL")
37
+ model= os.getenv("MODEL_NAME")
38
 
39
  # Qwen 2.5 32B --------------------------------------------------------
40
  # base_url = "https://router.huggingface.co/novita/v3/openai"
 
394
  return "unknown", 0
395
 
396
  def extract_tables(root):
397
+ """Extracts tables from the DOCX document and returns structured data, skipping tables whose title contains 'template'.
398
+ Handles cases where there are blank paragraphs between the title and the table."""
399
+ # Find the document body (usually the first child of root)
400
+ body = root.find('.//w:body', NS)
401
+ if body is None:
402
+ body = root # fallback if structure is different
403
+
404
  table_data = {}
405
  table_paragraphs = set()
406
+ table_index = 1
407
+ last_paragraphs = [] # Store the last few paragraphs (max 3)
408
+
409
+ # Iterate through direct children of the body
410
+ for elem in list(body):
411
+ if elem.tag == f'{{{NS["w"]}}}p':
412
+ # Keep a rolling list of the last 3 paragraphs
413
+ last_paragraphs.append(elem)
414
+ if len(last_paragraphs) > 3:
415
+ last_paragraphs.pop(0)
416
+ elif elem.tag == f'{{{NS["w"]}}}tbl':
417
+ # Look back through last_paragraphs for the most recent non-empty one
418
+ title = ""
419
+ for para in reversed(last_paragraphs):
420
+ texts = [t.text for t in para.findall('.//w:t', NS) if t.text]
421
+ candidate = ' '.join(texts).strip()
422
+ if candidate:
423
+ title = candidate
424
+ break
425
+ # If title contains 'template', skip this table
426
+ if title and 'template' in title.lower():
427
+ continue
428
+ rows = elem.findall('.//w:tr', NS)
429
+ if not rows:
430
+ continue # Skip empty tables
431
+ for paragraph in elem.findall('.//w:p', NS):
432
+ table_paragraphs.add(paragraph)
433
+ table_type, header_row_index = identify_table_type_and_header_row(rows)
434
+ if table_type == "single_column":
435
+ single_column_data = process_single_column_table(rows)
436
+ if single_column_data:
437
+ table_data[f"table_{table_index}_single_column"] = single_column_data
438
+ table_index += 1
439
+ continue
440
+ elif table_type == "buyer_seller":
441
+ buyer_seller_data = process_buyer_seller_table(rows[header_row_index:])
442
+ if buyer_seller_data:
443
+ table_data[f"table_{table_index}_buyer_seller"] = buyer_seller_data
444
+ table_index += 1
445
+ continue
446
+ elif table_type == "summary":
447
+ summary_data = process_summary_table(rows[header_row_index:])
448
+ if summary_data:
449
+ table_data[f"table_{table_index}_summary"] = summary_data
450
+ table_index += 1
451
+ continue
452
+ elif table_type == "long_table":
453
+ long_table_data = process_long_table(rows[header_row_index:])
454
+ if long_table_data:
455
+ table_data[f"long_table_{table_index}"] = long_table_data
456
+ table_index += 1
457
+ continue
458
+ else:
459
+ # fallback: try to process as long table from first multi-column row
460
+ long_table_data = process_long_table(rows[header_row_index:])
461
+ if long_table_data:
462
+ table_data[f"long_table_{table_index}"] = long_table_data
463
+ table_index += 1
464
+ continue
465
  return table_data, table_paragraphs
466
 
467
  # --- Non-Table Processing Functions ---
 
589
 
590
  # Clean up JSON before validation
591
  contract_json = json.loads(contract_summary.strip())
592
+
593
+ # Clean 合同编号 by removing all contents in brackets including the brackets themselves
594
+ if "合同编号" in contract_json and contract_json["合同编号"]:
595
+ contract_json["合同编号"] = re.sub(r'[\((].*?[\))]', '', contract_json["合同编号"]).strip()
596
+
597
  validated_data = ContractSummary.model_validate(contract_json)
598
 
599
  # Success! Return validated data
 
629
  # If we get here, all attempts failed - return empty but valid model
630
  print("⚠️ All attempts failed, returning empty model")
631
  empty_data = ContractSummary().model_dump(by_alias=True)
632
+
633
+ # Clean 合同编号 by removing all contents in brackets including the brackets themselves
634
+ if "合同编号" in empty_data and empty_data["合同编号"]:
635
+ empty_data["合同编号"] = re.sub(r'[\((].*?[\))]', '', empty_data["合同编号"]).strip()
636
+
637
  empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
638
 
639
  if save_json:
 
643
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
644
 
645
 
646
def find_price_list_table(extracted_data, min_matches=3):
    """Return the last long table in ``extracted_data`` that looks like a price list.

    Only entries whose key contains ``"long_table"`` and whose value is a
    non-empty list are considered.  The keys of the first row of each table
    are treated as its headers; a header counts as a match when its fuzzy
    ``partial_ratio`` against any price keyword is at least 70.

    Args:
        extracted_data: Mapping of table keys to extracted table data.
        min_matches: Minimum number of matching headers a table needs in
            order to qualify as a price list.

    Returns:
        The qualifying table with the highest table number, or ``None`` when
        no table qualifies.
    """
    # Keywords are stored in lowercase; headers are lowercased before matching.
    price_keywords = [
        "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
        "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
        "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
        "货描", "commodity",
    ]

    def _table_order(item):
        # A plain string sort would put "long_table_10" before "long_table_2",
        # so order by the numeric suffix.  Keys without a numeric suffix are
        # placed after the numbered ones, ordered alphabetically.
        key = item[0]
        suffix = key.rsplit("_", 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), key)
        return (1, 0, key)

    long_tables = sorted(
        (
            (key, table)
            for key, table in extracted_data.items()
            if "long_table" in key and isinstance(table, list) and table
        ),
        key=_table_order,
    )

    last_price_list_table = None
    for _key, table in long_tables:
        # Count each header at most once, however many keywords it resembles.
        match_count = sum(
            1
            for header in table[0].keys()
            if any(
                fuzz.partial_ratio(header.lower(), keyword.lower()) >= 70
                for keyword in price_keywords
            )
        )
        if match_count >= min_matches:
            last_price_list_table = table  # Keep the last table that meets criteria

    return last_price_list_table
677
+
678
+
679
  def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
680
  """
681
  Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
 
761
  # 单位 mappings
762
  "单位": ["单位 unit", "单位unit", "unit", "单位"],
763
  # 单价 mappings
764
+ "单价": ["单价 unit price (cny)", "单价unit price (cny)", "单价(元)Unit Price (CNY)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price(cny)",
765
  "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)", "单价(欧元) unit price(eur)",
766
  "价格 price", "价格price", "价格",
767
  "美元单价"],
 
1156
  contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
1157
  long_table.to_excel(writer, sheet_name="Price List", index=False)
1158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1159
  #--- Handle Edge Cases ------------------------------
1160
 
1161
  def handle_weight_conversion_edge_case(transformed_data):
 
1350
  # Example Usage
1351
 
1352
  # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
1353
+ # print(extract_po(r"UAT Contracts\20250703\GN-WAPJS202405-297HG 1200R20轮胎采购合同-威海君乐-法务审批0515.docx"))
1354
 
1355
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1356