MikeMai committed on
Commit
7556a1e
·
verified ·
1 Parent(s): fb7be42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -13
app.py CHANGED
@@ -1133,26 +1133,43 @@ def extract_po(docx_path):
1133
  print("Extracting XML data to JSON...")
1134
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1135
  extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1136
  print(f"✅ Extracted Data: {extracted_data}")
1137
 
 
 
 
 
 
 
1138
  # Step 3: Process JSON with OpenAI to get structured output
1139
  print("Processing Contract Summary data with AI...")
1140
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1141
- contract_summary = deepseek_extract_contract_summary(extracted_data, save_json=False, json_filename=contract_summary_filename)
1142
 
1143
- # Find the last long table (excluding summary tables)
1144
  print("Processing Price List data with AI...")
1145
- extracted_data_dict = json.loads(extracted_data)
1146
- price_list_table = find_price_list_table(extracted_data_dict)
1147
- if not price_list_table:
1148
- print("⚠️ No suitable price list table found!")
1149
- price_list_table = []
1150
-
1151
- # Generate the price list filename in the same folder as the document
1152
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1153
-
1154
- # Process the price list and save it to a JSON file
1155
- price_list = extract_price_list(price_list_table, save_json=True, json_name=price_list_filename)
1156
 
1157
  # Step 4: Combine contract summary and long table data into a single JSON object
1158
  print("Combining AI Generated JSON with Extracted Data...")
@@ -1180,7 +1197,7 @@ def extract_po(docx_path):
1180
  # Example Usage
1181
 
1182
  # extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
1183
- # extract_po("EPC简明合同格式-中英对照版.docx")
1184
 
1185
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1186
 
 
1133
  print("Extracting XML data to JSON...")
1134
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1135
  extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
1136
+
1137
+ # Find and rename the price list table before contract summary processing
1138
+ print("Identifying Price List table...")
1139
+ extracted_data_dict = json.loads(extracted_data)
1140
+ price_list_table = find_price_list_table(extracted_data_dict)
1141
+
1142
+ # Rename the price list table key
1143
+ if price_list_table:
1144
+ # Find and rename the key containing the price list table
1145
+ for key in list(extracted_data_dict.keys()):
1146
+ if "long_table" in key and extracted_data_dict[key] == price_list_table:
1147
+ extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
1148
+ break
1149
+ # Update the extracted_data string with proper formatting
1150
+ extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1151
+ else:
1152
+ print("⚠️ No suitable price list table found!")
1153
+ extracted_data_dict["price_list"] = []
1154
+ extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1155
+
1156
  print(f"✅ Extracted Data: {extracted_data}")
1157
 
1158
+ # Create a copy of the data with only first row of price list for contract summary
1159
+ contract_summary_dict = json.loads(extracted_data)
1160
+ if contract_summary_dict.get("price_list"):
1161
+ contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
1162
+ contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
1163
+
1164
  # Step 3: Process JSON with OpenAI to get structured output
1165
  print("Processing Contract Summary data with AI...")
1166
  contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1167
+ contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1168
 
1169
+ # Process the price list
1170
  print("Processing Price List data with AI...")
 
 
 
 
 
 
 
1171
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1172
+ price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
 
 
1173
 
1174
  # Step 4: Combine contract summary and long table data into a single JSON object
1175
  print("Combining AI Generated JSON with Extracted Data...")
 
1197
  # Example Usage
1198
 
1199
  # extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
1200
+ #extract_po("UAT Contracts\GN-WCIE2025-WCSP-276BJ-稳定土拌合机配件-合同.docx")
1201
 
1202
  # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
1203