Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1133,26 +1133,43 @@ def extract_po(docx_path):
|
|
| 1133 |
print("Extracting XML data to JSON...")
|
| 1134 |
json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
|
| 1135 |
extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1136 |
print(f"✅ Extracted Data: {extracted_data}")
|
| 1137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1138 |
# Step 3: Process JSON with OpenAI to get structured output
|
| 1139 |
print("Processing Contract Summary data with AI...")
|
| 1140 |
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
| 1141 |
-
contract_summary = deepseek_extract_contract_summary(
|
| 1142 |
|
| 1143 |
-
#
|
| 1144 |
print("Processing Price List data with AI...")
|
| 1145 |
-
extracted_data_dict = json.loads(extracted_data)
|
| 1146 |
-
price_list_table = find_price_list_table(extracted_data_dict)
|
| 1147 |
-
if not price_list_table:
|
| 1148 |
-
print("⚠️ No suitable price list table found!")
|
| 1149 |
-
price_list_table = []
|
| 1150 |
-
|
| 1151 |
-
# Generate the price list filename in the same folder as the document
|
| 1152 |
price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
|
| 1153 |
-
|
| 1154 |
-
# Process the price list and save it to a JSON file
|
| 1155 |
-
price_list = extract_price_list(price_list_table, save_json=True, json_name=price_list_filename)
|
| 1156 |
|
| 1157 |
# Step 4: Combine contract summary and long table data into a single JSON object
|
| 1158 |
print("Combining AI Generated JSON with Extracted Data...")
|
|
@@ -1180,7 +1197,7 @@ def extract_po(docx_path):
|
|
| 1180 |
# Example Usage
|
| 1181 |
|
| 1182 |
# extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
|
| 1183 |
-
#
|
| 1184 |
|
| 1185 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
| 1186 |
|
|
|
|
| 1133 |
print("Extracting XML data to JSON...")
|
| 1134 |
json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
|
| 1135 |
extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
|
| 1136 |
+
|
| 1137 |
+
# Find and rename the price list table before contract summary processing
|
| 1138 |
+
print("Identifying Price List table...")
|
| 1139 |
+
extracted_data_dict = json.loads(extracted_data)
|
| 1140 |
+
price_list_table = find_price_list_table(extracted_data_dict)
|
| 1141 |
+
|
| 1142 |
+
# Rename the price list table key
|
| 1143 |
+
if price_list_table:
|
| 1144 |
+
# Find and rename the key containing the price list table
|
| 1145 |
+
for key in list(extracted_data_dict.keys()):
|
| 1146 |
+
if "long_table" in key and extracted_data_dict[key] == price_list_table:
|
| 1147 |
+
extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
|
| 1148 |
+
break
|
| 1149 |
+
# Update the extracted_data string with proper formatting
|
| 1150 |
+
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1151 |
+
else:
|
| 1152 |
+
print("⚠️ No suitable price list table found!")
|
| 1153 |
+
extracted_data_dict["price_list"] = []
|
| 1154 |
+
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1155 |
+
|
| 1156 |
print(f"✅ Extracted Data: {extracted_data}")
|
| 1157 |
|
| 1158 |
+
# Create a copy of the data with only first row of price list for contract summary
|
| 1159 |
+
contract_summary_dict = json.loads(extracted_data)
|
| 1160 |
+
if contract_summary_dict.get("price_list"):
|
| 1161 |
+
contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
|
| 1162 |
+
contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
|
| 1163 |
+
|
| 1164 |
# Step 3: Process JSON with OpenAI to get structured output
|
| 1165 |
print("Processing Contract Summary data with AI...")
|
| 1166 |
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
| 1167 |
+
contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
|
| 1168 |
|
| 1169 |
+
# Process the price list
|
| 1170 |
print("Processing Price List data with AI...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1171 |
price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
|
| 1172 |
+
price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
|
|
|
|
|
|
|
| 1173 |
|
| 1174 |
# Step 4: Combine contract summary and long table data into a single JSON object
|
| 1175 |
print("Combining AI Generated JSON with Extracted Data...")
|
|
|
|
| 1197 |
# Example Usage
|
| 1198 |
|
| 1199 |
# extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx")
|
| 1200 |
+
#extract_po("UAT Contracts\GN-WCIE2025-WCSP-276BJ-稳定土拌合机配件-合同.docx")
|
| 1201 |
|
| 1202 |
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
| 1203 |
|