Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,8 +14,6 @@ from openai import OpenAI
|
|
| 14 |
|
| 15 |
import re
|
| 16 |
|
| 17 |
-
import logging
|
| 18 |
-
|
| 19 |
from pydantic import BaseModel, Field, ValidationError, RootModel
|
| 20 |
from typing import List, Optional
|
| 21 |
|
|
@@ -42,14 +40,6 @@ model="Qwen/Qwen2.5-7B-Instruct-Turbo"
|
|
| 42 |
# base_url = "https://router.huggingface.co/sambanova/v1"
|
| 43 |
# model="Qwen3-32B"
|
| 44 |
|
| 45 |
-
# Configure logging to write to 'zaoju_logs.log' without using pickle
|
| 46 |
-
logging.basicConfig(
|
| 47 |
-
filename='extract_po_logs.log',
|
| 48 |
-
level=logging.INFO,
|
| 49 |
-
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 50 |
-
encoding='utf-8'
|
| 51 |
-
)
|
| 52 |
-
|
| 53 |
# Default Word XML namespace
|
| 54 |
DEFAULT_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
|
| 55 |
NS = None # Global variable to store the namespace
|
|
@@ -548,7 +538,6 @@ Contract data in JSON format:""" + f"""
|
|
| 548 |
think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
|
| 549 |
if think_text:
|
| 550 |
print(f"🧠 Thought Process: {think_text}")
|
| 551 |
-
logging.info(f"Think text: {think_text}")
|
| 552 |
|
| 553 |
contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
|
| 554 |
contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
|
|
@@ -569,14 +558,10 @@ Contract data in JSON format:""" + f"""
|
|
| 569 |
|
| 570 |
except ValidationError as e:
|
| 571 |
error_msg = f"Validation error: {e}"
|
| 572 |
-
logging.error(f"{error_msg}")
|
| 573 |
-
logging.error(f"Input data: {contract_summary}")
|
| 574 |
print(f"❌ {error_msg}")
|
| 575 |
|
| 576 |
except json.JSONDecodeError as e:
|
| 577 |
error_msg = f"JSON decode error: {e}"
|
| 578 |
-
logging.error(f"{error_msg}")
|
| 579 |
-
logging.error(f"Input data: {contract_summary}")
|
| 580 |
print(f"❌ {error_msg}")
|
| 581 |
|
| 582 |
# Don't retry on the last attempt
|
|
@@ -867,7 +852,6 @@ Do not force map 名称(英文) to 单价
|
|
| 867 |
think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
|
| 868 |
if think_text:
|
| 869 |
print(f"🧠 Thought Process: {think_text}")
|
| 870 |
-
logging.info(f"Think text: {think_text}")
|
| 871 |
|
| 872 |
raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
|
| 873 |
# Remove any backticks or json tags
|
|
@@ -903,7 +887,6 @@ Do not force map 名称(英文) to 单价
|
|
| 903 |
|
| 904 |
except Exception as e:
|
| 905 |
error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
|
| 906 |
-
logging.error(f"{error_msg}")
|
| 907 |
print(f"❌ {error_msg}")
|
| 908 |
|
| 909 |
if attempt < max_retries - 1:
|
|
@@ -1120,79 +1103,72 @@ def extract_po(docx_path):
|
|
| 1120 |
with open(docx_path, "rb") as f:
|
| 1121 |
docx_bytes = BytesIO(f.read())
|
| 1122 |
|
| 1123 |
-
# Step 1: Extract XML content from DOCX
|
| 1124 |
-
print("Extracting Docs data to XML...")
|
| 1125 |
-
xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
|
| 1126 |
try:
|
| 1127 |
-
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
|
| 1136 |
-
|
| 1137 |
-
|
| 1138 |
-
|
| 1139 |
-
|
| 1140 |
-
|
| 1141 |
-
|
| 1142 |
-
|
| 1143 |
-
|
| 1144 |
-
|
| 1145 |
-
|
| 1146 |
-
|
| 1147 |
-
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
|
| 1151 |
-
|
| 1152 |
-
|
| 1153 |
-
|
| 1154 |
-
|
| 1155 |
-
|
| 1156 |
-
|
| 1157 |
-
|
| 1158 |
-
|
| 1159 |
-
|
| 1160 |
-
|
| 1161 |
-
|
| 1162 |
-
|
| 1163 |
-
|
| 1164 |
-
|
| 1165 |
-
|
| 1166 |
-
|
| 1167 |
-
|
| 1168 |
-
|
| 1169 |
-
|
| 1170 |
-
|
| 1171 |
-
|
| 1172 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1173 |
|
| 1174 |
-
|
| 1175 |
-
|
| 1176 |
-
|
| 1177 |
-
|
| 1178 |
-
|
| 1179 |
-
|
| 1180 |
-
|
| 1181 |
|
| 1182 |
-
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
|
| 1190 |
-
|
| 1191 |
-
# print(log)
|
| 1192 |
-
# print(f"🔄 Extracted Data: {combined_data}")
|
| 1193 |
-
logging.info(f"""{log}""")
|
| 1194 |
-
|
| 1195 |
-
return combined_data
|
| 1196 |
|
| 1197 |
# Example Usage
|
| 1198 |
|
|
@@ -1206,13 +1182,6 @@ def extract_po(docx_path):
|
|
| 1206 |
import gradio as gr
|
| 1207 |
from gradio.themes.base import Base
|
| 1208 |
|
| 1209 |
-
# def extract_po_api(docx_path):
|
| 1210 |
-
# try:
|
| 1211 |
-
# return extract_po(docx_path)
|
| 1212 |
-
# except Exception as e:
|
| 1213 |
-
# # Return error details in the API response
|
| 1214 |
-
# return {"error":str(e)}
|
| 1215 |
-
|
| 1216 |
interface = gr.Interface(
|
| 1217 |
fn=extract_po,
|
| 1218 |
title="PO Extractor 买卖合同数据提取",
|
|
|
|
| 14 |
|
| 15 |
import re
|
| 16 |
|
|
|
|
|
|
|
| 17 |
from pydantic import BaseModel, Field, ValidationError, RootModel
|
| 18 |
from typing import List, Optional
|
| 19 |
|
|
|
|
| 40 |
# base_url = "https://router.huggingface.co/sambanova/v1"
|
| 41 |
# model="Qwen3-32B"
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
# Default Word XML namespace
|
| 44 |
DEFAULT_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
|
| 45 |
NS = None # Global variable to store the namespace
|
|
|
|
| 538 |
think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
|
| 539 |
if think_text:
|
| 540 |
print(f"🧠 Thought Process: {think_text}")
|
|
|
|
| 541 |
|
| 542 |
contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
|
| 543 |
contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
|
|
|
|
| 558 |
|
| 559 |
except ValidationError as e:
|
| 560 |
error_msg = f"Validation error: {e}"
|
|
|
|
|
|
|
| 561 |
print(f"❌ {error_msg}")
|
| 562 |
|
| 563 |
except json.JSONDecodeError as e:
|
| 564 |
error_msg = f"JSON decode error: {e}"
|
|
|
|
|
|
|
| 565 |
print(f"❌ {error_msg}")
|
| 566 |
|
| 567 |
# Don't retry on the last attempt
|
|
|
|
| 852 |
think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
|
| 853 |
if think_text:
|
| 854 |
print(f"🧠 Thought Process: {think_text}")
|
|
|
|
| 855 |
|
| 856 |
raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
|
| 857 |
# Remove any backticks or json tags
|
|
|
|
| 887 |
|
| 888 |
except Exception as e:
|
| 889 |
error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
|
|
|
|
| 890 |
print(f"❌ {error_msg}")
|
| 891 |
|
| 892 |
if attempt < max_retries - 1:
|
|
|
|
| 1103 |
with open(docx_path, "rb") as f:
|
| 1104 |
docx_bytes = BytesIO(f.read())
|
| 1105 |
|
|
|
|
|
|
|
|
|
|
| 1106 |
try:
|
| 1107 |
+
# Step 1: Extract XML content from DOCX
|
| 1108 |
+
print("Extracting Docs data to XML...")
|
| 1109 |
+
xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
|
| 1110 |
+
try:
|
| 1111 |
+
xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
|
| 1112 |
+
get_namespace(ET.fromstring(xml_file))
|
| 1113 |
+
except (zipfile.BadZipFile, KeyError):
|
| 1114 |
+
raise ValueError(f"Invalid file: {docx_path}")
|
| 1115 |
+
|
| 1116 |
+
# Step 2: Extract tables from DOCX and save JSON
|
| 1117 |
+
print("Extracting XML data to JSON...")
|
| 1118 |
+
json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
|
| 1119 |
+
extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
|
| 1120 |
+
|
| 1121 |
+
# Find and rename the price list table before contract summary processing
|
| 1122 |
+
print("Identifying Price List table...")
|
| 1123 |
+
extracted_data_dict = json.loads(extracted_data)
|
| 1124 |
+
price_list_table = find_price_list_table(extracted_data_dict)
|
| 1125 |
+
|
| 1126 |
+
# Rename the price list table key
|
| 1127 |
+
if price_list_table:
|
| 1128 |
+
# Find and rename the key containing the price list table
|
| 1129 |
+
for key in list(extracted_data_dict.keys()):
|
| 1130 |
+
if "long_table" in key and extracted_data_dict[key] == price_list_table:
|
| 1131 |
+
extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
|
| 1132 |
+
break
|
| 1133 |
+
# Update the extracted_data string with proper formatting
|
| 1134 |
+
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1135 |
+
else:
|
| 1136 |
+
print("⚠️ No suitable price list table found!")
|
| 1137 |
+
extracted_data_dict["price_list"] = []
|
| 1138 |
+
extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
|
| 1139 |
+
|
| 1140 |
+
print(f"✅ Extracted Data: {extracted_data}")
|
| 1141 |
+
|
| 1142 |
+
# Create a copy of the data with only first row of price list for contract summary
|
| 1143 |
+
contract_summary_dict = json.loads(extracted_data)
|
| 1144 |
+
if contract_summary_dict.get("price_list"):
|
| 1145 |
+
contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
|
| 1146 |
+
contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
|
| 1147 |
+
|
| 1148 |
+
# Step 3: Process JSON with OpenAI to get structured output
|
| 1149 |
+
print("Processing Contract Summary data with AI...")
|
| 1150 |
+
contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
|
| 1151 |
+
contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
|
| 1152 |
+
|
| 1153 |
+
# Process the price list
|
| 1154 |
+
print("Processing Price List data with AI...")
|
| 1155 |
+
price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
|
| 1156 |
+
price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
|
| 1157 |
|
| 1158 |
+
# Step 4: Combine contract summary and long table data into a single JSON object
|
| 1159 |
+
print("Combining AI Generated JSON with Extracted Data...")
|
| 1160 |
+
|
| 1161 |
+
combined_data = {
|
| 1162 |
+
"contract_summary": json.loads(json.loads(contract_summary)),
|
| 1163 |
+
"price_list": price_list
|
| 1164 |
+
}
|
| 1165 |
|
| 1166 |
+
return combined_data
|
| 1167 |
+
|
| 1168 |
+
finally:
|
| 1169 |
+
# Ensure BytesIO is properly closed
|
| 1170 |
+
if 'docx_bytes' in locals():
|
| 1171 |
+
docx_bytes.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1172 |
|
| 1173 |
# Example Usage
|
| 1174 |
|
|
|
|
| 1182 |
import gradio as gr
|
| 1183 |
from gradio.themes.base import Base
|
| 1184 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1185 |
interface = gr.Interface(
|
| 1186 |
fn=extract_po,
|
| 1187 |
title="PO Extractor 买卖合同数据提取",
|