MikeMai committed on
Commit
a4dc2f9
·
verified ·
1 Parent(s): 7556a1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -94
app.py CHANGED
@@ -14,8 +14,6 @@ from openai import OpenAI
14
 
15
  import re
16
 
17
- import logging
18
-
19
  from pydantic import BaseModel, Field, ValidationError, RootModel
20
  from typing import List, Optional
21
 
@@ -42,14 +40,6 @@ model="Qwen/Qwen2.5-7B-Instruct-Turbo"
42
  # base_url = "https://router.huggingface.co/sambanova/v1"
43
  # model="Qwen3-32B"
44
 
45
- # Configure logging to write to 'zaoju_logs.log' without using pickle
46
- logging.basicConfig(
47
- filename='extract_po_logs.log',
48
- level=logging.INFO,
49
- format='%(asctime)s - %(levelname)s - %(message)s',
50
- encoding='utf-8'
51
- )
52
-
53
  # Default Word XML namespace
54
  DEFAULT_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
55
  NS = None # Global variable to store the namespace
@@ -548,7 +538,6 @@ Contract data in JSON format:""" + f"""
548
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
549
  if think_text:
550
  print(f"🧠 Thought Process: {think_text}")
551
- logging.info(f"Think text: {think_text}")
552
 
553
  contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
554
  contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
@@ -569,14 +558,10 @@ Contract data in JSON format:""" + f"""
569
 
570
  except ValidationError as e:
571
  error_msg = f"Validation error: {e}"
572
- logging.error(f"{error_msg}")
573
- logging.error(f"Input data: {contract_summary}")
574
  print(f"❌ {error_msg}")
575
 
576
  except json.JSONDecodeError as e:
577
  error_msg = f"JSON decode error: {e}"
578
- logging.error(f"{error_msg}")
579
- logging.error(f"Input data: {contract_summary}")
580
  print(f"❌ {error_msg}")
581
 
582
  # Don't retry on the last attempt
@@ -867,7 +852,6 @@ Do not force map 名称(英文) to 单价
867
  think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
868
  if think_text:
869
  print(f"🧠 Thought Process: {think_text}")
870
- logging.info(f"Think text: {think_text}")
871
 
872
  raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
873
  # Remove any backticks or json tags
@@ -903,7 +887,6 @@ Do not force map 名称(英文) to 单价
903
 
904
  except Exception as e:
905
  error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
906
- logging.error(f"{error_msg}")
907
  print(f"❌ {error_msg}")
908
 
909
  if attempt < max_retries - 1:
@@ -1120,79 +1103,72 @@ def extract_po(docx_path):
1120
  with open(docx_path, "rb") as f:
1121
  docx_bytes = BytesIO(f.read())
1122
 
1123
- # Step 1: Extract XML content from DOCX
1124
- print("Extracting Docs data to XML...")
1125
- xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
1126
  try:
1127
- xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
1128
- get_namespace(ET.fromstring(xml_file))
1129
- except (zipfile.BadZipFile, KeyError):
1130
- raise ValueError(f"Invalid file: {docx_path}")
1131
-
1132
- # Step 2: Extract tables from DOCX and save JSON
1133
- print("Extracting XML data to JSON...")
1134
- json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1135
- extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
1136
-
1137
- # Find and rename the price list table before contract summary processing
1138
- print("Identifying Price List table...")
1139
- extracted_data_dict = json.loads(extracted_data)
1140
- price_list_table = find_price_list_table(extracted_data_dict)
1141
-
1142
- # Rename the price list table key
1143
- if price_list_table:
1144
- # Find and rename the key containing the price list table
1145
- for key in list(extracted_data_dict.keys()):
1146
- if "long_table" in key and extracted_data_dict[key] == price_list_table:
1147
- extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
1148
- break
1149
- # Update the extracted_data string with proper formatting
1150
- extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1151
- else:
1152
- print("⚠️ No suitable price list table found!")
1153
- extracted_data_dict["price_list"] = []
1154
- extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1155
-
1156
- print(f" Extracted Data: {extracted_data}")
1157
-
1158
- # Create a copy of the data with only first row of price list for contract summary
1159
- contract_summary_dict = json.loads(extracted_data)
1160
- if contract_summary_dict.get("price_list"):
1161
- contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
1162
- contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
1163
-
1164
- # Step 3: Process JSON with OpenAI to get structured output
1165
- print("Processing Contract Summary data with AI...")
1166
- contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1167
- contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1168
-
1169
- # Process the price list
1170
- print("Processing Price List data with AI...")
1171
- price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1172
- price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
 
 
 
 
1173
 
1174
- # Step 4: Combine contract summary and long table data into a single JSON object
1175
- print("Combining AI Generated JSON with Extracted Data...")
1176
-
1177
- combined_data = {
1178
- "contract_summary": json.loads(json.loads(contract_summary)),
1179
- "price_list": price_list
1180
- }
1181
 
1182
- # Logging
1183
- log = f"""Results:
1184
-
1185
- Contract Summary: {contract_summary},
1186
-
1187
- RAW Extracted Data: {extracted_data},
1188
-
1189
- Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
1190
-
1191
- # print(log)
1192
- # print(f"🔄 Extracted Data: {combined_data}")
1193
- logging.info(f"""{log}""")
1194
-
1195
- return combined_data
1196
 
1197
  # Example Usage
1198
 
@@ -1206,13 +1182,6 @@ def extract_po(docx_path):
1206
  import gradio as gr
1207
  from gradio.themes.base import Base
1208
 
1209
- # def extract_po_api(docx_path):
1210
- # try:
1211
- # return extract_po(docx_path)
1212
- # except Exception as e:
1213
- # # Return error details in the API response
1214
- # return {"error":str(e)}
1215
-
1216
  interface = gr.Interface(
1217
  fn=extract_po,
1218
  title="PO Extractor 买卖合同数据提取",
 
14
 
15
  import re
16
 
 
 
17
  from pydantic import BaseModel, Field, ValidationError, RootModel
18
  from typing import List, Optional
19
 
 
40
  # base_url = "https://router.huggingface.co/sambanova/v1"
41
  # model="Qwen3-32B"
42
 
 
 
 
 
 
 
 
 
43
  # Default Word XML namespace
44
  DEFAULT_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
45
  NS = None # Global variable to store the namespace
 
538
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
539
  if think_text:
540
  print(f"🧠 Thought Process: {think_text}")
 
541
 
542
  contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
543
  contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
 
558
 
559
  except ValidationError as e:
560
  error_msg = f"Validation error: {e}"
 
 
561
  print(f"❌ {error_msg}")
562
 
563
  except json.JSONDecodeError as e:
564
  error_msg = f"JSON decode error: {e}"
 
 
565
  print(f"❌ {error_msg}")
566
 
567
  # Don't retry on the last attempt
 
852
  think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
853
  if think_text:
854
  print(f"🧠 Thought Process: {think_text}")
 
855
 
856
  raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
857
  # Remove any backticks or json tags
 
887
 
888
  except Exception as e:
889
  error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
 
890
  print(f"❌ {error_msg}")
891
 
892
  if attempt < max_retries - 1:
 
1103
  with open(docx_path, "rb") as f:
1104
  docx_bytes = BytesIO(f.read())
1105
 
 
 
 
1106
  try:
1107
+ # Step 1: Extract XML content from DOCX
1108
+ print("Extracting Docs data to XML...")
1109
+ xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
1110
+ try:
1111
+ xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
1112
+ get_namespace(ET.fromstring(xml_file))
1113
+ except (zipfile.BadZipFile, KeyError):
1114
+ raise ValueError(f"Invalid file: {docx_path}")
1115
+
1116
+ # Step 2: Extract tables from DOCX and save JSON
1117
+ print("Extracting XML data to JSON...")
1118
+ json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1119
+ extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
1120
+
1121
+ # Find and rename the price list table before contract summary processing
1122
+ print("Identifying Price List table...")
1123
+ extracted_data_dict = json.loads(extracted_data)
1124
+ price_list_table = find_price_list_table(extracted_data_dict)
1125
+
1126
+ # Rename the price list table key
1127
+ if price_list_table:
1128
+ # Find and rename the key containing the price list table
1129
+ for key in list(extracted_data_dict.keys()):
1130
+ if "long_table" in key and extracted_data_dict[key] == price_list_table:
1131
+ extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
1132
+ break
1133
+ # Update the extracted_data string with proper formatting
1134
+ extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1135
+ else:
1136
+ print("⚠️ No suitable price list table found!")
1137
+ extracted_data_dict["price_list"] = []
1138
+ extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1139
+
1140
+ print(f"✅ Extracted Data: {extracted_data}")
1141
+
1142
+ # Create a copy of the data with only first row of price list for contract summary
1143
+ contract_summary_dict = json.loads(extracted_data)
1144
+ if contract_summary_dict.get("price_list"):
1145
+ contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
1146
+ contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
1147
+
1148
+ # Step 3: Process JSON with OpenAI to get structured output
1149
+ print("Processing Contract Summary data with AI...")
1150
+ contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1151
+ contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1152
+
1153
+ # Process the price list
1154
+ print("Processing Price List data with AI...")
1155
+ price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1156
+ price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
1157
 
1158
+ # Step 4: Combine contract summary and long table data into a single JSON object
1159
+ print("Combining AI Generated JSON with Extracted Data...")
1160
+
1161
+ combined_data = {
1162
+ "contract_summary": json.loads(json.loads(contract_summary)),
1163
+ "price_list": price_list
1164
+ }
1165
 
1166
+ return combined_data
1167
+
1168
+ finally:
1169
+ # Ensure BytesIO is properly closed
1170
+ if 'docx_bytes' in locals():
1171
+ docx_bytes.close()
 
 
 
 
 
 
 
 
1172
 
1173
  # Example Usage
1174
 
 
1182
  import gradio as gr
1183
  from gradio.themes.base import Base
1184
 
 
 
 
 
 
 
 
1185
  interface = gr.Interface(
1186
  fn=extract_po,
1187
  title="PO Extractor 买卖合同数据提取",