MikeMai commited on
Commit
fd35ba2
·
verified ·
1 Parent(s): c61bff0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +443 -316
app.py CHANGED
@@ -38,6 +38,10 @@ model="Qwen/Qwen2.5-7B-Instruct-Turbo"
38
  # base_url = "https://router.huggingface.co/novita/v3/openai"
39
  # model="qwen/qwen-2.5-72b-instruct"
40
 
 
 
 
 
41
  # Configure logging to write to 'zaoju_logs.log' without using pickle
42
  logging.basicConfig(
43
  filename='extract_po_logs.log',
@@ -79,9 +83,21 @@ def extract_text_from_cell(cell):
79
  def clean_spaces(text):
80
  """
81
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
 
82
  """
83
- # Remove spaces **between** Chinese characters but keep English spaces
 
 
 
84
  text = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', text)
 
 
 
 
 
 
 
 
85
  return text.strip()
86
 
87
  def extract_key_value_pairs(text, target_dict=None):
@@ -196,6 +212,39 @@ def process_summary_table(rows):
196
 
197
  return extracted_data
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def extract_headers(first_row_cells):
200
  """Extracts unique column headers from the first row of a table."""
201
  headers = []
@@ -266,13 +315,24 @@ def process_long_table(rows):
266
 
267
  table_data.append(row_data)
268
 
 
 
 
 
 
 
 
 
 
 
269
  # Filter out rows where the "序号" column contains non-numeric values
270
  filtered_table_data = []
271
- for row in table_data:
272
- # Check if any cell contains "合计" (total)
 
273
  contains_total = False
274
  for key, value in row.items():
275
- if isinstance(value, str) and "合计" in value:
276
  contains_total = True
277
  break
278
 
@@ -303,6 +363,27 @@ def process_long_table(rows):
303
 
304
  return filtered_table_data
305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  def extract_tables(root):
307
  """Extracts tables from the DOCX document and returns structured data."""
308
  tables = root.findall('.//w:tbl', NS)
@@ -317,42 +398,34 @@ def extract_tables(root):
317
  for paragraph in table.findall('.//w:p', NS):
318
  table_paragraphs.add(paragraph)
319
 
320
- first_row_cells = rows[0].findall('.//w:tc', NS)
321
- num_columns = len(first_row_cells)
322
 
323
- if num_columns == 1:
324
  single_column_data = process_single_column_table(rows)
325
  if single_column_data:
326
  table_data[f"table_{table_index}_single_column"] = single_column_data
327
- continue # Skip further processing for this table
328
-
329
- summary_start_index = None
330
- for i, row in enumerate(rows):
331
- if len(row.findall('.//w:tc', NS)) == 2:
332
- summary_start_index = i
333
- break
334
-
335
- long_table_data = []
336
- summary_data = []
337
-
338
- if summary_start_index is not None and summary_start_index > 0:
339
- long_table_data = process_long_table(rows[:summary_start_index])
340
- elif summary_start_index is None:
341
- long_table_data = process_long_table(rows)
342
-
343
- if summary_start_index is not None:
344
- is_buyer_seller_table = all(len(row.findall('.//w:tc', NS)) == 2 for row in rows)
345
- if is_buyer_seller_table:
346
- buyer_seller_data = process_buyer_seller_table(rows)
347
- if buyer_seller_data:
348
- table_data[f"table_{table_index}_buyer_seller"] = buyer_seller_data
349
- else:
350
- summary_data = process_summary_table(rows[summary_start_index:])
351
-
352
- if long_table_data:
353
- table_data[f"long_table_{table_index}"] = long_table_data
354
- if summary_data:
355
- table_data[f"long_table_{table_index}_summary"] = summary_data
356
 
357
  return table_data, table_paragraphs
358
 
@@ -532,8 +605,7 @@ Contract data in JSON format:""" + f"""
532
 
533
  def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
534
  """
535
- Extracts structured price list by first using AI to map column names to standard keys,
536
- then programmatically transforming the data to match the Pydantic model.
537
  """
538
 
539
  # If price_list is empty, return an empty list
@@ -558,10 +630,35 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
558
  # Get the headers directly from the sample row
559
  extracted_headers = list(sample_row.keys())
560
 
561
- # Clean double spaces in headers to facilitate AI identification
562
  def clean_header_spaces(headers):
563
- """Clean double spaces in headers to make them more consistent for AI processing."""
564
- return [re.sub(r'\s+', ' ', header).strip() for header in headers]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
 
566
  # Apply the cleaning function to extracted headers
567
  extracted_headers = clean_header_spaces(extracted_headers)
@@ -572,31 +669,92 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
572
  "数量", "单位", "单价", "总价", "几郎单价", "几郎总价",
573
  "备注", "计划来源"
574
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
- sample_mapping = """Examples of how you should map to guide you, there are other cases so use your own judgement to map the headers to the standard fields:
577
- - Map "序号" to headers containing "序号No.", "序号 No.",
578
- - Map "品牌" to headers containing "品牌Brand", "品牌 brand",
579
- - Map "规格型号" to headers containing "规格型号", "规格 Specification", "Specification and Model", "规格型号Specification and Model", "型号Model"
580
- - Map "所属机型" to headers containing "所属机型", "Applicable Models"
581
- - Map "数量" to headers containing "数量Quantity", "数量 Quantity", "Qty"
582
- - Map "单位" to headers containing "单位Unit", "单位 Unit"
583
- - Map "单价" to headers containing "单价(元)", "单价(CNY)", "Unit Price (CNY)", "单价Unit Price"
584
- - Map "总价" to headers containing "总价(元)", "总额(元)", "Amount (CNY)", "Total Amount (CNY)"
585
- - Map "几郎单价" to headers containing "单价(几郎)", "几郎单价(元)", "Unit Price (GNF)", "单价Unit Price(几郎)(GNF)"
586
- - Map "几郎总价" to headers containing "总额(几郎)", "几郎总额(元)", "Total Amount (GNF)"
587
- - Map "备注" to headers containing "备注Remarks", "备注 notes", "Note"
588
- - Map "计划来源" to headers containing "计划来源Plan No.", "计划来源(唛头信息)", "Planned Source" """
589
-
590
- # Use AI to map extracted headers to our target fields
591
- base_prompt = f"""
592
- You are playing a matching game. Match each and every standard fields to the exactcolumn headers within "" separated by ,.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
594
 
595
- The standard fields are:
596
- {json.dumps(target_fields, ensure_ascii=False)}
597
 
598
  You are given column headers below: (YOU MUST USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING)
599
- {json.dumps(extracted_headers, ensure_ascii=False)}
600
 
601
  ENSURE ALL STANDARD FIELDS ARE MAPPED TO THE EXACT COLUMN HEADER INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
602
 
@@ -615,269 +773,206 @@ For example, if the extracted header is "名称Name of Materials and Equipment",
615
  "名称": "名称Name of Materials and Equipment",
616
  "名称(英文)": "名称Name of Materials and Equipment"
617
  }}
618
-
619
  """
620
 
621
- messages = [{"role": "user", "content": base_prompt}]
622
-
623
- client = OpenAI(
624
- base_url=base_url,
625
- api_key=HF_API_KEY,
626
- )
627
-
628
- # Add retry logic similar to deepseek_extract_contract_summary
629
- max_retries = 3
630
- transformed_data = []
631
-
632
- for attempt in range(max_retries):
633
- try:
634
- print(f"🔄 Sending prompt to LLM (attempt {attempt + 1} of {max_retries}: {base_prompt})")
635
- response = client.chat.completions.create(
636
- model=model,
637
- messages=messages,
638
- temperature=0.1,
639
- )
640
-
641
- raw_mapping = response.choices[0].message.content
642
-
643
- think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
644
- if think_text:
645
- print(f"🧠 Thought Process: {think_text}")
646
- logging.info(f"Think text: {think_text}")
647
-
648
- raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
649
- # Remove any backticks or json tags
650
- raw_mapping = re.sub(r"```json|```", "", raw_mapping)
651
-
652
- # Parse the mapping with standard fields as keys
653
- standard_field_mapping = json.loads(raw_mapping.strip())
654
- print(f"📊 Standard field mapping: {json.dumps(standard_field_mapping, ensure_ascii=False, indent=2)}")
655
-
656
- # Function to separate Chinese and English text
657
- def separate_chinese_english(text):
658
- if not text or not isinstance(text, str):
659
- return "", ""
660
-
661
- # First check if there's a clear separator like hyphen or space
662
- # Common patterns: "中文-English", "中文(English)", "中文 English"
663
- patterns = [
664
- r'^([\u4e00-\u9fff\-]+)[:\-\s]+([a-zA-Z].*)$', # Chinese-English
665
- r'^([\u4e00-\u9fff\-]+)[\((]([a-zA-Z].*)[\))]$', # Chinese(English)
666
- ]
667
-
668
- for pattern in patterns:
669
- match = re.search(pattern, text)
670
- if match:
671
- return match.group(1), match.group(2)
672
-
673
- # Find the first Chinese character index
674
- first_chinese_idx = -1
675
- for i, char in enumerate(text):
676
- if '\u4e00' <= char <= '\u9fff': # Chinese character
677
- first_chinese_idx = i
678
- break
679
-
680
- # Find where English starts after Chinese
681
- english_start_idx = len(text)
682
- if first_chinese_idx >= 0:
683
- # Search for the first English character that comes after Chinese
684
- for i in range(first_chinese_idx, len(text)):
685
- # Skip to the end of Chinese characters
686
- if '\u4e00' <= text[i] <= '\u9fff':
687
- continue
688
-
689
- # Look ahead for English characters
690
- for j in range(i, len(text)):
691
- if 'a' <= text[j].lower() <= 'z':
692
- english_start_idx = j
693
- break
694
- if english_start_idx < len(text):
695
- break
696
-
697
- # If we found the boundaries
698
- if first_chinese_idx >= 0 and english_start_idx < len(text):
699
- # Handle prefix: any Latin characters before Chinese should be part of Chinese name
700
- prefix = text[:first_chinese_idx].strip() if first_chinese_idx > 0 else ""
701
- chinese_part = text[first_chinese_idx:english_start_idx].strip()
702
- english_part = text[english_start_idx:].strip()
703
-
704
- # Combine prefix with Chinese part
705
- if prefix:
706
- chinese_part = f"{prefix} {chinese_part}"
707
-
708
- return chinese_part, english_part
709
-
710
- # Special case for prefix like "PVC" with no space before Chinese
711
- if first_chinese_idx > 0:
712
- prefix = text[:first_chinese_idx].strip()
713
- rest_of_text = text[first_chinese_idx:]
714
-
715
- # Extract Chinese and English from the rest of the text
716
- chinese_chars = []
717
- english_chars = []
718
- in_chinese = True
719
-
720
- for char in rest_of_text:
721
- if '\u4e00' <= char <= '\u9fff': # Chinese character
722
- if not in_chinese and english_chars: # If we've already seen English, something is wrong
723
- chinese_chars = []
724
- english_chars = []
725
- break
726
- chinese_chars.append(char)
727
- in_chinese = True
728
- elif 'a' <= char.lower() <= 'z' or char in ' -_()': # English or separator
729
- if in_chinese and chinese_chars: # We've seen Chinese and now see English
730
- english_chars.append(char)
731
- in_chinese = False
732
- elif not in_chinese: # Continue collecting English
733
- english_chars.append(char)
734
- else: # No Chinese seen yet, might be part of prefix
735
- chinese_chars.append(char)
736
- else: # Other characters (numbers, etc.)
737
- if in_chinese:
738
- chinese_chars.append(char)
739
- else:
740
- english_chars.append(char)
741
-
742
- if chinese_chars and english_chars:
743
- chinese_text = prefix + " " + ''.join(chinese_chars).strip()
744
- english_text = ''.join(english_chars).strip()
745
- return chinese_text, english_text
746
- else:
747
- # No clean separation possible
748
- return prefix + " " + rest_of_text, ""
749
 
750
- # Fallback: Try simple pattern matching
751
- # Find all Chinese characters
752
- chinese_chars = re.findall(r'[\u4e00-\u9fff]+', text)
753
- chinese = ''.join(chinese_chars)
754
 
755
- # If we have Chinese, extract everything up to the last Chinese character
756
- if chinese:
757
- last_chinese_idx = text.rindex(chinese_chars[-1]) + len(chinese_chars[-1])
758
-
759
- # Anything before the first Chinese character is a prefix
760
- first_chinese_idx = text.index(chinese_chars[0])
761
- prefix = text[:first_chinese_idx].strip()
762
-
763
- # Everything after the last Chinese character is English
764
- chinese_part = prefix + " " + text[first_chinese_idx:last_chinese_idx].strip() if prefix else text[first_chinese_idx:last_chinese_idx].strip()
765
- english_part = text[last_chinese_idx:].strip()
766
-
767
- # If English part doesn't actually contain English letters, treat it as empty
768
- if not re.search(r'[a-zA-Z]', english_part):
769
- english_part = ""
770
-
771
- return chinese_part, english_part
772
 
773
- # No Chinese characters found, check if there are any English letters
774
- if re.search(r'[a-zA-Z]', text):
775
- return "", text.strip()
776
 
777
- # No clear separation possible
778
- return text.strip(), ""
779
-
780
- # Process the data based on the standard field mapping
781
- transformed_data = []
782
-
783
- for row in price_list:
784
- new_row = {field: "" for field in target_fields} # Initialize with empty strings
785
- other_fields = {}
786
-
787
- # Step 1: Handle name fields first - look for any field with "名称" or "name"
788
- for header, value in row.items():
789
- # Clean the header for comparison
790
- cleaned_header = re.sub(r'\s+', ' ', header).strip()
791
- header_lower = cleaned_header.lower()
792
-
793
- if ("名称" in header_lower or "name" in header_lower) and value:
794
- # If field contains both Chinese and English, separate them
795
- if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
796
- chinese, english = separate_chinese_english(value)
797
- if chinese:
798
- new_row["名称"] = chinese
799
- if english:
800
- new_row["名称(英文)"] = english
801
- print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
802
- else:
803
- # Just set the name directly
804
- new_row["名称"] = value
805
- break # Stop after finding first name field
806
-
807
- # Step 2: Fill in all other fields using standard mapping
808
- for header, value in row.items():
809
- # Skip empty values
810
- if not value:
811
  continue
812
-
813
- # Clean the header for comparison
814
- cleaned_header = re.sub(r'\s+', ' ', header).strip()
815
-
816
- # Check if this maps to a standard field
817
- matched_field = None
818
- for std_field, mapped_header in standard_field_mapping.items():
819
- # Make comparison more flexible by lowercasing and stripping spaces
820
- if mapped_header.lower().strip() == cleaned_header.lower().strip():
821
- matched_field = std_field
822
- break
823
-
824
- # If we found a mapping, use it (but don't overwrite name fields)
825
- if matched_field:
826
- if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
827
- new_row[matched_field] = value
828
- # If no mapping found, add to other_fields
829
- else:
830
- # Skip name fields we already processed
831
- header_lower = cleaned_header.lower()
832
- if not ("名称" in header_lower or "name" in header_lower):
833
- other_fields[header] = value
834
-
835
- # Add remaining fields to "其他"
836
- if other_fields:
837
- new_row["其他"] = other_fields
838
  else:
839
- new_row["其他"] = {}
 
 
 
 
 
 
840
 
841
- # Convert field names for validation
842
- if "名称(英文)" in new_row:
843
- new_row["名称(英文)"] = new_row.pop("名称(英文)")
 
844
 
845
- transformed_data.append(new_row)
846
-
847
- # Success! Break out of the retry loop
848
- print(f" Successfully processed price list on attempt {attempt + 1}")
849
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
 
851
- except json.JSONDecodeError as e:
852
- error_msg = f"JSON decode error in field mapping: {e}"
853
- logging.error(f"{error_msg}")
854
- print(f"❌ {error_msg}")
 
 
 
 
 
 
 
 
 
855
 
856
- except KeyError as e:
857
- error_msg = f"KeyError during data transformation: {e}"
858
- logging.error(f"{error_msg}")
859
- print(f"❌ {error_msg}")
 
860
 
861
- except Exception as e:
862
- error_msg = f"Error processing price list: {e}"
863
- logging.error(f"{error_msg}")
864
- print(f"❌ {error_msg}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865
 
866
- # Don't retry on the last attempt
867
- if attempt < max_retries - 1:
868
- # Add error message to the conversation and retry
869
- if 'response' in locals():
870
- messages.append({
871
- "role": "assistant",
872
- "content": response.choices[0].message.content
873
- })
874
- messages.append({
875
- "role": "user",
876
- "content": f"Your response had the following error: {error_msg}. Please fix your mapping and try again."
877
- })
878
  else:
879
- print(f"⚠️ All {max_retries} attempts failed, returning empty result")
880
- transformed_data = [] # Return empty list after all retries failed
 
 
 
 
 
881
 
882
  # Save to file if requested
883
  if save_json and transformed_data:
@@ -906,6 +1001,30 @@ def json_to_excel(contract_summary, json_data, excel_path):
906
  contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
907
  long_table.to_excel(writer, sheet_name="Price List", index=False)
908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  #--- Extract PO ------------------------------
910
 
911
  def extract_po(docx_path):
@@ -930,6 +1049,7 @@ def extract_po(docx_path):
930
  print("Extracting XML data to JSON...")
931
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
932
  extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
 
933
 
934
  # Step 3: Process JSON with OpenAI to get structured output
935
  print("Processing Contract Summary data with AI...")
@@ -938,17 +1058,17 @@ def extract_po(docx_path):
938
 
939
  # Find the last long table (excluding summary tables)
940
  print("Processing Price List data with AI...")
941
- long_tables = [
942
- table for key, table in json.loads(extracted_data).items()
943
- if "long_table" in key and "summary" not in key
944
- ]
945
- last_long_table = long_tables[-1] if long_tables else {}
946
-
947
  # Generate the price list filename in the same folder as the document
948
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
949
 
950
  # Process the price list and save it to a JSON file
951
- price_list = extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
952
 
953
  # Step 4: Combine contract summary and long table data into a single JSON object
954
  print("Combining AI Generated JSON with Extracted Data...")
@@ -985,6 +1105,13 @@ def extract_po(docx_path):
985
  import gradio as gr
986
  from gradio.themes.base import Base
987
 
 
 
 
 
 
 
 
988
  interface = gr.Interface(
989
  fn=extract_po,
990
  title="PO Extractor 买卖合同数据提取",
 
38
  # base_url = "https://router.huggingface.co/novita/v3/openai"
39
  # model="qwen/qwen-2.5-72b-instruct"
40
 
41
+ # Qwen 3 32B --------------------------------------------------------
42
+ # base_url = "https://router.huggingface.co/sambanova/v1"
43
+ # model="Qwen3-32B"
44
+
45
  # Configure logging to write to 'zaoju_logs.log' without using pickle
46
  logging.basicConfig(
47
  filename='extract_po_logs.log',
 
83
  def clean_spaces(text):
84
  """
85
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
86
+ Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
87
  """
88
+ if not text or not isinstance(text, str):
89
+ return text
90
+
91
+ # Remove spaces between Chinese characters
92
  text = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', text)
93
+
94
+ # Ensure one space between Chinese and English
95
+ text = re.sub(r'([\u4e00-\u9fff])\s*([a-zA-Z])', r'\1 \2', text)
96
+ text = re.sub(r'([a-zA-Z])\s*([\u4e00-\u9fff])', r'\1 \2', text)
97
+
98
+ # Normalize multiple spaces to single space
99
+ text = re.sub(r'\s+', ' ', text)
100
+
101
  return text.strip()
102
 
103
  def extract_key_value_pairs(text, target_dict=None):
 
212
 
213
  return extracted_data
214
 
215
+ def clean_header_spaces(headers):
216
+ """
217
+ Cleans headers for consistent matching by:
218
+ 1. Normalizing multiple spaces to single space
219
+ 2. Ensuring exactly one space between Chinese and English
220
+ 3. Converting to lowercase
221
+ """
222
+ if not headers:
223
+ return headers
224
+
225
+ cleaned_headers = []
226
+ for header in headers:
227
+ if not header:
228
+ cleaned_headers.append(header)
229
+ continue
230
+
231
+ # Normalize multiple spaces to single space
232
+ header = re.sub(r'\s+', ' ', header)
233
+
234
+ # Ensure exactly one space between Chinese and English
235
+ header = re.sub(r'([\u4e00-\u9fff])\s*([a-zA-Z])', r'\1 \2', header)
236
+ header = re.sub(r'([a-zA-Z])\s*([\u4e00-\u9fff])', r'\1 \2', header)
237
+
238
+ # Final cleanup of any remaining multiple spaces
239
+ header = re.sub(r'\s+', ' ', header)
240
+
241
+ # Convert to lowercase
242
+ header = header.lower()
243
+
244
+ cleaned_headers.append(header.strip())
245
+
246
+ return cleaned_headers
247
+
248
  def extract_headers(first_row_cells):
249
  """Extracts unique column headers from the first row of a table."""
250
  headers = []
 
315
 
316
  table_data.append(row_data)
317
 
318
+ # Clean the keys in the table data
319
+ cleaned_table_data = []
320
+ for row in table_data:
321
+ cleaned_row = {}
322
+ for key, value in row.items():
323
+ # Clean the key using the same function we use for headers
324
+ cleaned_key = clean_header_spaces([key])[0]
325
+ cleaned_row[cleaned_key] = value
326
+ cleaned_table_data.append(cleaned_row)
327
+
328
  # Filter out rows where the "序号" column contains non-numeric values
329
  filtered_table_data = []
330
+ for row in cleaned_table_data:
331
+
332
+ # Check if any cell contains "合计" (total) or "折扣" (discount)
333
  contains_total = False
334
  for key, value in row.items():
335
+ if isinstance(value, str) and ("合计" in value or "折扣" in value):
336
  contains_total = True
337
  break
338
 
 
363
 
364
  return filtered_table_data
365
 
366
+ def identify_table_type_and_header_row(rows):
367
+ """Identify table type and the index of the header row."""
368
+ header_keywords = ["名称", "Name", "规格", "Unit", "Quantity", "单价", "总价", "Remarks"]
369
+ for i, row in enumerate(rows):
370
+ num_cells = len(row.findall('.//w:tc', NS))
371
+ if num_cells > 1:
372
+ cell_texts = " ".join([" ".join(extract_text_from_cell(cell)) for cell in row.findall('.//w:tc', NS)])
373
+ if any(keyword in cell_texts for keyword in header_keywords):
374
+ # Check for buyer-seller or summary table
375
+ if num_cells == 2:
376
+ if all(len(r.findall('.//w:tc', NS)) == 2 for r in rows):
377
+ return "buyer_seller", i
378
+ else:
379
+ return "summary", i
380
+ else:
381
+ return "long_table", i
382
+ # Fallbacks
383
+ if all(len(row.findall('.//w:tc', NS)) == 1 for row in rows):
384
+ return "single_column", 0
385
+ return "unknown", 0
386
+
387
  def extract_tables(root):
388
  """Extracts tables from the DOCX document and returns structured data."""
389
  tables = root.findall('.//w:tbl', NS)
 
398
  for paragraph in table.findall('.//w:p', NS):
399
  table_paragraphs.add(paragraph)
400
 
401
+ table_type, header_row_index = identify_table_type_and_header_row(rows)
 
402
 
403
+ if table_type == "single_column":
404
  single_column_data = process_single_column_table(rows)
405
  if single_column_data:
406
  table_data[f"table_{table_index}_single_column"] = single_column_data
407
+ continue
408
+ elif table_type == "buyer_seller":
409
+ buyer_seller_data = process_buyer_seller_table(rows[header_row_index:])
410
+ if buyer_seller_data:
411
+ table_data[f"table_{table_index}_buyer_seller"] = buyer_seller_data
412
+ continue
413
+ elif table_type == "summary":
414
+ summary_data = process_summary_table(rows[header_row_index:])
415
+ if summary_data:
416
+ table_data[f"table_{table_index}_summary"] = summary_data
417
+ continue
418
+ elif table_type == "long_table":
419
+ long_table_data = process_long_table(rows[header_row_index:])
420
+ if long_table_data:
421
+ table_data[f"long_table_{table_index}"] = long_table_data
422
+ continue
423
+ else:
424
+ # fallback: try to process as long table from first multi-column row
425
+ long_table_data = process_long_table(rows[header_row_index:])
426
+ if long_table_data:
427
+ table_data[f"long_table_{table_index}"] = long_table_data
428
+ continue
 
 
 
 
 
 
 
429
 
430
  return table_data, table_paragraphs
431
 
 
605
 
606
  def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
607
  """
608
+ Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
 
609
  """
610
 
611
  # If price_list is empty, return an empty list
 
630
  # Get the headers directly from the sample row
631
  extracted_headers = list(sample_row.keys())
632
 
633
+ # Clean double spaces in headers to facilitate matching
634
  def clean_header_spaces(headers):
635
+ """
636
+ Cleans headers for consistent matching by:
637
+ 1. Normalizing multiple spaces to single space
638
+ 2. Ensuring exactly one space between Chinese and English
639
+ """
640
+ if not headers:
641
+ return headers
642
+
643
+ cleaned_headers = []
644
+ for header in headers:
645
+ if not header:
646
+ cleaned_headers.append(header)
647
+ continue
648
+
649
+ # Normalize multiple spaces to single space
650
+ header = re.sub(r'\s+', ' ', header)
651
+
652
+ # Ensure exactly one space between Chinese and English
653
+ header = re.sub(r'([\u4e00-\u9fff])\s*([a-zA-Z])', r'\1 \2', header)
654
+ header = re.sub(r'([a-zA-Z])\s*([\u4e00-\u9fff])', r'\1 \2', header)
655
+
656
+ # Final cleanup of any remaining multiple spaces
657
+ header = re.sub(r'\s+', ' ', header)
658
+
659
+ cleaned_headers.append(header.strip())
660
+
661
+ return cleaned_headers
662
 
663
  # Apply the cleaning function to extracted headers
664
  extracted_headers = clean_header_spaces(extracted_headers)
 
669
  "数量", "单位", "单价", "总价", "几郎单价", "几郎总价",
670
  "备注", "计划来源"
671
  ]
672
+
673
+ # Hardcoded mapping dictionary
674
+ hardcoded_mapping = {
675
+ # 序号 mappings
676
+ "序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号"],
677
+ # 名称 mappings
678
+ "名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
679
+ # 名称(英文) mappings
680
+ "名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "单价(欧元) unit price(eur)", "名称", "产品名称 product name", "单价(元)unit price(cny)"],
681
+ # 品牌 mappings
682
+ "品牌": ["品牌 brand", "品牌brand", "brand", "品牌 brand", "品牌brand", "品牌"],
683
+ # 规格型号 mappings
684
+ "规格型号": ["规格型号 specification", "规格型号specification", "规格 specification", "规格specification",
685
+ "specification", "规格型号specification and model", "型号model", "型号 model", "规格型号 specification and model", "规格型号"],
686
+ # 所属机型 mappings
687
+ "所属机型": ["所属机型 applicable models", "所属机型applicable models", "applicable models", "所属机型"],
688
+ # 数量 mappings
689
+ "数量": ["数量 quantity", "数量quantity", "quantity", "qty", "数量qty", "数量"],
690
+ # 单位 mappings
691
+ "单位": ["单位 unit", "单位unit", "unit", "单位"],
692
+ # 单价 mappings
693
+ "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price",
694
+ "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)"],
695
+ # 总价 mappings
696
+ "总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
697
+ "总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)"],
698
+ # 几郎单价 mappings
699
+ "几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
700
+ "几郎单价(元)", "单价(几郎)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)"],
701
+ # 几郎总价 mappings
702
+ "几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
703
+ "几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)"],
704
+ # 备注 mappings
705
+ "备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
706
+ # 计划来源 mappings
707
+ "计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
708
+ "计划来源 planned source", "计划来源planned source", "planned source", "计划来源"]
709
+ }
710
+
711
+ # Try to map headers using hardcoded mapping
712
+ standard_field_mapping = {}
713
+ unmapped_headers = []
714
+
715
+ # Clean the extracted headers first
716
+ cleaned_extracted_headers = clean_header_spaces(extracted_headers)
717
 
718
+ # Clean all possible headers in the hardcoded mapping
719
+ cleaned_hardcoded_mapping = {
720
+ std_field: [clean_header_spaces([h])[0] for h in possible_headers]
721
+ for std_field, possible_headers in hardcoded_mapping.items()
722
+ }
723
+
724
+ print("\n🔍 Hardcoded Mapping Results:")
725
+ print("-" * 50)
726
+ for header in cleaned_extracted_headers:
727
+ header_mapped = False
728
+ for std_field, possible_headers in cleaned_hardcoded_mapping.items():
729
+ if header in possible_headers:
730
+ standard_field_mapping[std_field] = header
731
+ header_mapped = True
732
+ print(f"✅ {std_field} -> {header}")
733
+ break
734
+ if not header_mapped:
735
+ unmapped_headers.append(header)
736
+ print(f"❌ No match found for: {header}")
737
+ print("-" * 50)
738
+
739
+ # If we have unmapped headers, fall back to AI mapping
740
+ if unmapped_headers:
741
+ print(f"⚠️ Some headers could not be mapped using hardcoded mapping: {unmapped_headers}")
742
+ print("🔄 Falling back to AI mapping...")
743
+
744
+ # Get the list of standard fields that haven't been mapped yet
745
+ unmapped_standard_fields = [field for field in target_fields if field not in standard_field_mapping]
746
+
747
+ # Use AI to map remaining headers
748
+ base_prompt = f"""
749
+ You are playing a matching game. Match each and every standard fields to the exact column headers within "" separated by ,.
750
+ You must match all the given column headers to the standard fields to you best ability.
751
  USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
752
 
753
+ The standard fields that need mapping are:
754
+ {json.dumps(unmapped_standard_fields, ensure_ascii=False)}
755
 
756
  You are given column headers below: (YOU MUST USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING)
757
+ {json.dumps(unmapped_headers, ensure_ascii=False)}
758
 
759
  ENSURE ALL STANDARD FIELDS ARE MAPPED TO THE EXACT COLUMN HEADER INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
760
 
 
773
  "名称": "名称Name of Materials and Equipment",
774
  "名称(英文)": "名称Name of Materials and Equipment"
775
  }}
 
776
  """
777
 
778
+ messages = [{"role": "user", "content": base_prompt}]
779
+
780
+ client = OpenAI(
781
+ base_url=base_url,
782
+ api_key=HF_API_KEY,
783
+ )
784
+
785
+ # Add retry logic for AI mapping
786
+ max_retries = 3
787
+ for attempt in range(max_retries):
788
+ try:
789
+ print(f"🔄 Sending prompt to LLM (attempt {attempt + 1} of {max_retries})")
790
+ response = client.chat.completions.create(
791
+ model=model,
792
+ messages=messages,
793
+ temperature=0.1,
794
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
 
796
+ raw_mapping = response.choices[0].message.content
 
 
 
797
 
798
+ think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
799
+ if think_text:
800
+ print(f"🧠 Thought Process: {think_text}")
801
+ logging.info(f"Think text: {think_text}")
802
+
803
+ raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
804
+ # Remove any backticks or json tags
805
+ raw_mapping = re.sub(r"```json|```", "", raw_mapping)
 
 
 
 
 
 
 
 
 
806
 
807
+ # Parse the AI mapping and merge with hardcoded mapping
808
+ ai_mapping = json.loads(raw_mapping.strip())
809
+ standard_field_mapping.update(ai_mapping)
810
 
811
+ # Check if all standard fields are mapped
812
+ still_unmapped = [field for field in target_fields if field not in standard_field_mapping]
813
+ if still_unmapped:
814
+ print(f"⚠️ Some standard fields are still unmapped: {still_unmapped}")
815
+ if attempt < max_retries - 1:
816
+ # Add feedback to the prompt for the next attempt
817
+ messages.append({
818
+ "role": "assistant",
819
+ "content": response.choices[0].message.content
820
+ })
821
+ messages.append({
822
+ "role": "user",
823
+ "content": f"The following standard fields are still unmapped: {still_unmapped}. Please try to map these fields using the available headers: {unmapped_headers}"
824
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
825
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
826
  else:
827
+ print(f" Successfully mapped all fields using AI")
828
+ print("\n📊 AI Mapping Results:")
829
+ print("-------------------")
830
+ for std_field, mapped_header in ai_mapping.items():
831
+ print(f"{std_field} -> {mapped_header}")
832
+ print("-------------------")
833
+ break
834
 
835
+ except Exception as e:
836
+ error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
837
+ logging.error(f"{error_msg}")
838
+ print(f"❌ {error_msg}")
839
 
840
+ if attempt < max_retries - 1:
841
+ messages.append({
842
+ "role": "assistant",
843
+ "content": response.choices[0].message.content
844
+ })
845
+ messages.append({
846
+ "role": "user",
847
+ "content": f"Your response had the following error: {error_msg}. Please fix your mapping and try again."
848
+ })
849
+ else:
850
+ print(f"⚠️ All AI mapping attempts failed, proceeding with partial mapping")
851
+
852
+ # After all mapping is done, print the final mapping and unmapped columns
853
+ print("\n📊 Final Field Mapping:")
854
+ print("-" * 50)
855
+ # Print all standard fields, showing mapping if exists or blank if not
856
+ for field in target_fields:
857
+ mapped_header = standard_field_mapping.get(field, "")
858
+ print(f"{field} -> {mapped_header}")
859
+ print("-" * 50)
860
+
861
+ # Check for unmapped standard fields
862
+ unmapped_standard = [field for field in target_fields if field not in standard_field_mapping]
863
+ if unmapped_standard:
864
+ print("\n⚠️ Unmapped Standard Fields:")
865
+ print("-" * 50)
866
+ for field in unmapped_standard:
867
+ print(f"- {field}")
868
+ print("-" * 50)
869
+
870
+ # Check for unmapped extracted headers
871
+ mapped_headers = set(standard_field_mapping.values())
872
+ unmapped_headers = [header for header in extracted_headers if header not in mapped_headers]
873
+ if unmapped_headers:
874
+ print("\n⚠️ Unmapped Extracted Headers:")
875
+ print("-" * 50)
876
+ for header in unmapped_headers:
877
+ print(f"- {header}")
878
+ print("-" * 50)
879
+
880
def separate_chinese_english(text):
    """Split *text* at the last CJK character.

    Returns a ``(chinese, english)`` tuple: everything up to and including
    the last Chinese character (stripped), and whatever follows it
    (stripped). The trailing part is kept only when it contains at least one
    ASCII letter; a string with no Chinese characters comes back entirely in
    the second slot.
    """
    if not isinstance(text, str) or not text:
        return "", ""

    # Scan backwards for the last character in the CJK Unified Ideographs
    # block (U+4E00..U+9FFF); -1 means no Chinese character was found.
    cut = -1
    for idx in range(len(text) - 1, -1, -1):
        if '\u4e00' <= text[idx] <= '\u9fff':
            cut = idx
            break

    if cut < 0:
        # No Chinese at all: treat the whole string as the English part.
        return "", text.strip()

    head = text[:cut + 1].strip()
    tail = text[cut + 1:].strip()

    # Discard the tail unless it actually holds Latin letters (e.g. a
    # trailing "123" after Chinese is not an English name).
    return head, (tail if re.search(r'[a-zA-Z]', tail) else "")
909
+
910
+ # Process the data based on the final mapping
911
+ transformed_data = []
912
+
913
+ for row in price_list:
914
+ new_row = {field: "" for field in target_fields} # Initialize with empty strings
915
+ other_fields = {}
916
+
917
+ # Step 1: Handle name fields first - look for any field with "名称" or "name"
918
+ for header, value in row.items():
919
+ # Clean the header for comparison
920
+ cleaned_header = re.sub(r'\s+', ' ', header).strip()
921
+ header_lower = cleaned_header.lower()
922
 
923
+ if ("名称" in header_lower or "name" in header_lower) and value:
924
+ # If field contains both Chinese and English, separate them
925
+ if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
926
+ chinese, english = separate_chinese_english(value)
927
+ if chinese:
928
+ new_row["名称"] = chinese
929
+ if english:
930
+ new_row["名称(英文)"] = english
931
+ print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
932
+ else:
933
+ # Just set the name directly
934
+ new_row["名称"] = value
935
+ break # Stop after finding first name field
936
 
937
+ # Step 2: Fill in all other fields using standard mapping
938
+ for header, value in row.items():
939
+ # Skip empty values
940
+ if not value:
941
+ continue
942
 
943
+ # Clean the header for comparison
944
+ cleaned_header = re.sub(r'\s+', ' ', header).strip()
945
+
946
+ # Check if this maps to a standard field
947
+ matched_field = None
948
+ for std_field, mapped_header in standard_field_mapping.items():
949
+ # Make comparison more flexible by lowercasing and stripping spaces
950
+ if mapped_header.lower().strip() == cleaned_header.lower().strip():
951
+ matched_field = std_field
952
+ break
953
+
954
+ # If we found a mapping, use it (but don't overwrite name fields)
955
+ if matched_field:
956
+ if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
957
+ new_row[matched_field] = value
958
+ # If no mapping found, add to other_fields
959
+ else:
960
+ # Skip name fields we already processed
961
+ header_lower = cleaned_header.lower()
962
+ if not ("名称" in header_lower or "name" in header_lower):
963
+ other_fields[header] = value
964
 
965
+ # Add remaining fields to "其他"
966
+ if other_fields:
967
+ new_row["其他"] = other_fields
 
 
 
 
 
 
 
 
 
968
  else:
969
+ new_row["其他"] = {}
970
+
971
+ # Convert field names for validation
972
+ if "名称(英文)" in new_row:
973
+ new_row["名称(英文)"] = new_row.pop("名称(英文)")
974
+
975
+ transformed_data.append(new_row)
976
 
977
  # Save to file if requested
978
  if save_json and transformed_data:
 
1001
  contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
1002
  long_table.to_excel(writer, sheet_name="Price List", index=False)
1003
 
1004
def find_price_list_table(extracted_data, min_matches=3):
    """Pick the extracted long table that most resembles a price list.

    Scores every ``long_table`` entry in *extracted_data* by counting how
    many of its first-row column headers contain a known price-list keyword
    (case-insensitive substring match), and returns the highest-scoring
    table. Returns ``None`` when no candidate reaches *min_matches*
    matching headers.
    """
    keywords = (
        "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
        "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
        "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
    )

    def _score(table):
        # One point per header that mentions any price-list keyword.
        return sum(
            1 for header in table[0]
            if any(kw in header.lower() for kw in keywords)
        )

    winner = None
    winner_score = 0
    for key, table in extracted_data.items():
        # Only consider non-empty list entries whose key marks a long table.
        if "long_table" not in key or not isinstance(table, list) or not table:
            continue
        hits = _score(table)
        if hits >= min_matches and hits > winner_score:
            winner_score = hits
            winner = table

    return winner
1027
+
1028
  #--- Extract PO ------------------------------
1029
 
1030
  def extract_po(docx_path):
 
1049
  print("Extracting XML data to JSON...")
1050
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1051
  extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
1052
+ print(f"✅ Extracted Data: {extracted_data}")
1053
 
1054
  # Step 3: Process JSON with OpenAI to get structured output
1055
  print("Processing Contract Summary data with AI...")
 
1058
 
1059
  # Find the last long table (excluding summary tables)
1060
  print("Processing Price List data with AI...")
1061
+ extracted_data_dict = json.loads(extracted_data)
1062
+ price_list_table = find_price_list_table(extracted_data_dict)
1063
+ if not price_list_table:
1064
+ print("⚠️ No suitable price list table found!")
1065
+ price_list_table = []
1066
+
1067
  # Generate the price list filename in the same folder as the document
1068
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1069
 
1070
  # Process the price list and save it to a JSON file
1071
+ price_list = extract_price_list(price_list_table, save_json=True, json_name=price_list_filename)
1072
 
1073
  # Step 4: Combine contract summary and long table data into a single JSON object
1074
  print("Combining AI Generated JSON with Extracted Data...")
 
1105
  import gradio as gr
1106
  from gradio.themes.base import Base
1107
 
1108
+ # def extract_po_api(docx_path):
1109
+ # try:
1110
+ # return extract_po(docx_path)
1111
+ # except Exception as e:
1112
+ # # Return error details in the API response
1113
+ # return {"error":str(e)}
1114
+
1115
  interface = gr.Interface(
1116
  fn=extract_po,
1117
  title="PO Extractor 买卖合同数据提取",