MikeMai committed on
Commit
dc7b3c3
·
verified ·
1 Parent(s): 58ff4c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +341 -114
app.py CHANGED
@@ -16,23 +16,23 @@ import re
16
 
17
  import logging
18
 
19
- from pydantic import BaseModel, Field, ValidationError, RootModel, field_validator
20
  from typing import List, Optional
21
 
22
 
23
  HF_API_KEY = os.getenv("HF_API_KEY")
24
 
25
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
26
- base_url = "https://router.huggingface.co/novita"
27
- model = "deepseek/deepseek-r1-distill-qwen-14b"
28
 
29
  # Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
30
  # base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
31
  # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
32
 
33
  # Qwen 2.5 7B --------------------------------------------------------
34
- # base_url = "https://router.huggingface.co/together/v1"
35
- # model="Qwen/Qwen2.5-7B-Instruct-Turbo"
36
 
37
  # Qwen 2.5 32B --------------------------------------------------------
38
  # base_url = "https://router.huggingface.co/novita/v3/openai"
@@ -530,81 +530,93 @@ Contract data in JSON format:""" + f"""
530
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
531
 
532
 
533
- def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
534
  """
535
- Extracts structured price list using DeepSeek LLM and validates output with Pydantic.
536
- Retries up to 3 times with error feedback if output is not valid JSON.
537
  """
538
 
539
- # Pydantic schema
540
- class PriceItem(BaseModel):
541
- 序号: Optional[str] = ""
542
- 名称: Optional[str] = ""
543
- 名称_英文: Optional[str] = Field("", alias="名称(英文)")
544
- 品牌: Optional[str] = ""
545
- 规格: Optional[str] = ""
546
- 所属机型: Optional[str] = ""
547
- 采购数量: Optional[str] = ""
548
- 单位: Optional[str] = ""
549
- 单价: Optional[str] = ""
550
- 总价: Optional[str] = ""
551
- 几郎单价: Optional[str] = ""
552
- 几郎总额: Optional[str] = ""
553
- 备注: Optional[str] = ""
554
- 计划来源: Optional[str] = ""
555
- 其他: Optional[dict] = Field(default_factory=dict, alias="其他")
556
-
557
- model_config = {
558
- # Ensures numbers remain as strings and aren't converted
559
- "coerce_numbers_to_str": True,
560
- # Allows numeric strings to be parsed into the model
561
- "arbitrary_types_allowed": True
562
- }
563
-
564
- @field_validator('其他', mode='before')
565
- def convert_empty_list_to_dict(cls, v):
566
- # Convert empty list to empty dict
567
- if isinstance(v, list) and len(v) == 0:
568
- return {}
569
- return v
570
-
571
- class PriceListModel(BaseModel):
572
- items: List[PriceItem]
573
-
574
- model_config = {
575
- # Ensures numbers remain as strings and aren't converted
576
- "coerce_numbers_to_str": True,
577
- # Allows numeric strings to be parsed into the model
578
- "arbitrary_types_allowed": True
579
- }
580
-
581
- base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
582
- 有时候第一行是表头,有时候是数据行,只输入数据行。
583
- 请注意,输出的 JSON 需要符合以下格式要求:
584
-
585
- # 输出格式要求:
586
- 每个条目输出以下字段:
587
- - 序号
588
- - 名称:只填中文
589
- - 名称(英文):只填英文
590
- - 品牌
591
- - 规格
592
- - 所属机型
593
- - 采购数量:
594
- - 单位
595
- - 单价:
596
- - 总价:
597
- - 几郎单价:
598
- - 几郎总额:
599
- - 备注
600
- - 计划来源
601
- - 其他:如果有以上以外的字段就以list的形式写在其他里 {{"其他": "key1": "value1", "key2":"value2"}},如果没有就给一个空的list:{{}}
602
-
603
- 请确保输出的 JSON 是有效的,且字段名称与输入的字段名称一致。请注意,字段名称可能会有不同的拼写方式,请根据上下文进行判断。
604
- 请确保输出的条目数量与输入的列表数量一致。
605
-
606
- # 原始价格表:
607
- {price_list}"""
 
 
 
 
 
 
 
 
 
 
 
 
608
 
609
  messages = [{"role": "user", "content": base_prompt}]
610
 
@@ -613,54 +625,268 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
613
  api_key=HF_API_KEY,
614
  )
615
 
616
- for attempt in range(3):
617
- print(f"🔁 Attempt {attempt + 1} to extract and validate Price List")
 
618
 
 
619
  try:
 
620
  response = client.chat.completions.create(
621
  model=model,
622
  messages=messages,
 
623
  )
624
- raw = response.choices[0].message.content
 
625
 
626
  think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
627
  if think_text:
628
  print(f"🧠 Thought Process: {think_text}")
629
  logging.info(f"Think text: {think_text}")
630
 
631
- # Strip out LLM artifacts
632
- raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
633
- raw = re.sub(r"^```json\n|```$", "", raw.strip(), flags=re.DOTALL)
634
-
635
- # Wrap the raw JSON in a proper structure if it's a list
636
- if raw.strip().startswith('['):
637
- raw = '{"items": ' + raw + '}'
638
-
639
- validated = PriceListModel.model_validate_json(raw)
640
- price_list_json = validated.model_dump(by_alias=True)["items"]
641
-
642
- if save_json:
643
- with open(json_name, "w", encoding="utf-8") as f:
644
- json.dump(price_list_json, f, ensure_ascii=False, indent=4)
645
- print(f" Saved to {json_name}")
646
-
647
- return price_list_json
648
-
649
- except ValidationError as ve:
650
- error_msg = f"Pydantic validation error: {ve}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  except Exception as e:
652
- error_msg = f"Unexpected error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
653
 
654
- print(f"❌ {error_msg}")
655
- messages.append({
656
- "role": "user",
657
- "content": f"Your previous attempt gave this error: {error_msg}. Please try again ensuring your response is valid JSON with correct format."
658
- })
 
 
659
 
660
- print("⚠️ Failed after 3 attempts.")
661
- return raw
662
-
663
-
664
  def json_to_excel(contract_summary, json_data, excel_path):
665
  """Converts extracted JSON tables to an Excel file."""
666
 
@@ -720,7 +946,7 @@ def extract_po(docx_path):
720
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
721
 
722
  # Process the price list and save it to a JSON file
723
- price_list = deepseek_extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
724
 
725
  # Step 4: Combine contract summary and long table data into a single JSON object
726
  print("Combining AI Generated JSON with Extracted Data...")
@@ -739,7 +965,8 @@ def extract_po(docx_path):
739
 
740
  Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
741
 
742
- print(log)
 
743
  logging.info(f"""{log}""")
744
 
745
  return combined_data
@@ -747,9 +974,9 @@ def extract_po(docx_path):
747
  # Example Usage
748
 
749
  # extract_po("test-contract-converted.docx")
750
- # extract_po("test-contract.docx")
751
 
752
- # print(deepseek_extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
753
 
754
  # Gradio Interface ------------------------------
755
 
 
16
 
17
  import logging
18
 
19
+ from pydantic import BaseModel, Field, ValidationError, RootModel
20
  from typing import List, Optional
21
 
22
 
23
  HF_API_KEY = os.getenv("HF_API_KEY")
24
 
25
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
26
+ # base_url = "https://router.huggingface.co/novita"
27
+ # model = "deepseek/deepseek-r1-distill-qwen-14b"
28
 
29
  # Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
30
  # base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
31
  # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
32
 
33
  # Qwen 2.5 7B --------------------------------------------------------
34
+ base_url = "https://router.huggingface.co/together/v1"
35
+ model="Qwen/Qwen2.5-7B-Instruct-Turbo"
36
 
37
  # Qwen 2.5 32B --------------------------------------------------------
38
  # base_url = "https://router.huggingface.co/novita/v3/openai"
 
530
  return json.dumps(empty_json, ensure_ascii=False, indent=4)
531
 
532
 
533
+ def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
534
  """
535
+ Extracts structured price list by first using AI to map column names to standard keys,
536
+ then programmatically transforming the data to match the Pydantic model.
537
  """
538
 
539
+ # If price_list is empty, return an empty list
540
+ if not price_list:
541
+ return []
542
+
543
+ # Convert price_list to a list if it's a dict
544
+ if isinstance(price_list, dict):
545
+ # Check if the dict has any items
546
+ if len(price_list) == 0:
547
+ return []
548
+ # Convert to list if it's just a single entry dict
549
+ price_list = [price_list]
550
+
551
+ # Extract a sample row for header mapping
552
+ sample_row = price_list[0] if price_list else {}
553
+
554
+ # If there are no headers, return empty list
555
+ if not sample_row:
556
+ return []
557
+
558
+ # Get the headers directly from the sample row
559
+ extracted_headers = list(sample_row.keys())
560
+
561
+ # Clean double spaces in headers to facilitate AI identification
562
+ def clean_header_spaces(headers):
563
+ """Clean double spaces in headers to make them more consistent for AI processing."""
564
+ return [re.sub(r'\s+', ' ', header).strip() for header in headers]
565
+
566
+ # Apply the cleaning function to extracted headers
567
+ extracted_headers = clean_header_spaces(extracted_headers)
568
+
569
+ # Define our target fields from the Pydantic model
570
+ target_fields = [
571
+ "序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
572
+ "数量", "单位", "单价", "总价", "几郎单价", "几郎总价",
573
+ "备注", "计划来源"
574
+ ]
575
+
576
+ sample_mapping = """Examples of how you should map to guide you, there are other cases so use your own judgement to map the headers to the standard fields:
577
+ - Map "序号" to headers containing "序号No.", "序号 No.",
578
+ - Map "品牌" to headers containing "品牌Brand", "品牌 brand",
579
+ - Map "规格型号" to headers containing "规格型号", "规格 Specification", "Specification and Model", "规格型号Specification and Model", "型号Model"
580
+ - Map "所属机型" to headers containing "所属机型", "Applicable Models"
581
+ - Map "数量" to headers containing "数量Quantity", "数量 Quantity", "Qty"
582
+ - Map "单位" to headers containing "单位Unit", "单位 Unit"
583
+ - Map "单价" to headers containing "单价(元)", "单价(CNY)", "Unit Price (CNY)", "单价Unit Price"
584
+ - Map "总价" to headers containing "总价(元)", "总额(元)", "Amount (CNY)", "Total Amount (CNY)"
585
+ - Map "几郎单价" to headers containing "单价(几郎)", "几郎单价(元)", "Unit Price (GNF)", "单价Unit Price(几郎)(GNF)"
586
+ - Map "几郎总价" to headers containing "总额(几郎)", "几郎总额(元)", "Total Amount (GNF)"
587
+ - Map "备注" to headers containing "备注Remarks", "备注 notes", "Note"
588
+ - Map "计划来源" to headers containing "计划来源Plan No.", "计划来源(唛头信息)", "Planned Source" """
589
+
590
+ # Use AI to map extracted headers to our target fields
591
+ base_prompt = f"""
592
+ You are playing a matching game. Match each and every standard fields to the exactcolumn headers within "" separated by ,.
593
+ USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
594
+
595
+ The standard fields are:
596
+ {json.dumps(target_fields, ensure_ascii=False)}
597
+
598
+ You are given column headers below: (YOU MUST USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING)
599
+ {json.dumps(extracted_headers, ensure_ascii=False)}
600
+
601
+ ENSURE ALL STANDARD FIELDS ARE MAPPED TO THE EXACT COLUMN HEADER INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
602
+
603
+ Return only a JSON mapping in this format WITHOUT any explanations:
604
+ ```json
605
+ {{
606
+ "standard_field_1": "column_header_1",
607
+ "standard_field_2": "column_header_2",
608
+ ...
609
+ }}
610
+ ```
611
+
612
+ Important: Map "名称" AND "名称(英文)" to the SAME extracted header.
613
+ For example, if the extracted header is "名称Name of Materials and Equipment", then:
614
+ {{
615
+ "名称": "名称Name of Materials and Equipment",
616
+ "名称(英文)": "名称Name of Materials and Equipment"
617
+ }}
618
+
619
+ """
620
 
621
  messages = [{"role": "user", "content": base_prompt}]
622
 
 
625
  api_key=HF_API_KEY,
626
  )
627
 
628
+ # Add retry logic similar to deepseek_extract_contract_summary
629
+ max_retries = 3
630
+ transformed_data = []
631
 
632
+ for attempt in range(max_retries):
633
  try:
634
+ print(f"🔄 Sending prompt to LLM (attempt {attempt + 1} of {max_retries}: {base_prompt})")
635
  response = client.chat.completions.create(
636
  model=model,
637
  messages=messages,
638
+ temperature=0.1,
639
  )
640
+
641
+ raw_mapping = response.choices[0].message.content
642
 
643
  think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
644
  if think_text:
645
  print(f"🧠 Thought Process: {think_text}")
646
  logging.info(f"Think text: {think_text}")
647
 
648
+ raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
649
+ # Remove any backticks or json tags
650
+ raw_mapping = re.sub(r"```json|```", "", raw_mapping)
651
+
652
+ # Parse the mapping with standard fields as keys
653
+ standard_field_mapping = json.loads(raw_mapping.strip())
654
+ print(f"📊 Standard field mapping: {json.dumps(standard_field_mapping, ensure_ascii=False, indent=2)}")
655
+
656
+ # Function to separate Chinese and English text
657
+ def separate_chinese_english(text):
658
+ if not text or not isinstance(text, str):
659
+ return "", ""
660
+
661
+ # First check if there's a clear separator like hyphen or space
662
+ # Common patterns: "中文-English", "中文(English)", "中文 English"
663
+ patterns = [
664
+ r'^([\u4e00-\u9fff\-]+)[:\-\s]+([a-zA-Z].*)$', # Chinese-English
665
+ r'^([\u4e00-\u9fff\-]+)[\((]([a-zA-Z].*)[\))]$', # Chinese(English)
666
+ ]
667
+
668
+ for pattern in patterns:
669
+ match = re.search(pattern, text)
670
+ if match:
671
+ return match.group(1), match.group(2)
672
+
673
+ # Find the first Chinese character index
674
+ first_chinese_idx = -1
675
+ for i, char in enumerate(text):
676
+ if '\u4e00' <= char <= '\u9fff': # Chinese character
677
+ first_chinese_idx = i
678
+ break
679
+
680
+ # Find where English starts after Chinese
681
+ english_start_idx = len(text)
682
+ if first_chinese_idx >= 0:
683
+ # Search for the first English character that comes after Chinese
684
+ for i in range(first_chinese_idx, len(text)):
685
+ # Skip to the end of Chinese characters
686
+ if '\u4e00' <= text[i] <= '\u9fff':
687
+ continue
688
+
689
+ # Look ahead for English characters
690
+ for j in range(i, len(text)):
691
+ if 'a' <= text[j].lower() <= 'z':
692
+ english_start_idx = j
693
+ break
694
+ if english_start_idx < len(text):
695
+ break
696
+
697
+ # If we found the boundaries
698
+ if first_chinese_idx >= 0 and english_start_idx < len(text):
699
+ # Handle prefix: any Latin characters before Chinese should be part of Chinese name
700
+ prefix = text[:first_chinese_idx].strip() if first_chinese_idx > 0 else ""
701
+ chinese_part = text[first_chinese_idx:english_start_idx].strip()
702
+ english_part = text[english_start_idx:].strip()
703
+
704
+ # Combine prefix with Chinese part
705
+ if prefix:
706
+ chinese_part = f"{prefix} {chinese_part}"
707
+
708
+ return chinese_part, english_part
709
+
710
+ # Special case for prefix like "PVC" with no space before Chinese
711
+ if first_chinese_idx > 0:
712
+ prefix = text[:first_chinese_idx].strip()
713
+ rest_of_text = text[first_chinese_idx:]
714
+
715
+ # Extract Chinese and English from the rest of the text
716
+ chinese_chars = []
717
+ english_chars = []
718
+ in_chinese = True
719
+
720
+ for char in rest_of_text:
721
+ if '\u4e00' <= char <= '\u9fff': # Chinese character
722
+ if not in_chinese and english_chars: # If we've already seen English, something is wrong
723
+ chinese_chars = []
724
+ english_chars = []
725
+ break
726
+ chinese_chars.append(char)
727
+ in_chinese = True
728
+ elif 'a' <= char.lower() <= 'z' or char in ' -_()': # English or separator
729
+ if in_chinese and chinese_chars: # We've seen Chinese and now see English
730
+ english_chars.append(char)
731
+ in_chinese = False
732
+ elif not in_chinese: # Continue collecting English
733
+ english_chars.append(char)
734
+ else: # No Chinese seen yet, might be part of prefix
735
+ chinese_chars.append(char)
736
+ else: # Other characters (numbers, etc.)
737
+ if in_chinese:
738
+ chinese_chars.append(char)
739
+ else:
740
+ english_chars.append(char)
741
+
742
+ if chinese_chars and english_chars:
743
+ chinese_text = prefix + " " + ''.join(chinese_chars).strip()
744
+ english_text = ''.join(english_chars).strip()
745
+ return chinese_text, english_text
746
+ else:
747
+ # No clean separation possible
748
+ return prefix + " " + rest_of_text, ""
749
+
750
+ # Fallback: Try simple pattern matching
751
+ # Find all Chinese characters
752
+ chinese_chars = re.findall(r'[\u4e00-\u9fff]+', text)
753
+ chinese = ''.join(chinese_chars)
754
+
755
+ # If we have Chinese, extract everything up to the last Chinese character
756
+ if chinese:
757
+ last_chinese_idx = text.rindex(chinese_chars[-1]) + len(chinese_chars[-1])
758
+
759
+ # Anything before the first Chinese character is a prefix
760
+ first_chinese_idx = text.index(chinese_chars[0])
761
+ prefix = text[:first_chinese_idx].strip()
762
+
763
+ # Everything after the last Chinese character is English
764
+ chinese_part = prefix + " " + text[first_chinese_idx:last_chinese_idx].strip() if prefix else text[first_chinese_idx:last_chinese_idx].strip()
765
+ english_part = text[last_chinese_idx:].strip()
766
+
767
+ # If English part doesn't actually contain English letters, treat it as empty
768
+ if not re.search(r'[a-zA-Z]', english_part):
769
+ english_part = ""
770
+
771
+ return chinese_part, english_part
772
+
773
+ # No Chinese characters found, check if there are any English letters
774
+ if re.search(r'[a-zA-Z]', text):
775
+ return "", text.strip()
776
+
777
+ # No clear separation possible
778
+ return text.strip(), ""
779
+
780
+ # Process the data based on the standard field mapping
781
+ transformed_data = []
782
+
783
+ for row in price_list:
784
+ new_row = {field: "" for field in target_fields} # Initialize with empty strings
785
+ other_fields = {}
786
+
787
+ # Step 1: Handle name fields first - look for any field with "名称" or "name"
788
+ for header, value in row.items():
789
+ # Clean the header for comparison
790
+ cleaned_header = re.sub(r'\s+', ' ', header).strip()
791
+ header_lower = cleaned_header.lower()
792
+
793
+ if ("名称" in header_lower or "name" in header_lower) and value:
794
+ # If field contains both Chinese and English, separate them
795
+ if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
796
+ chinese, english = separate_chinese_english(value)
797
+ if chinese:
798
+ new_row["名称"] = chinese
799
+ if english:
800
+ new_row["名称(英文)"] = english
801
+ print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
802
+ else:
803
+ # Just set the name directly
804
+ new_row["名称"] = value
805
+ break # Stop after finding first name field
806
+
807
+ # Step 2: Fill in all other fields using standard mapping
808
+ for header, value in row.items():
809
+ # Skip empty values
810
+ if not value:
811
+ continue
812
+
813
+ # Clean the header for comparison
814
+ cleaned_header = re.sub(r'\s+', ' ', header).strip()
815
+
816
+ # Check if this maps to a standard field
817
+ matched_field = None
818
+ for std_field, mapped_header in standard_field_mapping.items():
819
+ # Make comparison more flexible by lowercasing and stripping spaces
820
+ if mapped_header.lower().strip() == cleaned_header.lower().strip():
821
+ matched_field = std_field
822
+ break
823
+
824
+ # If we found a mapping, use it (but don't overwrite name fields)
825
+ if matched_field:
826
+ if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
827
+ new_row[matched_field] = value
828
+ # If no mapping found, add to other_fields
829
+ else:
830
+ # Skip name fields we already processed
831
+ header_lower = cleaned_header.lower()
832
+ if not ("名称" in header_lower or "name" in header_lower):
833
+ other_fields[header] = value
834
+
835
+ # Add remaining fields to "其他"
836
+ if other_fields:
837
+ new_row["其他"] = other_fields
838
+ else:
839
+ new_row["其他"] = {}
840
+
841
+ # Convert field names for validation
842
+ if "名称(英文)" in new_row:
843
+ new_row["名称(英文)"] = new_row.pop("名称(英文)")
844
+
845
+ transformed_data.append(new_row)
846
+
847
+ # Success! Break out of the retry loop
848
+ print(f"✅ Successfully processed price list on attempt {attempt + 1}")
849
+ break
850
+
851
+ except json.JSONDecodeError as e:
852
+ error_msg = f"JSON decode error in field mapping: {e}"
853
+ logging.error(f"{error_msg}")
854
+ print(f"❌ {error_msg}")
855
+
856
+ except KeyError as e:
857
+ error_msg = f"KeyError during data transformation: {e}"
858
+ logging.error(f"{error_msg}")
859
+ print(f"❌ {error_msg}")
860
+
861
  except Exception as e:
862
+ error_msg = f"Error processing price list: {e}"
863
+ logging.error(f"{error_msg}")
864
+ print(f"❌ {error_msg}")
865
+
866
+ # Don't retry on the last attempt
867
+ if attempt < max_retries - 1:
868
+ # Add error message to the conversation and retry
869
+ if 'response' in locals():
870
+ messages.append({
871
+ "role": "assistant",
872
+ "content": response.choices[0].message.content
873
+ })
874
+ messages.append({
875
+ "role": "user",
876
+ "content": f"Your response had the following error: {error_msg}. Please fix your mapping and try again."
877
+ })
878
+ else:
879
+ print(f"⚠️ All {max_retries} attempts failed, returning empty result")
880
+ transformed_data = [] # Return empty list after all retries failed
881
 
882
+ # Save to file if requested
883
+ if save_json and transformed_data:
884
+ with open(json_name, "w", encoding="utf-8") as f:
885
+ json.dump(transformed_data, f, ensure_ascii=False, indent=4)
886
+ print(f"✅ Saved to {json_name}")
887
+
888
+ return transformed_data
889
 
 
 
 
 
890
  def json_to_excel(contract_summary, json_data, excel_path):
891
  """Converts extracted JSON tables to an Excel file."""
892
 
 
946
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
947
 
948
  # Process the price list and save it to a JSON file
949
+ price_list = extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
950
 
951
  # Step 4: Combine contract summary and long table data into a single JSON object
952
  print("Combining AI Generated JSON with Extracted Data...")
 
965
 
966
  Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
967
 
968
+ # print(log)
969
+ # print(f"🔄 Extracted Data: {combined_data}")
970
  logging.info(f"""{log}""")
971
 
972
  return combined_data
 
974
  # Example Usage
975
 
976
  # extract_po("test-contract-converted.docx")
977
+ # extract_po("test-contracts\GN-SMBLMCD202501-032WJ SMB联盟菜地PVC球阀等五金物资采购合同-ZHUOKE.docx")
978
 
979
+ # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
980
 
981
  # Gradio Interface ------------------------------
982