Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -16,23 +16,23 @@ import re
|
|
| 16 |
|
| 17 |
import logging
|
| 18 |
|
| 19 |
-
from pydantic import BaseModel, Field, ValidationError, RootModel
|
| 20 |
from typing import List, Optional
|
| 21 |
|
| 22 |
|
| 23 |
HF_API_KEY = os.getenv("HF_API_KEY")
|
| 24 |
|
| 25 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
| 26 |
-
base_url = "https://router.huggingface.co/novita"
|
| 27 |
-
model = "deepseek/deepseek-r1-distill-qwen-14b"
|
| 28 |
|
| 29 |
# Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
|
| 30 |
# base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
|
| 31 |
# model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
|
| 32 |
|
| 33 |
# Qwen 2.5 7B --------------------------------------------------------
|
| 34 |
-
|
| 35 |
-
|
| 36 |
|
| 37 |
# Qwen 2.5 32B --------------------------------------------------------
|
| 38 |
# base_url = "https://router.huggingface.co/novita/v3/openai"
|
|
@@ -530,81 +530,93 @@ Contract data in JSON format:""" + f"""
|
|
| 530 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 531 |
|
| 532 |
|
| 533 |
-
def
|
| 534 |
"""
|
| 535 |
-
Extracts structured price list using
|
| 536 |
-
|
| 537 |
"""
|
| 538 |
|
| 539 |
-
#
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
-
|
| 588 |
-
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
|
| 609 |
messages = [{"role": "user", "content": base_prompt}]
|
| 610 |
|
|
@@ -613,54 +625,268 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
|
|
| 613 |
api_key=HF_API_KEY,
|
| 614 |
)
|
| 615 |
|
| 616 |
-
|
| 617 |
-
|
|
|
|
| 618 |
|
|
|
|
| 619 |
try:
|
|
|
|
| 620 |
response = client.chat.completions.create(
|
| 621 |
model=model,
|
| 622 |
messages=messages,
|
|
|
|
| 623 |
)
|
| 624 |
-
|
|
|
|
| 625 |
|
| 626 |
think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
|
| 627 |
if think_text:
|
| 628 |
print(f"🧠 Thought Process: {think_text}")
|
| 629 |
logging.info(f"Think text: {think_text}")
|
| 630 |
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
#
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 651 |
except Exception as e:
|
| 652 |
-
error_msg = f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 653 |
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
})
|
|
|
|
|
|
|
| 659 |
|
| 660 |
-
print("⚠️ Failed after 3 attempts.")
|
| 661 |
-
return raw
|
| 662 |
-
|
| 663 |
-
|
| 664 |
def json_to_excel(contract_summary, json_data, excel_path):
|
| 665 |
"""Converts extracted JSON tables to an Excel file."""
|
| 666 |
|
|
@@ -720,7 +946,7 @@ def extract_po(docx_path):
|
|
| 720 |
price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
|
| 721 |
|
| 722 |
# Process the price list and save it to a JSON file
|
| 723 |
-
price_list =
|
| 724 |
|
| 725 |
# Step 4: Combine contract summary and long table data into a single JSON object
|
| 726 |
print("Combining AI Generated JSON with Extracted Data...")
|
|
@@ -739,7 +965,8 @@ def extract_po(docx_path):
|
|
| 739 |
|
| 740 |
Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
|
| 741 |
|
| 742 |
-
print(log)
|
|
|
|
| 743 |
logging.info(f"""{log}""")
|
| 744 |
|
| 745 |
return combined_data
|
|
@@ -747,9 +974,9 @@ def extract_po(docx_path):
|
|
| 747 |
# Example Usage
|
| 748 |
|
| 749 |
# extract_po("test-contract-converted.docx")
|
| 750 |
-
# extract_po("test-
|
| 751 |
|
| 752 |
-
# print(
|
| 753 |
|
| 754 |
# Gradio Interface ------------------------------
|
| 755 |
|
|
|
|
| 16 |
|
| 17 |
import logging
|
| 18 |
|
| 19 |
+
from pydantic import BaseModel, Field, ValidationError, RootModel
|
| 20 |
from typing import List, Optional
|
| 21 |
|
| 22 |
|
| 23 |
HF_API_KEY = os.getenv("HF_API_KEY")
|
| 24 |
|
| 25 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
| 26 |
+
# base_url = "https://router.huggingface.co/novita"
|
| 27 |
+
# model = "deepseek/deepseek-r1-distill-qwen-14b"
|
| 28 |
|
| 29 |
# Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
|
| 30 |
# base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
|
| 31 |
# model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
|
| 32 |
|
| 33 |
# Qwen 2.5 7B --------------------------------------------------------
|
| 34 |
+
base_url = "https://router.huggingface.co/together/v1"
|
| 35 |
+
model="Qwen/Qwen2.5-7B-Instruct-Turbo"
|
| 36 |
|
| 37 |
# Qwen 2.5 32B --------------------------------------------------------
|
| 38 |
# base_url = "https://router.huggingface.co/novita/v3/openai"
|
|
|
|
| 530 |
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 531 |
|
| 532 |
|
| 533 |
+
def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
|
| 534 |
"""
|
| 535 |
+
Extracts structured price list by first using AI to map column names to standard keys,
|
| 536 |
+
then programmatically transforming the data to match the Pydantic model.
|
| 537 |
"""
|
| 538 |
|
| 539 |
+
# If price_list is empty, return an empty list
|
| 540 |
+
if not price_list:
|
| 541 |
+
return []
|
| 542 |
+
|
| 543 |
+
# Convert price_list to a list if it's a dict
|
| 544 |
+
if isinstance(price_list, dict):
|
| 545 |
+
# Check if the dict has any items
|
| 546 |
+
if len(price_list) == 0:
|
| 547 |
+
return []
|
| 548 |
+
# Convert to list if it's just a single entry dict
|
| 549 |
+
price_list = [price_list]
|
| 550 |
+
|
| 551 |
+
# Extract a sample row for header mapping
|
| 552 |
+
sample_row = price_list[0] if price_list else {}
|
| 553 |
+
|
| 554 |
+
# If there are no headers, return empty list
|
| 555 |
+
if not sample_row:
|
| 556 |
+
return []
|
| 557 |
+
|
| 558 |
+
# Get the headers directly from the sample row
|
| 559 |
+
extracted_headers = list(sample_row.keys())
|
| 560 |
+
|
| 561 |
+
# Clean double spaces in headers to facilitate AI identification
|
| 562 |
+
def clean_header_spaces(headers):
|
| 563 |
+
"""Clean double spaces in headers to make them more consistent for AI processing."""
|
| 564 |
+
return [re.sub(r'\s+', ' ', header).strip() for header in headers]
|
| 565 |
+
|
| 566 |
+
# Apply the cleaning function to extracted headers
|
| 567 |
+
extracted_headers = clean_header_spaces(extracted_headers)
|
| 568 |
+
|
| 569 |
+
# Define our target fields from the Pydantic model
|
| 570 |
+
target_fields = [
|
| 571 |
+
"序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
|
| 572 |
+
"数量", "单位", "单价", "总价", "几郎单价", "几郎总价",
|
| 573 |
+
"备注", "计划来源"
|
| 574 |
+
]
|
| 575 |
+
|
| 576 |
+
sample_mapping = """Examples of how you should map to guide you, there are other cases so use your own judgement to map the headers to the standard fields:
|
| 577 |
+
- Map "序号" to headers containing "序号No.", "序号 No.",
|
| 578 |
+
- Map "品牌" to headers containing "品牌Brand", "品牌 brand",
|
| 579 |
+
- Map "规格型号" to headers containing "规格型号", "规格 Specification", "Specification and Model", "规格型号Specification and Model", "型号Model"
|
| 580 |
+
- Map "所属机型" to headers containing "所属机型", "Applicable Models"
|
| 581 |
+
- Map "数量" to headers containing "数量Quantity", "数量 Quantity", "Qty"
|
| 582 |
+
- Map "单位" to headers containing "单位Unit", "单位 Unit"
|
| 583 |
+
- Map "单价" to headers containing "单价(元)", "单价(CNY)", "Unit Price (CNY)", "单价Unit Price"
|
| 584 |
+
- Map "总价" to headers containing "总价(元)", "总额(元)", "Amount (CNY)", "Total Amount (CNY)"
|
| 585 |
+
- Map "几郎单价" to headers containing "单价(几郎)", "几郎单价(元)", "Unit Price (GNF)", "单价Unit Price(几郎)(GNF)"
|
| 586 |
+
- Map "几郎总价" to headers containing "总额(几郎)", "几郎总额(元)", "Total Amount (GNF)"
|
| 587 |
+
- Map "备注" to headers containing "备注Remarks", "备注 notes", "Note"
|
| 588 |
+
- Map "计划来源" to headers containing "计划来源Plan No.", "计划来源(唛头信息)", "Planned Source" """
|
| 589 |
+
|
| 590 |
+
# Use AI to map extracted headers to our target fields
|
| 591 |
+
base_prompt = f"""
|
| 592 |
+
You are playing a matching game. Match each and every standard fields to the exactcolumn headers within "" separated by ,.
|
| 593 |
+
USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
|
| 594 |
+
|
| 595 |
+
The standard fields are:
|
| 596 |
+
{json.dumps(target_fields, ensure_ascii=False)}
|
| 597 |
+
|
| 598 |
+
You are given column headers below: (YOU MUST USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING)
|
| 599 |
+
{json.dumps(extracted_headers, ensure_ascii=False)}
|
| 600 |
+
|
| 601 |
+
ENSURE ALL STANDARD FIELDS ARE MAPPED TO THE EXACT COLUMN HEADER INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
|
| 602 |
+
|
| 603 |
+
Return only a JSON mapping in this format WITHOUT any explanations:
|
| 604 |
+
```json
|
| 605 |
+
{{
|
| 606 |
+
"standard_field_1": "column_header_1",
|
| 607 |
+
"standard_field_2": "column_header_2",
|
| 608 |
+
...
|
| 609 |
+
}}
|
| 610 |
+
```
|
| 611 |
+
|
| 612 |
+
Important: Map "名称" AND "名称(英文)" to the SAME extracted header.
|
| 613 |
+
For example, if the extracted header is "名称Name of Materials and Equipment", then:
|
| 614 |
+
{{
|
| 615 |
+
"名称": "名称Name of Materials and Equipment",
|
| 616 |
+
"名称(英文)": "名称Name of Materials and Equipment"
|
| 617 |
+
}}
|
| 618 |
+
|
| 619 |
+
"""
|
| 620 |
|
| 621 |
messages = [{"role": "user", "content": base_prompt}]
|
| 622 |
|
|
|
|
| 625 |
api_key=HF_API_KEY,
|
| 626 |
)
|
| 627 |
|
| 628 |
+
# Add retry logic similar to deepseek_extract_contract_summary
|
| 629 |
+
max_retries = 3
|
| 630 |
+
transformed_data = []
|
| 631 |
|
| 632 |
+
for attempt in range(max_retries):
|
| 633 |
try:
|
| 634 |
+
print(f"🔄 Sending prompt to LLM (attempt {attempt + 1} of {max_retries}: {base_prompt})")
|
| 635 |
response = client.chat.completions.create(
|
| 636 |
model=model,
|
| 637 |
messages=messages,
|
| 638 |
+
temperature=0.1,
|
| 639 |
)
|
| 640 |
+
|
| 641 |
+
raw_mapping = response.choices[0].message.content
|
| 642 |
|
| 643 |
think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
|
| 644 |
if think_text:
|
| 645 |
print(f"🧠 Thought Process: {think_text}")
|
| 646 |
logging.info(f"Think text: {think_text}")
|
| 647 |
|
| 648 |
+
raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
|
| 649 |
+
# Remove any backticks or json tags
|
| 650 |
+
raw_mapping = re.sub(r"```json|```", "", raw_mapping)
|
| 651 |
+
|
| 652 |
+
# Parse the mapping with standard fields as keys
|
| 653 |
+
standard_field_mapping = json.loads(raw_mapping.strip())
|
| 654 |
+
print(f"📊 Standard field mapping: {json.dumps(standard_field_mapping, ensure_ascii=False, indent=2)}")
|
| 655 |
+
|
| 656 |
+
# Function to separate Chinese and English text
|
| 657 |
+
def separate_chinese_english(text):
    """Split a bilingual cell value into its Chinese and English parts.

    The split is heuristic:

    1. Values shaped like ``中文-English``, ``中文: English``, ``中文 English``
       or ``中文(English)`` (ASCII or full-width parentheses) are split at
       the separator.
    2. Otherwise the English part starts at the first Latin letter that
       follows the first Chinese character.  Anything before the first
       Chinese character (e.g. ``PVC`` in ``PVC球阀 PVC ball valve``) is
       treated as a prefix of the Chinese name and joined to it with a
       single space.
    3. If no Latin letter follows the Chinese text there is no English
       part: the whole value is returned as the Chinese name so that
       trailing sizes or numbers (``PVC球阀 50``) are neither reported as
       "English" nor dropped.
    4. Values without any Chinese character are returned as English when
       they contain an ASCII letter, otherwise as the Chinese/plain name.

    Args:
        text: Cell value to split.  Falsy or non-string values produce
            ``("", "")``.

    Returns:
        A ``(chinese, english)`` tuple of strings; either element may be
        empty.
    """
    if not text or not isinstance(text, str):
        return "", ""

    # Clear separator forms first: "中文-English", "中文 English", "中文(English)".
    patterns = [
        r'^([\u4e00-\u9fff\-]+)[:\-\s]+([a-zA-Z].*)$',  # Chinese-English
        # \uff08 / \uff09 are the full-width parentheses common in Chinese text.
        r'^([\u4e00-\u9fff\-]+)[(\uff08]([a-zA-Z].*)[)\uff09]$',  # Chinese(English)
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1), match.group(2)

    # Index of the first Chinese (CJK Unified Ideograph) character, or -1.
    first_chinese_idx = next(
        (i for i, char in enumerate(text) if '\u4e00' <= char <= '\u9fff'),
        -1,
    )

    if first_chinese_idx < 0:
        # No Chinese at all: the value is either English or an opaque token.
        stripped = text.strip()
        if re.search(r'[a-zA-Z]', text):
            return "", stripped
        return stripped, ""

    # First Latin letter at or after the first Chinese character; a single
    # linear scan replaces the nested rescans of the earlier implementation.
    english_start_idx = next(
        (
            j
            for j in range(first_chinese_idx, len(text))
            if 'a' <= text[j].lower() <= 'z'
        ),
        len(text),
    )

    # Latin text *before* the Chinese part belongs to the Chinese name.
    prefix = text[:first_chinese_idx].strip()

    if english_start_idx < len(text):
        chinese_part = text[first_chinese_idx:english_start_idx].strip()
        english_part = text[english_start_idx:].strip()
    else:
        # Nothing after the Chinese text contains a letter, so there is no
        # English name; keep the tail (sizes, numbers) with the Chinese name.
        chinese_part = text[first_chinese_idx:].strip()
        english_part = ""

    if prefix:
        chinese_part = f"{prefix} {chinese_part}"

    return chinese_part, english_part
|
| 779 |
+
|
| 780 |
+
# Process the data based on the standard field mapping
|
| 781 |
+
transformed_data = []
|
| 782 |
+
|
| 783 |
+
for row in price_list:
|
| 784 |
+
new_row = {field: "" for field in target_fields} # Initialize with empty strings
|
| 785 |
+
other_fields = {}
|
| 786 |
+
|
| 787 |
+
# Step 1: Handle name fields first - look for any field with "名称" or "name"
|
| 788 |
+
for header, value in row.items():
|
| 789 |
+
# Clean the header for comparison
|
| 790 |
+
cleaned_header = re.sub(r'\s+', ' ', header).strip()
|
| 791 |
+
header_lower = cleaned_header.lower()
|
| 792 |
+
|
| 793 |
+
if ("名称" in header_lower or "name" in header_lower) and value:
|
| 794 |
+
# If field contains both Chinese and English, separate them
|
| 795 |
+
if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
|
| 796 |
+
chinese, english = separate_chinese_english(value)
|
| 797 |
+
if chinese:
|
| 798 |
+
new_row["名称"] = chinese
|
| 799 |
+
if english:
|
| 800 |
+
new_row["名称(英文)"] = english
|
| 801 |
+
print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
|
| 802 |
+
else:
|
| 803 |
+
# Just set the name directly
|
| 804 |
+
new_row["名称"] = value
|
| 805 |
+
break # Stop after finding first name field
|
| 806 |
+
|
| 807 |
+
# Step 2: Fill in all other fields using standard mapping
|
| 808 |
+
for header, value in row.items():
|
| 809 |
+
# Skip empty values
|
| 810 |
+
if not value:
|
| 811 |
+
continue
|
| 812 |
+
|
| 813 |
+
# Clean the header for comparison
|
| 814 |
+
cleaned_header = re.sub(r'\s+', ' ', header).strip()
|
| 815 |
+
|
| 816 |
+
# Check if this maps to a standard field
|
| 817 |
+
matched_field = None
|
| 818 |
+
for std_field, mapped_header in standard_field_mapping.items():
|
| 819 |
+
# Make comparison more flexible by lowercasing and stripping spaces
|
| 820 |
+
if mapped_header.lower().strip() == cleaned_header.lower().strip():
|
| 821 |
+
matched_field = std_field
|
| 822 |
+
break
|
| 823 |
+
|
| 824 |
+
# If we found a mapping, use it (but don't overwrite name fields)
|
| 825 |
+
if matched_field:
|
| 826 |
+
if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
|
| 827 |
+
new_row[matched_field] = value
|
| 828 |
+
# If no mapping found, add to other_fields
|
| 829 |
+
else:
|
| 830 |
+
# Skip name fields we already processed
|
| 831 |
+
header_lower = cleaned_header.lower()
|
| 832 |
+
if not ("名称" in header_lower or "name" in header_lower):
|
| 833 |
+
other_fields[header] = value
|
| 834 |
+
|
| 835 |
+
# Add remaining fields to "其他"
|
| 836 |
+
if other_fields:
|
| 837 |
+
new_row["其他"] = other_fields
|
| 838 |
+
else:
|
| 839 |
+
new_row["其他"] = {}
|
| 840 |
+
|
| 841 |
+
# Convert field names for validation
|
| 842 |
+
if "名称(英文)" in new_row:
|
| 843 |
+
new_row["名称(英文)"] = new_row.pop("名称(英文)")
|
| 844 |
+
|
| 845 |
+
transformed_data.append(new_row)
|
| 846 |
+
|
| 847 |
+
# Success! Break out of the retry loop
|
| 848 |
+
print(f"✅ Successfully processed price list on attempt {attempt + 1}")
|
| 849 |
+
break
|
| 850 |
+
|
| 851 |
+
except json.JSONDecodeError as e:
|
| 852 |
+
error_msg = f"JSON decode error in field mapping: {e}"
|
| 853 |
+
logging.error(f"{error_msg}")
|
| 854 |
+
print(f"❌ {error_msg}")
|
| 855 |
+
|
| 856 |
+
except KeyError as e:
|
| 857 |
+
error_msg = f"KeyError during data transformation: {e}"
|
| 858 |
+
logging.error(f"{error_msg}")
|
| 859 |
+
print(f"❌ {error_msg}")
|
| 860 |
+
|
| 861 |
except Exception as e:
|
| 862 |
+
error_msg = f"Error processing price list: {e}"
|
| 863 |
+
logging.error(f"{error_msg}")
|
| 864 |
+
print(f"❌ {error_msg}")
|
| 865 |
+
|
| 866 |
+
# Don't retry on the last attempt
|
| 867 |
+
if attempt < max_retries - 1:
|
| 868 |
+
# Add error message to the conversation and retry
|
| 869 |
+
if 'response' in locals():
|
| 870 |
+
messages.append({
|
| 871 |
+
"role": "assistant",
|
| 872 |
+
"content": response.choices[0].message.content
|
| 873 |
+
})
|
| 874 |
+
messages.append({
|
| 875 |
+
"role": "user",
|
| 876 |
+
"content": f"Your response had the following error: {error_msg}. Please fix your mapping and try again."
|
| 877 |
+
})
|
| 878 |
+
else:
|
| 879 |
+
print(f"⚠️ All {max_retries} attempts failed, returning empty result")
|
| 880 |
+
transformed_data = [] # Return empty list after all retries failed
|
| 881 |
|
| 882 |
+
# Save to file if requested
|
| 883 |
+
if save_json and transformed_data:
|
| 884 |
+
with open(json_name, "w", encoding="utf-8") as f:
|
| 885 |
+
json.dump(transformed_data, f, ensure_ascii=False, indent=4)
|
| 886 |
+
print(f"✅ Saved to {json_name}")
|
| 887 |
+
|
| 888 |
+
return transformed_data
|
| 889 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 890 |
def json_to_excel(contract_summary, json_data, excel_path):
|
| 891 |
"""Converts extracted JSON tables to an Excel file."""
|
| 892 |
|
|
|
|
| 946 |
price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
|
| 947 |
|
| 948 |
# Process the price list and save it to a JSON file
|
| 949 |
+
price_list = extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
|
| 950 |
|
| 951 |
# Step 4: Combine contract summary and long table data into a single JSON object
|
| 952 |
print("Combining AI Generated JSON with Extracted Data...")
|
|
|
|
| 965 |
|
| 966 |
Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
|
| 967 |
|
| 968 |
+
# print(log)
|
| 969 |
+
# print(f"🔄 Extracted Data: {combined_data}")
|
| 970 |
logging.info(f"""{log}""")
|
| 971 |
|
| 972 |
return combined_data
|
|
|
|
| 974 |
# Example Usage
|
| 975 |
|
| 976 |
# extract_po("test-contract-converted.docx")
|
| 977 |
+
# extract_po("test-contracts\GN-SMBLMCD202501-032WJ SMB联盟菜地PVC球阀等五金物资采购合同-ZHUOKE.docx")
|
| 978 |
|
| 979 |
+
# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管) PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根,SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价(元) Unit Price (CNY)': '106.00', '总额(元) Total Amount (CNY)': '1080.00', '几郎单价(元) Unit Price (GNF)': '16.21', '几郎总额(元) Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
|
| 980 |
|
| 981 |
# Gradio Interface ------------------------------
|
| 982 |
|