MikeMai commited on
Commit
fd35ba2
·
verified ·
1 Parent(s): c61bff0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +443 -316
app.py CHANGED
@@ -38,6 +38,10 @@ model="Qwen/Qwen2.5-7B-Instruct-Turbo"
38
  # base_url = "https://router.huggingface.co/novita/v3/openai"
39
  # model="qwen/qwen-2.5-72b-instruct"
40
 
 
 
 
 
41
  # Configure logging to write to 'zaoju_logs.log' without using pickle
42
  logging.basicConfig(
43
  filename='extract_po_logs.log',
@@ -79,9 +83,21 @@ def extract_text_from_cell(cell):
79
  def clean_spaces(text):
80
  """
81
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
 
82
  """
83
- # Remove spaces **between** Chinese characters but keep English spaces
 
 
 
84
  text = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', text)
 
 
 
 
 
 
 
 
85
  return text.strip()
86
 
87
  def extract_key_value_pairs(text, target_dict=None):
@@ -196,6 +212,39 @@ def process_summary_table(rows):
196
 
197
  return extracted_data
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  def extract_headers(first_row_cells):
200
  """Extracts unique column headers from the first row of a table."""
201
  headers = []
@@ -266,13 +315,24 @@ def process_long_table(rows):
266
 
267
  table_data.append(row_data)
268
 
 
 
 
 
 
 
 
 
 
 
269
  # Filter out rows where the "序号" column contains non-numeric values
270
  filtered_table_data = []
271
- for row in table_data:
272
- # Check if any cell contains "合计" (total)
 
273
  contains_total = False
274
  for key, value in row.items():
275
- if isinstance(value, str) and "合计" in value:
276
  contains_total = True
277
  break
278
 
@@ -303,6 +363,27 @@ def process_long_table(rows):
303
 
304
  return filtered_table_data
305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
  def extract_tables(root):
307
  """Extracts tables from the DOCX document and returns structured data."""
308
  tables = root.findall('.//w:tbl', NS)
@@ -317,42 +398,34 @@ def extract_tables(root):
317
  for paragraph in table.findall('.//w:p', NS):
318
  table_paragraphs.add(paragraph)
319
 
320
- first_row_cells = rows[0].findall('.//w:tc', NS)
321
- num_columns = len(first_row_cells)
322
 
323
- if num_columns == 1:
324
  single_column_data = process_single_column_table(rows)
325
  if single_column_data:
326
  table_data[f"table_{table_index}_single_column"] = single_column_data
327
- continue # Skip further processing for this table
328
-
329
- summary_start_index = None
330
- for i, row in enumerate(rows):
331
- if len(row.findall('.//w:tc', NS)) == 2:
332
- summary_start_index = i
333
- break
334
-
335
- long_table_data = []
336
- summary_data = []
337
-
338
- if summary_start_index is not None and summary_start_index > 0:
339
- long_table_data = process_long_table(rows[:summary_start_index])
340
- elif summary_start_index is None:
341
- long_table_data = process_long_table(rows)
342
-
343
- if summary_start_index is not None:
344
- is_buyer_seller_table = all(len(row.findall('.//w:tc', NS)) == 2 for row in rows)
345
- if is_buyer_seller_table:
346
- buyer_seller_data = process_buyer_seller_table(rows)
347
- if buyer_seller_data:
348
- table_data[f"table_{table_index}_buyer_seller"] = buyer_seller_data
349
- else:
350
- summary_data = process_summary_table(rows[summary_start_index:])
351
-
352
- if long_table_data:
353
- table_data[f"long_table_{table_index}"] = long_table_data
354
- if summary_data:
355
- table_data[f"long_table_{table_index}_summary"] = summary_data
356
 
357
  return table_data, table_paragraphs
358
 
@@ -532,8 +605,7 @@ Contract data in JSON format:""" + f"""
532
 
533
  def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
534
  """
535
- Extracts structured price list by first using AI to map column names to standard keys,
536
- then programmatically transforming the data to match the Pydantic model.
537
  """
538
 
539
  # If price_list is empty, return an empty list
@@ -558,10 +630,35 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
558
  # Get the headers directly from the sample row
559
  extracted_headers = list(sample_row.keys())
560
 
561
- # Clean double spaces in headers to facilitate AI identification
562
  def clean_header_spaces(headers):
563
- """Clean double spaces in headers to make them more consistent for AI processing."""
564
- return [re.sub(r'\s+', ' ', header).strip() for header in headers]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
 
566
  # Apply the cleaning function to extracted headers
567
  extracted_headers = clean_header_spaces(extracted_headers)
@@ -572,31 +669,92 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json")
572
  "数量", "单位", "单价", "总价", "几郎单价", "几郎总价",
573
  "备注", "计划来源"
574
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
- sample_mapping = """Examples of how you should map to guide you, there are other cases so use your own judgement to map the headers to the standard fields:
577
- - Map "序号" to headers containing "序号No.", "序号 No.",
578
- - Map "品牌" to headers containing "品牌Brand", "品牌 brand",
579
- - Map "规格型号" to headers containing "规格型号", "规格 Specification", "Specification and Model", "规格型号Specification and Model", "型号Model"
580
- - Map "所属机型" to headers containing "所属机型", "Applicable Models"
581
- - Map "数量" to headers containing "数量Quantity", "数量 Quantity", "Qty"
582
- - Map "单位" to headers containing "单位Unit", "单位 Unit"
583
- - Map "单价" to headers containing "单价(元)", "单价(CNY)", "Unit Price (CNY)", "单价Unit Price"
584
- - Map "总价" to headers containing "总价(元)", "总额(元)", "Amount (CNY)", "Total Amount (CNY)"
585
- - Map "几郎单价" to headers containing "单价(几郎)", "几郎单价(元)", "Unit Price (GNF)", "单价Unit Price(几郎)(GNF)"
586
- - Map "几郎总价" to headers containing "总额(几郎)", "几郎总额(元)", "Total Amount (GNF)"
587
- - Map "备注" to headers containing "备注Remarks", "备注 notes", "Note"
588
- - Map "计划来源" to headers containing "计划来源Plan No.", "计划来源(唛头信息)", "Planned Source" """
589
-
590
- # Use AI to map extracted headers to our target fields
591
- base_prompt = f"""
592
- You are playing a matching game. Match each and every standard fields to the exactcolumn headers within "" separated by ,.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
594
 
595
- The standard fields are:
596
- {json.dumps(target_fields, ensure_ascii=False)}
597
 
598
  You are given column headers below: (YOU MUST USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING)
599
- {json.dumps(extracted_headers, ensure_ascii=False)}
600
 
601
  ENSURE ALL STANDARD FIELDS ARE MAPPED TO THE EXACT COLUMN HEADER INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
602
 
@@ -615,269 +773,206 @@ For example, if the extracted header is "名称Name of Materials and Equipment",
615
  "名称": "名称Name of Materials and Equipment",
616
  "名称(英文)": "名称Name of Materials and Equipment"
617
  }}
618
-
619
  """
620
 
621
- messages = [{"role": "user", "content": base_prompt}]
622
-
623
- client = OpenAI(
624
- base_url=base_url,
625
- api_key=HF_API_KEY,
626
- )
627
-
628
- # Add retry logic similar to deepseek_extract_contract_summary
629
- max_retries = 3
630
- transformed_data = []
631
-
632
- for attempt in range(max_retries):
633
- try:
634
- print(f"🔄 Sending prompt to LLM (attempt {attempt + 1} of {max_retries}: {base_prompt})")
635
- response = client.chat.completions.create(
636
- model=model,
637
- messages=messages,
638
- temperature=0.1,
639
- )
640
-
641
- raw_mapping = response.choices[0].message.content
642
-
643
- think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
644
- if think_text:
645
- print(f"🧠 Thought Process: {think_text}")
646
- logging.info(f"Think text: {think_text}")
647
-
648
- raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
649
- # Remove any backticks or json tags
650
- raw_mapping = re.sub(r"```json|```", "", raw_mapping)
651
-
652
- # Parse the mapping with standard fields as keys
653
- standard_field_mapping = json.loads(raw_mapping.strip())
654
- print(f"📊 Standard field mapping: {json.dumps(standard_field_mapping, ensure_ascii=False, indent=2)}")
655
-
656
- # Function to separate Chinese and English text
657
- def separate_chinese_english(text):
658
- if not text or not isinstance(text, str):
659
- return "", ""
660
-
661
- # First check if there's a clear separator like hyphen or space
662
- # Common patterns: "中文-English", "中文(English)", "中文 English"
663
- patterns = [
664
- r'^([\u4e00-\u9fff\-]+)[:\-\s]+([a-zA-Z].*)$', # Chinese-English
665
- r'^([\u4e00-\u9fff\-]+)[\((]([a-zA-Z].*)[\))]$', # Chinese(English)
666
- ]
667
-
668
- for pattern in patterns:
669
- match = re.search(pattern, text)
670
- if match:
671
- return match.group(1), match.group(2)
672
-
673
- # Find the first Chinese character index
674
- first_chinese_idx = -1
675
- for i, char in enumerate(text):
676
- if '\u4e00' <= char <= '\u9fff': # Chinese character
677
- first_chinese_idx = i
678
- break
679
-
680
- # Find where English starts after Chinese
681
- english_start_idx = len(text)
682
- if first_chinese_idx >= 0:
683
- # Search for the first English character that comes after Chinese
684
- for i in range(first_chinese_idx, len(text)):
685
- # Skip to the end of Chinese characters
686
- if '\u4e00' <= text[i] <= '\u9fff':
687
- continue
688
-
689
- # Look ahead for English characters
690
- for j in range(i, len(text)):
691
- if 'a' <= text[j].lower() <= 'z':
692
- english_start_idx = j
693
- break
694
- if english_start_idx < len(text):
695
- break
696
-
697
- # If we found the boundaries
698
- if first_chinese_idx >= 0 and english_start_idx < len(text):
699
- # Handle prefix: any Latin characters before Chinese should be part of Chinese name
700
- prefix = text[:first_chinese_idx].strip() if first_chinese_idx > 0 else ""
701
- chinese_part = text[first_chinese_idx:english_start_idx].strip()
702
- english_part = text[english_start_idx:].strip()
703
-
704
- # Combine prefix with Chinese part
705
- if prefix:
706
- chinese_part = f"{prefix} {chinese_part}"
707
-
708
- return chinese_part, english_part
709
-
710
- # Special case for prefix like "PVC" with no space before Chinese
711
- if first_chinese_idx > 0:
712
- prefix = text[:first_chinese_idx].strip()
713
- rest_of_text = text[first_chinese_idx:]
714
-
715
- # Extract Chinese and English from the rest of the text
716
- chinese_chars = []
717
- english_chars = []
718
- in_chinese = True
719
-
720
- for char in rest_of_text:
721
- if '\u4e00' <= char <= '\u9fff': # Chinese character
722
- if not in_chinese and english_chars: # If we've already seen English, something is wrong
723
- chinese_chars = []
724
- english_chars = []
725
- break
726
- chinese_chars.append(char)
727
- in_chinese = True
728
- elif 'a' <= char.lower() <= 'z' or char in ' -_()': # English or separator
729
- if in_chinese and chinese_chars: # We've seen Chinese and now see English
730
- english_chars.append(char)
731
- in_chinese = False
732
- elif not in_chinese: # Continue collecting English
733
- english_chars.append(char)
734
- else: # No Chinese seen yet, might be part of prefix
735
- chinese_chars.append(char)
736
- else: # Other characters (numbers, etc.)
737
- if in_chinese:
738
- chinese_chars.append(char)
739
- else:
740
- english_chars.append(char)
741
-
742
- if chinese_chars and english_chars:
743
- chinese_text = prefix + " " + ''.join(chinese_chars).strip()
744
- english_text = ''.join(english_chars).strip()
745
- return chinese_text, english_text
746
- else:
747
- # No clean separation possible
748
- return prefix + " " + rest_of_text, ""
749
 
750
- # Fallback: Try simple pattern matching
751
- # Find all Chinese characters
752
- chinese_chars = re.findall(r'[\u4e00-\u9fff]+', text)
753
- chinese = ''.join(chinese_chars)
754
 
755
- # If we have Chinese, extract everything up to the last Chinese character
756
- if chinese:
757
- last_chinese_idx = text.rindex(chinese_chars[-1]) + len(chinese_chars[-1])
758
-
759
- # Anything before the first Chinese character is a prefix
760
- first_chinese_idx = text.index(chinese_chars[0])
761
- prefix = text[:first_chinese_idx].strip()
762
-
763
- # Everything after the last Chinese character is English
764
- chinese_part = prefix + " " + text[first_chinese_idx:last_chinese_idx].strip() if prefix else text[first_chinese_idx:last_chinese_idx].strip()
765
- english_part = text[last_chinese_idx:].strip()
766
-
767
- # If English part doesn't actually contain English letters, treat it as empty
768
- if not re.search(r'[a-zA-Z]', english_part):
769
- english_part = ""
770
-
771
- return chinese_part, english_part
772
 
773
- # No Chinese characters found, check if there are any English letters
774
- if re.search(r'[a-zA-Z]', text):
775
- return "", text.strip()
776
 
777
- # No clear separation possible
778
- return text.strip(), ""
779
-
780
- # Process the data based on the standard field mapping
781
- transformed_data = []
782
-
783
- for row in price_list:
784
- new_row = {field: "" for field in target_fields} # Initialize with empty strings
785
- other_fields = {}
786
-
787
- # Step 1: Handle name fields first - look for any field with "名称" or "name"
788
- for header, value in row.items():
789
- # Clean the header for comparison
790
- cleaned_header = re.sub(r'\s+', ' ', header).strip()
791
- header_lower = cleaned_header.lower()
792
-
793
- if ("名称" in header_lower or "name" in header_lower) and value:
794
- # If field contains both Chinese and English, separate them
795
- if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
796
- chinese, english = separate_chinese_english(value)
797
- if chinese:
798
- new_row["名称"] = chinese
799
- if english:
800
- new_row["名称(英文)"] = english
801
- print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
802
- else:
803
- # Just set the name directly
804
- new_row["名称"] = value
805
- break # Stop after finding first name field
806
-
807
- # Step 2: Fill in all other fields using standard mapping
808
- for header, value in row.items():
809
- # Skip empty values
810
- if not value:
811
  continue
812
-
813
- # Clean the header for comparison
814
- cleaned_header = re.sub(r'\s+', ' ', header).strip()
815
-
816
- # Check if this maps to a standard field
817
- matched_field = None
818
- for std_field, mapped_header in standard_field_mapping.items():
819
- # Make comparison more flexible by lowercasing and stripping spaces
820
- if mapped_header.lower().strip() == cleaned_header.lower().strip():
821
- matched_field = std_field
822
- break
823
-
824
- # If we found a mapping, use it (but don't overwrite name fields)
825
- if matched_field:
826
- if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
827
- new_row[matched_field] = value
828
- # If no mapping found, add to other_fields
829
- else:
830
- # Skip name fields we already processed
831
- header_lower = cleaned_header.lower()
832
- if not ("名称" in header_lower or "name" in header_lower):
833
- other_fields[header] = value
834
-
835
- # Add remaining fields to "其他"
836
- if other_fields:
837
- new_row["其他"] = other_fields
838
  else:
839
- new_row["其他"] = {}
 
 
 
 
 
 
840
 
841
- # Convert field names for validation
842
- if "名称(英文)" in new_row:
843
- new_row["名称(英文)"] = new_row.pop("名称(英文)")
 
844
 
845
- transformed_data.append(new_row)
846
-
847
- # Success! Break out of the retry loop
848
- print(f" Successfully processed price list on attempt {attempt + 1}")
849
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
 
851
- except json.JSONDecodeError as e:
852
- error_msg = f"JSON decode error in field mapping: {e}"
853
- logging.error(f"{error_msg}")
854
- print(f"❌ {error_msg}")
 
 
 
 
 
 
 
 
 
855
 
856
- except KeyError as e:
857
- error_msg = f"KeyError during data transformation: {e}"
858
- logging.error(f"{error_msg}")
859
- print(f"❌ {error_msg}")
 
860
 
861
- except Exception as e:
862
- error_msg = f"Error processing price list: {e}"
863
- logging.error(f"{error_msg}")
864
- print(f"❌ {error_msg}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865
 
866
- # Don't retry on the last attempt
867
- if attempt < max_retries - 1:
868
- # Add error message to the conversation and retry
869
- if 'response' in locals():
870
- messages.append({
871
- "role": "assistant",
872
- "content": response.choices[0].message.content
873
- })
874
- messages.append({
875
- "role": "user",
876
- "content": f"Your response had the following error: {error_msg}. Please fix your mapping and try again."
877
- })
878
  else:
879
- print(f"⚠️ All {max_retries} attempts failed, returning empty result")
880
- transformed_data = [] # Return empty list after all retries failed
 
 
 
 
 
881
 
882
  # Save to file if requested
883
  if save_json and transformed_data:
@@ -906,6 +1001,30 @@ def json_to_excel(contract_summary, json_data, excel_path):
906
  contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
907
  long_table.to_excel(writer, sheet_name="Price List", index=False)
908
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  #--- Extract PO ------------------------------
910
 
911
  def extract_po(docx_path):
@@ -930,6 +1049,7 @@ def extract_po(docx_path):
930
  print("Extracting XML data to JSON...")
931
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
932
  extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
 
933
 
934
  # Step 3: Process JSON with OpenAI to get structured output
935
  print("Processing Contract Summary data with AI...")
@@ -938,17 +1058,17 @@ def extract_po(docx_path):
938
 
939
  # Find the last long table (excluding summary tables)
940
  print("Processing Price List data with AI...")
941
- long_tables = [
942
- table for key, table in json.loads(extracted_data).items()
943
- if "long_table" in key and "summary" not in key
944
- ]
945
- last_long_table = long_tables[-1] if long_tables else {}
946
-
947
  # Generate the price list filename in the same folder as the document
948
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
949
 
950
  # Process the price list and save it to a JSON file
951
- price_list = extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
952
 
953
  # Step 4: Combine contract summary and long table data into a single JSON object
954
  print("Combining AI Generated JSON with Extracted Data...")
@@ -985,6 +1105,13 @@ def extract_po(docx_path):
985
  import gradio as gr
986
  from gradio.themes.base import Base
987
 
 
 
 
 
 
 
 
988
  interface = gr.Interface(
989
  fn=extract_po,
990
  title="PO Extractor 买卖合同数据提取",
 
38
  # base_url = "https://router.huggingface.co/novita/v3/openai"
39
  # model="qwen/qwen-2.5-72b-instruct"
40
 
41
+ # Qwen 3 32B --------------------------------------------------------
42
+ # base_url = "https://router.huggingface.co/sambanova/v1"
43
+ # model="Qwen3-32B"
44
+
45
  # Configure logging to write to 'zaoju_logs.log' without using pickle
46
  logging.basicConfig(
47
  filename='extract_po_logs.log',
 
83
  def clean_spaces(text):
84
  """
85
  Removes excessive spaces between Chinese characters while preserving spaces in English words.
86
+ Also normalizes multiple spaces to single space and ensures one space between Chinese and English.
87
  """
88
+ if not text or not isinstance(text, str):
89
+ return text
90
+
91
+ # Remove spaces between Chinese characters
92
  text = re.sub(r'([\u4e00-\u9fff])\s+([\u4e00-\u9fff])', r'\1\2', text)
93
+
94
+ # Ensure one space between Chinese and English
95
+ text = re.sub(r'([\u4e00-\u9fff])\s*([a-zA-Z])', r'\1 \2', text)
96
+ text = re.sub(r'([a-zA-Z])\s*([\u4e00-\u9fff])', r'\1 \2', text)
97
+
98
+ # Normalize multiple spaces to single space
99
+ text = re.sub(r'\s+', ' ', text)
100
+
101
  return text.strip()
102
 
103
  def extract_key_value_pairs(text, target_dict=None):
 
212
 
213
  return extracted_data
214
 
215
+ def clean_header_spaces(headers):
216
+ """
217
+ Cleans headers for consistent matching by:
218
+ 1. Normalizing multiple spaces to single space
219
+ 2. Ensuring exactly one space between Chinese and English
220
+ 3. Converting to lowercase
221
+ """
222
+ if not headers:
223
+ return headers
224
+
225
+ cleaned_headers = []
226
+ for header in headers:
227
+ if not header:
228
+ cleaned_headers.append(header)
229
+ continue
230
+
231
+ # Normalize multiple spaces to single space
232
+ header = re.sub(r'\s+', ' ', header)
233
+
234
+ # Ensure exactly one space between Chinese and English
235
+ header = re.sub(r'([\u4e00-\u9fff])\s*([a-zA-Z])', r'\1 \2', header)
236
+ header = re.sub(r'([a-zA-Z])\s*([\u4e00-\u9fff])', r'\1 \2', header)
237
+
238
+ # Final cleanup of any remaining multiple spaces
239
+ header = re.sub(r'\s+', ' ', header)
240
+
241
+ # Convert to lowercase
242
+ header = header.lower()
243
+
244
+ cleaned_headers.append(header.strip())
245
+
246
+ return cleaned_headers
247
+
248
  def extract_headers(first_row_cells):
249
  """Extracts unique column headers from the first row of a table."""
250
  headers = []
 
315
 
316
  table_data.append(row_data)
317
 
318
+ # Clean the keys in the table data
319
+ cleaned_table_data = []
320
+ for row in table_data:
321
+ cleaned_row = {}
322
+ for key, value in row.items():
323
+ # Clean the key using the same function we use for headers
324
+ cleaned_key = clean_header_spaces([key])[0]
325
+ cleaned_row[cleaned_key] = value
326
+ cleaned_table_data.append(cleaned_row)
327
+
328
  # Filter out rows where the "序号" column contains non-numeric values
329
  filtered_table_data = []
330
+ for row in cleaned_table_data:
331
+
332
+ # Check if any cell contains "合计" (total) or "折扣" (discount)
333
  contains_total = False
334
  for key, value in row.items():
335
+ if isinstance(value, str) and ("合计" in value or "折扣" in value):
336
  contains_total = True
337
  break
338
 
 
363
 
364
  return filtered_table_data
365
 
366
+ def identify_table_type_and_header_row(rows):
367
+ """Identify table type and the index of the header row."""
368
+ header_keywords = ["名称", "Name", "规格", "Unit", "Quantity", "单价", "总价", "Remarks"]
369
+ for i, row in enumerate(rows):
370
+ num_cells = len(row.findall('.//w:tc', NS))
371
+ if num_cells > 1:
372
+ cell_texts = " ".join([" ".join(extract_text_from_cell(cell)) for cell in row.findall('.//w:tc', NS)])
373
+ if any(keyword in cell_texts for keyword in header_keywords):
374
+ # Check for buyer-seller or summary table
375
+ if num_cells == 2:
376
+ if all(len(r.findall('.//w:tc', NS)) == 2 for r in rows):
377
+ return "buyer_seller", i
378
+ else:
379
+ return "summary", i
380
+ else:
381
+ return "long_table", i
382
+ # Fallbacks
383
+ if all(len(row.findall('.//w:tc', NS)) == 1 for row in rows):
384
+ return "single_column", 0
385
+ return "unknown", 0
386
+
387
  def extract_tables(root):
388
  """Extracts tables from the DOCX document and returns structured data."""
389
  tables = root.findall('.//w:tbl', NS)
 
398
  for paragraph in table.findall('.//w:p', NS):
399
  table_paragraphs.add(paragraph)
400
 
401
+ table_type, header_row_index = identify_table_type_and_header_row(rows)
 
402
 
403
+ if table_type == "single_column":
404
  single_column_data = process_single_column_table(rows)
405
  if single_column_data:
406
  table_data[f"table_{table_index}_single_column"] = single_column_data
407
+ continue
408
+ elif table_type == "buyer_seller":
409
+ buyer_seller_data = process_buyer_seller_table(rows[header_row_index:])
410
+ if buyer_seller_data:
411
+ table_data[f"table_{table_index}_buyer_seller"] = buyer_seller_data
412
+ continue
413
+ elif table_type == "summary":
414
+ summary_data = process_summary_table(rows[header_row_index:])
415
+ if summary_data:
416
+ table_data[f"table_{table_index}_summary"] = summary_data
417
+ continue
418
+ elif table_type == "long_table":
419
+ long_table_data = process_long_table(rows[header_row_index:])
420
+ if long_table_data:
421
+ table_data[f"long_table_{table_index}"] = long_table_data
422
+ continue
423
+ else:
424
+ # fallback: try to process as long table from first multi-column row
425
+ long_table_data = process_long_table(rows[header_row_index:])
426
+ if long_table_data:
427
+ table_data[f"long_table_{table_index}"] = long_table_data
428
+ continue
 
 
 
 
 
 
 
429
 
430
  return table_data, table_paragraphs
431
 
 
605
 
606
  def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
607
  """
608
+ Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
 
609
  """
610
 
611
  # If price_list is empty, return an empty list
 
630
  # Get the headers directly from the sample row
631
  extracted_headers = list(sample_row.keys())
632
 
633
+ # Clean double spaces in headers to facilitate matching
634
  def clean_header_spaces(headers):
635
+ """
636
+ Cleans headers for consistent matching by:
637
+ 1. Normalizing multiple spaces to single space
638
+ 2. Ensuring exactly one space between Chinese and English
639
+ """
640
+ if not headers:
641
+ return headers
642
+
643
+ cleaned_headers = []
644
+ for header in headers:
645
+ if not header:
646
+ cleaned_headers.append(header)
647
+ continue
648
+
649
+ # Normalize multiple spaces to single space
650
+ header = re.sub(r'\s+', ' ', header)
651
+
652
+ # Ensure exactly one space between Chinese and English
653
+ header = re.sub(r'([\u4e00-\u9fff])\s*([a-zA-Z])', r'\1 \2', header)
654
+ header = re.sub(r'([a-zA-Z])\s*([\u4e00-\u9fff])', r'\1 \2', header)
655
+
656
+ # Final cleanup of any remaining multiple spaces
657
+ header = re.sub(r'\s+', ' ', header)
658
+
659
+ cleaned_headers.append(header.strip())
660
+
661
+ return cleaned_headers
662
 
663
  # Apply the cleaning function to extracted headers
664
  extracted_headers = clean_header_spaces(extracted_headers)
 
669
  "数量", "单位", "单价", "总价", "几郎单价", "几郎总价",
670
  "备注", "计划来源"
671
  ]
672
+
673
+ # Hardcoded mapping dictionary
674
+ hardcoded_mapping = {
675
+ # 序号 mappings
676
+ "序号": ["序号 no.", "序号 no", "no.", "no", "序号no.", "序号no", "序号 item", "序号item", "序号"],
677
+ # 名称 mappings
678
+ "名称": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "名称", "产品名称 product name"],
679
+ # 名称(英文) mappings
680
+ "名称(英文)": ["名称 name", "名称name", "name", "名称name of materials", "名称name of materials and equipment", "名称 name of materials", "名称 name of materials and equipment", "单价(欧元) unit price(eur)", "名称", "产品名称 product name", "单价(元)unit price(cny)"],
681
+ # 品牌 mappings
682
+ "品牌": ["品牌 brand", "品牌brand", "brand", "品牌 brand", "品牌brand", "品牌"],
683
+ # 规格型号 mappings
684
+ "规格型号": ["规格型号 specification", "规格型号specification", "规格 specification", "规格specification",
685
+ "specification", "规格型号specification and model", "型号model", "型号 model", "规格型号 specification and model", "规格型号"],
686
+ # 所属机型 mappings
687
+ "所属机型": ["所属机型 applicable models", "所属机型applicable models", "applicable models", "所属机型"],
688
+ # 数量 mappings
689
+ "数量": ["数量 quantity", "数量quantity", "quantity", "qty", "数量qty", "数量"],
690
+ # 单位 mappings
691
+ "单位": ["单位 unit", "单位unit", "unit", "单位"],
692
+ # 单价 mappings
693
+ "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price",
694
+ "单价(元)", "单价(cny)", "单价 unit price (cny)", "单价(欧元) unit price(eur)", "单价", "单价(元) unit price(cny)", "单价(元)unit price(cny)"],
695
+ # 总价 mappings
696
+ "总价": ["总价 total amount (cny)", "总价total amount (cny)", "total amount (cny)", "总价total amount", "总价 total amount",
697
+ "总价(元)", "总额(元)", "总价 total amount (cny)", "总价(欧元) amount(eur)", "总价", "总价(元)amount (cny)", "总价(元)amount(cny)"],
698
+ # 几郎单价 mappings
699
+ "几郎单价": ["几郎单价 unit price (gnf)", "几郎单价unit price (gnf)", "unit price (gnf)", "几郎单价unit price", "几郎单价 unit price",
700
+ "几郎单价(元)", "单价(几郎)", "几郎单价 unit price (gnf)", "几郎单价", "单价 unit price(几郎)(gnf)", "单价(元)unit price(cny)"],
701
+ # 几郎总价 mappings
702
+ "几郎总价": ["几郎总价 total amount (gnf)", "几郎总价total amount (gnf)", "total amount (gnf)", "几郎总价total amount", "几郎总价 total amount",
703
+ "几郎总价(元)", "总额(几郎)", "几郎总价 total amount (gnf)", "几郎总价", "总额 total amount(几郎)(gnf)", "总价(元)amount(cny)"],
704
+ # 备注 mappings
705
+ "备注": ["备注 remarks", "备注remarks", "remarks", "备注 notes", "备注notes", "note", "备注"],
706
+ # 计划来源 mappings
707
+ "计划来源": ["计划来源 plan no.", "计划来源plan no.", "计划来源(唛头信息)",
708
+ "计划来源 planned source", "计划来源planned source", "planned source", "计划来源"]
709
+ }
710
+
711
+ # Try to map headers using hardcoded mapping
712
+ standard_field_mapping = {}
713
+ unmapped_headers = []
714
+
715
+ # Clean the extracted headers first
716
+ cleaned_extracted_headers = clean_header_spaces(extracted_headers)
717
 
718
+ # Clean all possible headers in the hardcoded mapping
719
+ cleaned_hardcoded_mapping = {
720
+ std_field: [clean_header_spaces([h])[0] for h in possible_headers]
721
+ for std_field, possible_headers in hardcoded_mapping.items()
722
+ }
723
+
724
+ print("\n🔍 Hardcoded Mapping Results:")
725
+ print("-" * 50)
726
+ for header in cleaned_extracted_headers:
727
+ header_mapped = False
728
+ for std_field, possible_headers in cleaned_hardcoded_mapping.items():
729
+ if header in possible_headers:
730
+ standard_field_mapping[std_field] = header
731
+ header_mapped = True
732
+ print(f"✅ {std_field} -> {header}")
733
+ break
734
+ if not header_mapped:
735
+ unmapped_headers.append(header)
736
+ print(f"❌ No match found for: {header}")
737
+ print("-" * 50)
738
+
739
+ # If we have unmapped headers, fall back to AI mapping
740
+ if unmapped_headers:
741
+ print(f"⚠️ Some headers could not be mapped using hardcoded mapping: {unmapped_headers}")
742
+ print("🔄 Falling back to AI mapping...")
743
+
744
+ # Get the list of standard fields that haven't been mapped yet
745
+ unmapped_standard_fields = [field for field in target_fields if field not in standard_field_mapping]
746
+
747
+ # Use AI to map remaining headers
748
+ base_prompt = f"""
749
+ You are playing a matching game. Match each and every standard fields to the exact column headers within "" separated by ,.
750
+ You must match all the given column headers to the standard fields to you best ability.
751
  USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
752
 
753
+ The standard fields that need mapping are:
754
+ {json.dumps(unmapped_standard_fields, ensure_ascii=False)}
755
 
756
  You are given column headers below: (YOU MUST USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING)
757
+ {json.dumps(unmapped_headers, ensure_ascii=False)}
758
 
759
  ENSURE ALL STANDARD FIELDS ARE MAPPED TO THE EXACT COLUMN HEADER INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
760
 
 
773
  "名称": "名称Name of Materials and Equipment",
774
  "名称(英文)": "名称Name of Materials and Equipment"
775
  }}
 
776
  """
777
 
778
+ messages = [{"role": "user", "content": base_prompt}]
779
+
780
+ client = OpenAI(
781
+ base_url=base_url,
782
+ api_key=HF_API_KEY,
783
+ )
784
+
785
+ # Add retry logic for AI mapping
786
+ max_retries = 3
787
+ for attempt in range(max_retries):
788
+ try:
789
+ print(f"🔄 Sending prompt to LLM (attempt {attempt + 1} of {max_retries})")
790
+ response = client.chat.completions.create(
791
+ model=model,
792
+ messages=messages,
793
+ temperature=0.1,
794
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
 
796
+ raw_mapping = response.choices[0].message.content
 
 
 
797
 
798
+ think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
799
+ if think_text:
800
+ print(f"🧠 Thought Process: {think_text}")
801
+ logging.info(f"Think text: {think_text}")
802
+
803
+ raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
804
+ # Remove any backticks or json tags
805
+ raw_mapping = re.sub(r"```json|```", "", raw_mapping)
 
 
 
 
 
 
 
 
 
806
 
807
+ # Parse the AI mapping and merge with hardcoded mapping
808
+ ai_mapping = json.loads(raw_mapping.strip())
809
+ standard_field_mapping.update(ai_mapping)
810
 
811
+ # Check if all standard fields are mapped
812
+ still_unmapped = [field for field in target_fields if field not in standard_field_mapping]
813
+ if still_unmapped:
814
+ print(f"⚠️ Some standard fields are still unmapped: {still_unmapped}")
815
+ if attempt < max_retries - 1:
816
+ # Add feedback to the prompt for the next attempt
817
+ messages.append({
818
+ "role": "assistant",
819
+ "content": response.choices[0].message.content
820
+ })
821
+ messages.append({
822
+ "role": "user",
823
+ "content": f"The following standard fields are still unmapped: {still_unmapped}. Please try to map these fields using the available headers: {unmapped_headers}"
824
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
825
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
826
  else:
827
+ print(f" Successfully mapped all fields using AI")
828
+ print("\n📊 AI Mapping Results:")
829
+ print("-------------------")
830
+ for std_field, mapped_header in ai_mapping.items():
831
+ print(f"{std_field} -> {mapped_header}")
832
+ print("-------------------")
833
+ break
834
 
835
+ except Exception as e:
836
+ error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
837
+ logging.error(f"{error_msg}")
838
+ print(f"❌ {error_msg}")
839
 
840
+ if attempt < max_retries - 1:
841
+ messages.append({
842
+ "role": "assistant",
843
+ "content": response.choices[0].message.content
844
+ })
845
+ messages.append({
846
+ "role": "user",
847
+ "content": f"Your response had the following error: {error_msg}. Please fix your mapping and try again."
848
+ })
849
+ else:
850
+ print(f"⚠️ All AI mapping attempts failed, proceeding with partial mapping")
851
+
852
+ # After all mapping is done, print the final mapping and unmapped columns
853
+ print("\n📊 Final Field Mapping:")
854
+ print("-" * 50)
855
+ # Print all standard fields, showing mapping if exists or blank if not
856
+ for field in target_fields:
857
+ mapped_header = standard_field_mapping.get(field, "")
858
+ print(f"{field} -> {mapped_header}")
859
+ print("-" * 50)
860
+
861
+ # Check for unmapped standard fields
862
+ unmapped_standard = [field for field in target_fields if field not in standard_field_mapping]
863
+ if unmapped_standard:
864
+ print("\n⚠️ Unmapped Standard Fields:")
865
+ print("-" * 50)
866
+ for field in unmapped_standard:
867
+ print(f"- {field}")
868
+ print("-" * 50)
869
+
870
+ # Check for unmapped extracted headers
871
+ mapped_headers = set(standard_field_mapping.values())
872
+ unmapped_headers = [header for header in extracted_headers if header not in mapped_headers]
873
+ if unmapped_headers:
874
+ print("\n⚠️ Unmapped Extracted Headers:")
875
+ print("-" * 50)
876
+ for header in unmapped_headers:
877
+ print(f"- {header}")
878
+ print("-" * 50)
879
+
880
def separate_chinese_english(text):
    """Split *text* at the last CJK character.

    Returns a ``(chinese, english)`` tuple: everything up to and including
    the last Chinese character (stripped), and whatever follows it
    (stripped). The trailing part is kept only when it contains at least one
    ASCII letter; a string with no Chinese characters comes back entirely in
    the second slot.
    """
    if not isinstance(text, str) or not text:
        return "", ""

    # Scan backwards for the last character in the CJK Unified Ideographs
    # block (U+4E00..U+9FFF); -1 means no Chinese character was found.
    cut = -1
    for idx in range(len(text) - 1, -1, -1):
        if '\u4e00' <= text[idx] <= '\u9fff':
            cut = idx
            break

    if cut < 0:
        # No Chinese at all: treat the whole string as the English part.
        return "", text.strip()

    head = text[:cut + 1].strip()
    tail = text[cut + 1:].strip()

    # Discard the tail unless it actually holds Latin letters (e.g. a
    # trailing "123" after Chinese is not an English name).
    return head, (tail if re.search(r'[a-zA-Z]', tail) else "")
909
+
910
+ # Process the data based on the final mapping
911
+ transformed_data = []
912
+
913
+ for row in price_list:
914
+ new_row = {field: "" for field in target_fields} # Initialize with empty strings
915
+ other_fields = {}
916
+
917
+ # Step 1: Handle name fields first - look for any field with "名称" or "name"
918
+ for header, value in row.items():
919
+ # Clean the header for comparison
920
+ cleaned_header = re.sub(r'\s+', ' ', header).strip()
921
+ header_lower = cleaned_header.lower()
922
 
923
+ if ("名称" in header_lower or "name" in header_lower) and value:
924
+ # If field contains both Chinese and English, separate them
925
+ if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
926
+ chinese, english = separate_chinese_english(value)
927
+ if chinese:
928
+ new_row["名称"] = chinese
929
+ if english:
930
+ new_row["名称(英文)"] = english
931
+ print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
932
+ else:
933
+ # Just set the name directly
934
+ new_row["名称"] = value
935
+ break # Stop after finding first name field
936
 
937
+ # Step 2: Fill in all other fields using standard mapping
938
+ for header, value in row.items():
939
+ # Skip empty values
940
+ if not value:
941
+ continue
942
 
943
+ # Clean the header for comparison
944
+ cleaned_header = re.sub(r'\s+', ' ', header).strip()
945
+
946
+ # Check if this maps to a standard field
947
+ matched_field = None
948
+ for std_field, mapped_header in standard_field_mapping.items():
949
+ # Make comparison more flexible by lowercasing and stripping spaces
950
+ if mapped_header.lower().strip() == cleaned_header.lower().strip():
951
+ matched_field = std_field
952
+ break
953
+
954
+ # If we found a mapping, use it (but don't overwrite name fields)
955
+ if matched_field:
956
+ if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
957
+ new_row[matched_field] = value
958
+ # If no mapping found, add to other_fields
959
+ else:
960
+ # Skip name fields we already processed
961
+ header_lower = cleaned_header.lower()
962
+ if not ("名称" in header_lower or "name" in header_lower):
963
+ other_fields[header] = value
964
 
965
+ # Add remaining fields to "其他"
966
+ if other_fields:
967
+ new_row["其他"] = other_fields
 
 
 
 
 
 
 
 
 
968
  else:
969
+ new_row["其他"] = {}
970
+
971
+ # Convert field names for validation
972
+ if "名称(英文)" in new_row:
973
+ new_row["名称(英文)"] = new_row.pop("名称(英文)")
974
+
975
+ transformed_data.append(new_row)
976
 
977
  # Save to file if requested
978
  if save_json and transformed_data:
 
1001
  contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
1002
  long_table.to_excel(writer, sheet_name="Price List", index=False)
1003
 
1004
def find_price_list_table(extracted_data, min_matches=3):
    """Pick the extracted long table that most resembles a price list.

    Scores every ``long_table`` entry in *extracted_data* by counting how
    many of its first-row column headers contain a known price-list keyword
    (case-insensitive substring match), and returns the highest-scoring
    table. Returns ``None`` when no candidate reaches *min_matches*
    matching headers.
    """
    keywords = (
        "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
        "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
        "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
    )

    def _score(table):
        # One point per header that mentions any price-list keyword.
        return sum(
            1 for header in table[0]
            if any(kw in header.lower() for kw in keywords)
        )

    winner = None
    winner_score = 0
    for key, table in extracted_data.items():
        # Only consider non-empty list entries whose key marks a long table.
        if "long_table" not in key or not isinstance(table, list) or not table:
            continue
        hits = _score(table)
        if hits >= min_matches and hits > winner_score:
            winner_score = hits
            winner = table

    return winner
1027
+
1028
  #--- Extract PO ------------------------------
1029
 
1030
  def extract_po(docx_path):
 
1049
  print("Extracting XML data to JSON...")
1050
  json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1051
  extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
1052
+ print(f"✅ Extracted Data: {extracted_data}")
1053
 
1054
  # Step 3: Process JSON with OpenAI to get structured output
1055
  print("Processing Contract Summary data with AI...")
 
1058
 
1059
  # Find the last long table (excluding summary tables)
1060
  print("Processing Price List data with AI...")
1061
+ extracted_data_dict = json.loads(extracted_data)
1062
+ price_list_table = find_price_list_table(extracted_data_dict)
1063
+ if not price_list_table:
1064
+ print("⚠️ No suitable price list table found!")
1065
+ price_list_table = []
1066
+
1067
  # Generate the price list filename in the same folder as the document
1068
  price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1069
 
1070
  # Process the price list and save it to a JSON file
1071
+ price_list = extract_price_list(price_list_table, save_json=True, json_name=price_list_filename)
1072
 
1073
  # Step 4: Combine contract summary and long table data into a single JSON object
1074
  print("Combining AI Generated JSON with Extracted Data...")
 
1105
  import gradio as gr
1106
  from gradio.themes.base import Base
1107
 
1108
+ # def extract_po_api(docx_path):
1109
+ # try:
1110
+ # return extract_po(docx_path)
1111
+ # except Exception as e:
1112
+ # # Return error details in the API response
1113
+ # return {"error":str(e)}
1114
+
1115
  interface = gr.Interface(
1116
  fn=extract_po,
1117
  title="PO Extractor 买卖合同数据提取",