Spaces:

MikeMai
/

PO_Extractor_API

Running

App Files Files Community

MikeMai committed on Apr 29, 2025

Commit

dc7b3c3

verified ·

1 Parent(s): 58ff4c6

Update app.py

Browse files

Files changed (1) hide show

app.py +341 -114

app.py CHANGED Viewed

@@ -16,23 +16,23 @@ import re
 import logging
-from pydantic import BaseModel, Field, ValidationError, RootModel, field_validator
 from typing import List, Optional
 HF_API_KEY = os.getenv("HF_API_KEY")
 # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
-base_url = "https://router.huggingface.co/novita"
-model = "deepseek/deepseek-r1-distill-qwen-14b"
 # Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
 # base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
 # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
 # Qwen 2.5 7B --------------------------------------------------------
-# base_url = "https://router.huggingface.co/together/v1"
-# model="Qwen/Qwen2.5-7B-Instruct-Turbo"
 # Qwen 2.5 32B --------------------------------------------------------
 # base_url = "https://router.huggingface.co/novita/v3/openai"
@@ -530,81 +530,93 @@ Contract data in JSON format:""" + f"""
     return json.dumps(empty_json, ensure_ascii=False, indent=4)
-def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
     """
-    Extracts structured price list using DeepSeek LLM and validates output with Pydantic.
-    Retries up to 3 times with error feedback if output is not valid JSON.
     """
-    # Pydantic schema
-    class PriceItem(BaseModel):
-        序号: Optional[str] = ""
-        名称: Optional[str] = ""
-        名称_英文: Optional[str] = Field("", alias="名称(英文)")
-        品牌: Optional[str] = ""
-        规格: Optional[str] = ""
-        所属机型: Optional[str] = ""
-        采购数量: Optional[str] = ""
-        单位: Optional[str] = ""
-        单价: Optional[str] = ""
-        总价: Optional[str] = ""
-        几郎单价: Optional[str] = ""
-        几郎总额: Optional[str] = ""
-        备注: Optional[str] = ""
-        计划来源: Optional[str] = ""
-        其他: Optional[dict] = Field(default_factory=dict, alias="其他")
-        model_config = {
-            # Ensures numbers remain as strings and aren't converted
-            "coerce_numbers_to_str": True,
-            # Allows numeric strings to be parsed into the model
-            "arbitrary_types_allowed": True
-        }
-        @field_validator('其他', mode='before')
-        def convert_empty_list_to_dict(cls, v):
-            # Convert empty list to empty dict
-            if isinstance(v, list) and len(v) == 0:
-                return {}
-            return v
-    class PriceListModel(BaseModel):
-        items: List[PriceItem]
-        model_config = {
-            # Ensures numbers remain as strings and aren't converted
-            "coerce_numbers_to_str": True,
-            # Allows numeric strings to be parsed into the model
-            "arbitrary_types_allowed": True
-        }
-    base_prompt = f"""你会接收到一个采购清单列表，请你提取以下字段并重新输出为一个结构化的 JSON 格式。
-有时候第一行是表头，有时候是数据行，只输入数据行。
-请注意，输出的 JSON 需要符合以下格式要求：
-# 输出格式要求：
-每个条目输出以下字段：
-- 序号
-- 名称：只填中文
-- 名称(英文)：只填英文
-- 品牌
-- 规格
-- 所属机型
-- 采购数量:
-- 单位
-- 单价:
-- 总价:
-- 几郎单价:
-- 几郎总额:
-- 备注
-- 计划来源
-- 其他：如果有以上以外的字段就以list的形式写在其他里 {{"其他": "key1": "value1", "key2":"value2"}},如果没有就给一个空的list:{{}}
-请确保输出的 JSON 是有效的，且字段名称与输入的字段名称一致。请注意，字段名称可能会有不同的拼写方式，请根据上下文进行判断。
-请确保输出的条目数量与输入的列表数量一致。
-# 原始价格表：
-{price_list}"""
     messages = [{"role": "user", "content": base_prompt}]
@@ -613,54 +625,268 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
         api_key=HF_API_KEY,
     )
-    for attempt in range(3):
-        print(f"🔁 Attempt {attempt + 1} to extract and validate Price List")
         try:
             response = client.chat.completions.create(
                 model=model,
                 messages=messages,
             )
-            raw = response.choices[0].message.content
             think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
             if think_text:
                 print(f"🧠 Thought Process: {think_text}")
                 logging.info(f"Think text: {think_text}")
-            # Strip out LLM artifacts
-            raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
-            raw = re.sub(r"^```json\n|```$", "", raw.strip(), flags=re.DOTALL)
-            # Wrap the raw JSON in a proper structure if it's a list
-            if raw.strip().startswith('['):
-                raw = '{"items": ' + raw + '}'
-            validated = PriceListModel.model_validate_json(raw)
-            price_list_json = validated.model_dump(by_alias=True)["items"]
-            if save_json:
-                with open(json_name, "w", encoding="utf-8") as f:
-                    json.dump(price_list_json, f, ensure_ascii=False, indent=4)
-                print(f"✅ Saved to {json_name}")
-            return price_list_json
-        except ValidationError as ve:
-            error_msg = f"Pydantic validation error: {ve}"
         except Exception as e:
-            error_msg = f"Unexpected error: {e}"
-        print(f"❌ {error_msg}")
-        messages.append({
-            "role": "user",
-            "content": f"Your previous attempt gave this error: {error_msg}. Please try again ensuring your response is valid JSON with correct format."
-        })
-    print("⚠️ Failed after 3 attempts.")
-    return raw
 def json_to_excel(contract_summary, json_data, excel_path):
     """Converts extracted JSON tables to an Excel file."""
@@ -720,7 +946,7 @@ def extract_po(docx_path):
     price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
     # Process the price list and save it to a JSON file
-    price_list = deepseek_extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
     # Step 4: Combine contract summary and long table data into a single JSON object
     print("Combining AI Generated JSON with Extracted Data...")
@@ -739,7 +965,8 @@ def extract_po(docx_path):
     Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
-    print(log)
     logging.info(f"""{log}""")
     return combined_data
@@ -747,9 +974,9 @@ def extract_po(docx_path):
 # Example Usage
 # extract_po("test-contract-converted.docx")
-# extract_po("test-contract.docx")
-# print(deepseek_extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管） PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根，SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价（元） Unit Price (CNY)': '106.00', '总额（元） Total Amount (CNY)': '1080.00', '几郎单价（元） Unit Price (GNF)': '16.21', '几郎总额（元） Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
 # Gradio Interface ------------------------------

 import logging
+from pydantic import BaseModel, Field, ValidationError, RootModel
 from typing import List, Optional
 HF_API_KEY = os.getenv("HF_API_KEY")
 # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
+# base_url = "https://router.huggingface.co/novita"
+# model = "deepseek/deepseek-r1-distill-qwen-14b"
 # Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
 # base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
 # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
 # Qwen 2.5 7B --------------------------------------------------------
+base_url = "https://router.huggingface.co/together/v1"
+model="Qwen/Qwen2.5-7B-Instruct-Turbo"
 # Qwen 2.5 32B --------------------------------------------------------
 # base_url = "https://router.huggingface.co/novita/v3/openai"
     return json.dumps(empty_json, ensure_ascii=False, indent=4)
+def extract_price_list(price_list, save_json=False, json_name="price_list.json"):
     """
+    Extracts structured price list by first using AI to map column names to standard keys,
+    then programmatically transforming the data to match the Pydantic model.
     """
+    # If price_list is empty, return an empty list
+    if not price_list:
+        return []
+    # Convert price_list to a list if it's a dict
+    if isinstance(price_list, dict):
+        # Check if the dict has any items
+        if len(price_list) == 0:
+            return []
+        # Convert to list if it's just a single entry dict
+        price_list = [price_list]
+    # Extract a sample row for header mapping
+    sample_row = price_list[0] if price_list else {}
+    # If there are no headers, return empty list
+    if not sample_row:
+        return []
+    # Get the headers directly from the sample row
+    extracted_headers = list(sample_row.keys())
+    # Clean double spaces in headers to facilitate AI identification
+    def clean_header_spaces(headers):
+        """Clean double spaces in headers to make them more consistent for AI processing."""
+        return [re.sub(r'\s+', ' ', header).strip() for header in headers]
+    # Apply the cleaning function to extracted headers
+    extracted_headers = clean_header_spaces(extracted_headers)
+    # Define our target fields from the Pydantic model
+    target_fields = [
+        "序号", "名称", "名称(英文)", "品牌", "规格型号", "所属机型",
+        "数量", "单位", "单价", "总价", "几郎单价", "几郎总价",
+        "备注", "计划来源"
+    ]
+    sample_mapping = """Examples of how you should map to guide you, there are other cases so use your own judgement to map the headers to the standard fields:
+- Map "序号" to headers containing "序号No.", "序号 No.",
+- Map "品牌" to headers containing "品牌Brand", "品牌 brand",
+- Map "规格型号" to headers containing "规格型号", "规格 Specification", "Specification and Model", "规格型号Specification and Model", "型号Model"
+- Map "所属机型" to headers containing "所属机型", "Applicable Models"
+- Map "数量" to headers containing "数量Quantity", "数量 Quantity", "Qty"
+- Map "单位" to headers containing "单位Unit", "单位 Unit"
+- Map "单价" to headers containing "单价（元）", "单价(CNY)", "Unit Price (CNY)", "单价Unit Price"
+- Map "总价" to headers containing "总价（元）", "总额（元）", "Amount (CNY)", "Total Amount (CNY)"
+- Map "几郎单价" to headers containing "单价（几郎）", "几郎单价（元）", "Unit Price (GNF)", "单价Unit Price（几郎）(GNF)"
+- Map "几郎总价" to headers containing "总额（几郎）", "几郎总额（元）", "Total Amount (GNF)"
+- Map "备注" to headers containing "备注Remarks", "备注 notes", "Note"
+- Map "计划来源" to headers containing "计划来源Plan No.", "计划来源（唛头信息）", "Planned Source" """
+    # Use AI to map extracted headers to our target fields
+    base_prompt = f"""
+You are playing a matching game. Match each and every standard fields to the exactcolumn headers within "" separated by ,.
+USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
+The standard fields are:
+{json.dumps(target_fields, ensure_ascii=False)}
+You are given column headers below: (YOU MUST USE THE EXACT HEADER BELOW INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING)
+{json.dumps(extracted_headers, ensure_ascii=False)}
+ENSURE ALL STANDARD FIELDS ARE MAPPED TO THE EXACT COLUMN HEADER INCLUDING BOTH CHINESE AND ENGLISH AND THE EXACT SPACING.
+Return only a JSON mapping in this format WITHOUT any explanations:
+```json
+{{
+  "standard_field_1": "column_header_1",
+  "standard_field_2": "column_header_2",
+  ...
+}}
+```
+Important: Map "名称" AND "名称(英文)" to the SAME extracted header.
+For example, if the extracted header is "名称Name of Materials and Equipment", then:
+{{
+  "名称": "名称Name of Materials and Equipment",
+  "名称(英文)": "名称Name of Materials and Equipment"
+}}
+"""
     messages = [{"role": "user", "content": base_prompt}]
         api_key=HF_API_KEY,
     )
+    # Add retry logic similar to deepseek_extract_contract_summary
+    max_retries = 3
+    transformed_data = []
+    for attempt in range(max_retries):
         try:
+            print(f"🔄 Sending prompt to LLM (attempt {attempt + 1} of {max_retries}: {base_prompt})")
             response = client.chat.completions.create(
                 model=model,
                 messages=messages,
+                temperature=0.1,
             )
+            raw_mapping = response.choices[0].message.content
             think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
             if think_text:
                 print(f"🧠 Thought Process: {think_text}")
                 logging.info(f"Think text: {think_text}")
+            raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
+            # Remove any backticks or json tags
+            raw_mapping = re.sub(r"```json|```", "", raw_mapping)
+            # Parse the mapping with standard fields as keys
+            standard_field_mapping = json.loads(raw_mapping.strip())
+            print(f"📊 Standard field mapping: {json.dumps(standard_field_mapping, ensure_ascii=False, indent=2)}")
+            # Function to separate Chinese and English text
+            def separate_chinese_english(text):
+                if not text or not isinstance(text, str):
+                    return "", ""
+                # First check if there's a clear separator like hyphen or space
+                # Common patterns: "中文-English", "中文（English）", "中文 English"
+                patterns = [
+                    r'^([\u4e00-\u9fff\-]+)[:\-\s]+([a-zA-Z].*)$',  # Chinese-English
+                    r'^([\u4e00-\u9fff\-]+)[\(（]([a-zA-Z].*)[\)）]$',  # Chinese(English)
+                ]
+                for pattern in patterns:
+                    match = re.search(pattern, text)
+                    if match:
+                        return match.group(1), match.group(2)
+                # Find the first Chinese character index
+                first_chinese_idx = -1
+                for i, char in enumerate(text):
+                    if '\u4e00' <= char <= '\u9fff':  # Chinese character
+                        first_chinese_idx = i
+                        break
+                # Find where English starts after Chinese
+                english_start_idx = len(text)
+                if first_chinese_idx >= 0:
+                    # Search for the first English character that comes after Chinese
+                    for i in range(first_chinese_idx, len(text)):
+                        # Skip to the end of Chinese characters
+                        if '\u4e00' <= text[i] <= '\u9fff':
+                            continue
+                        # Look ahead for English characters
+                        for j in range(i, len(text)):
+                            if 'a' <= text[j].lower() <= 'z':
+                                english_start_idx = j
+                                break
+                        if english_start_idx < len(text):
+                            break
+                # If we found the boundaries
+                if first_chinese_idx >= 0 and english_start_idx < len(text):
+                    # Handle prefix: any Latin characters before Chinese should be part of Chinese name
+                    prefix = text[:first_chinese_idx].strip() if first_chinese_idx > 0 else ""
+                    chinese_part = text[first_chinese_idx:english_start_idx].strip()
+                    english_part = text[english_start_idx:].strip()
+                    # Combine prefix with Chinese part
+                    if prefix:
+                        chinese_part = f"{prefix} {chinese_part}"
+                    return chinese_part, english_part
+                # Special case for prefix like "PVC" with no space before Chinese
+                if first_chinese_idx > 0:
+                    prefix = text[:first_chinese_idx].strip()
+                    rest_of_text = text[first_chinese_idx:]
+                    # Extract Chinese and English from the rest of the text
+                    chinese_chars = []
+                    english_chars = []
+                    in_chinese = True
+                    for char in rest_of_text:
+                        if '\u4e00' <= char <= '\u9fff':  # Chinese character
+                            if not in_chinese and english_chars:  # If we've already seen English, something is wrong
+                                chinese_chars = []
+                                english_chars = []
+                                break
+                            chinese_chars.append(char)
+                            in_chinese = True
+                        elif 'a' <= char.lower() <= 'z' or char in ' -_()':  # English or separator
+                            if in_chinese and chinese_chars:  # We've seen Chinese and now see English
+                                english_chars.append(char)
+                                in_chinese = False
+                            elif not in_chinese:  # Continue collecting English
+                                english_chars.append(char)
+                            else:  # No Chinese seen yet, might be part of prefix
+                                chinese_chars.append(char)
+                        else:  # Other characters (numbers, etc.)
+                            if in_chinese:
+                                chinese_chars.append(char)
+                            else:
+                                english_chars.append(char)
+                    if chinese_chars and english_chars:
+                        chinese_text = prefix + " " + ''.join(chinese_chars).strip()
+                        english_text = ''.join(english_chars).strip()
+                        return chinese_text, english_text
+                    else:
+                        # No clean separation possible
+                        return prefix + " " + rest_of_text, ""
+                # Fallback: Try simple pattern matching
+                # Find all Chinese characters
+                chinese_chars = re.findall(r'[\u4e00-\u9fff]+', text)
+                chinese = ''.join(chinese_chars)
+                # If we have Chinese, extract everything up to the last Chinese character
+                if chinese:
+                    last_chinese_idx = text.rindex(chinese_chars[-1]) + len(chinese_chars[-1])
+                    # Anything before the first Chinese character is a prefix
+                    first_chinese_idx = text.index(chinese_chars[0])
+                    prefix = text[:first_chinese_idx].strip()
+                    # Everything after the last Chinese character is English
+                    chinese_part = prefix + " " + text[first_chinese_idx:last_chinese_idx].strip() if prefix else text[first_chinese_idx:last_chinese_idx].strip()
+                    english_part = text[last_chinese_idx:].strip()
+                    # If English part doesn't actually contain English letters, treat it as empty
+                    if not re.search(r'[a-zA-Z]', english_part):
+                        english_part = ""
+                    return chinese_part, english_part
+                # No Chinese characters found, check if there are any English letters
+                if re.search(r'[a-zA-Z]', text):
+                    return "", text.strip()
+                # No clear separation possible
+                return text.strip(), ""
+            # Process the data based on the standard field mapping
+            transformed_data = []
+            for row in price_list:
+                new_row = {field: "" for field in target_fields}  # Initialize with empty strings
+                other_fields = {}
+                # Step 1: Handle name fields first - look for any field with "名称" or "name"
+                for header, value in row.items():
+                    # Clean the header for comparison
+                    cleaned_header = re.sub(r'\s+', ' ', header).strip()
+                    header_lower = cleaned_header.lower()
+                    if ("名称" in header_lower or "name" in header_lower) and value:
+                        # If field contains both Chinese and English, separate them
+                        if re.search(r'[\u4e00-\u9fff]', value) and re.search(r'[a-zA-Z]', value):
+                            chinese, english = separate_chinese_english(value)
+                            if chinese:
+                                new_row["名称"] = chinese
+                            if english:
+                                new_row["名称(英文)"] = english
+                            print(f"Separated: '{value}' → Chinese: '{chinese}', English: '{english}'")
+                        else:
+                            # Just set the name directly
+                            new_row["名称"] = value
+                        break  # Stop after finding first name field
+                # Step 2: Fill in all other fields using standard mapping
+                for header, value in row.items():
+                    # Skip empty values
+                    if not value:
+                        continue
+                    # Clean the header for comparison
+                    cleaned_header = re.sub(r'\s+', ' ', header).strip()
+                    # Check if this maps to a standard field
+                    matched_field = None
+                    for std_field, mapped_header in standard_field_mapping.items():
+                        # Make comparison more flexible by lowercasing and stripping spaces
+                        if mapped_header.lower().strip() == cleaned_header.lower().strip():
+                            matched_field = std_field
+                            break
+                    # If we found a mapping, use it (but don't overwrite name fields)
+                    if matched_field:
+                        if matched_field not in ["名称", "名称(英文)"] or not new_row[matched_field]:
+                            new_row[matched_field] = value
+                    # If no mapping found, add to other_fields
+                    else:
+                        # Skip name fields we already processed
+                        header_lower = cleaned_header.lower()
+                        if not ("名称" in header_lower or "name" in header_lower):
+                            other_fields[header] = value
+                # Add remaining fields to "其他"
+                if other_fields:
+                    new_row["其他"] = other_fields
+                else:
+                    new_row["其他"] = {}
+                # Convert field names for validation
+                if "名称(英文)" in new_row:
+                    new_row["名称(英文)"] = new_row.pop("名称(英文)")
+                transformed_data.append(new_row)
+            # Success! Break out of the retry loop
+            print(f"✅ Successfully processed price list on attempt {attempt + 1}")
+            break
+        except json.JSONDecodeError as e:
+            error_msg = f"JSON decode error in field mapping: {e}"
+            logging.error(f"{error_msg}")
+            print(f"❌ {error_msg}")
+        except KeyError as e:
+            error_msg = f"KeyError during data transformation: {e}"
+            logging.error(f"{error_msg}")
+            print(f"❌ {error_msg}")
         except Exception as e:
+            error_msg = f"Error processing price list: {e}"
+            logging.error(f"{error_msg}")
+            print(f"❌ {error_msg}")
+        # Don't retry on the last attempt
+        if attempt < max_retries - 1:
+            # Add error message to the conversation and retry
+            if 'response' in locals():
+                messages.append({
+                    "role": "assistant",
+                    "content": response.choices[0].message.content
+                })
+            messages.append({
+                "role": "user",
+                "content": f"Your response had the following error: {error_msg}. Please fix your mapping and try again."
+            })
+        else:
+            print(f"⚠️ All {max_retries} attempts failed, returning empty result")
+            transformed_data = []  # Return empty list after all retries failed
+    # Save to file if requested
+    if save_json and transformed_data:
+        with open(json_name, "w", encoding="utf-8") as f:
+            json.dump(transformed_data, f, ensure_ascii=False, indent=4)
+        print(f"✅ Saved to {json_name}")
+    return transformed_data
 def json_to_excel(contract_summary, json_data, excel_path):
     """Converts extracted JSON tables to an Excel file."""
     price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
     # Process the price list and save it to a JSON file
+    price_list = extract_price_list(last_long_table, save_json=True, json_name=price_list_filename)
     # Step 4: Combine contract summary and long table data into a single JSON object
     print("Combining AI Generated JSON with Extracted Data...")
     Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
+    # print(log)
+    # print(f"🔄 Extracted Data: {combined_data}")
     logging.info(f"""{log}""")
     return combined_data
 # Example Usage
 # extract_po("test-contract-converted.docx")
+# extract_po("test-contracts\GN-SMBLMCD202501-032WJ SMB联盟菜地PVC球阀等五金物资采购合同-ZHUOKE.docx")
+# print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管） PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根，SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价（元） Unit Price (CNY)': '106.00', '总额（元） Total Amount (CNY)': '1080.00', '几郎单价（元） Unit Price (GNF)': '16.21', '几郎总额（元） Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))
 # Gradio Interface ------------------------------