Spaces:

MikeMai
/

PO_Extractor_API

Sleeping

App Files Files Community

MikeMai committed on Jul 7, 2025

Commit

1f01c66

verified ·

1 Parent(s): 83f48f4

Update app.py

Browse files

Files changed (1) hide show

app.py +114 -84

app.py CHANGED Viewed

@@ -22,7 +22,7 @@ from fuzzywuzzy import fuzz
 from fuzzywuzzy import process
-HF_API_KEY = os.getenv("HF_API_KEY")
 # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
 # base_url = "https://router.huggingface.co/novita"
@@ -33,8 +33,8 @@ HF_API_KEY = os.getenv("HF_API_KEY")
 # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
 # Qwen 2.5 7B --------------------------------------------------------
-base_url = "https://router.huggingface.co/together/v1"
-model="Qwen/Qwen2.5-7B-Instruct-Turbo"
 # Qwen 2.5 32B --------------------------------------------------------
 # base_url = "https://router.huggingface.co/novita/v3/openai"
@@ -394,53 +394,74 @@ def identify_table_type_and_header_row(rows):
     return "unknown", 0
 def extract_tables(root):
-    """Extracts tables from the DOCX document and returns structured data."""
-    tables = root.findall('.//w:tbl', NS)
     table_data = {}
     table_paragraphs = set()
-    for table_index, table in enumerate(tables, start=1):
-        rows = table.findall('.//w:tr', NS)
-        if not rows:
-            continue  # Skip empty tables
-        for paragraph in table.findall('.//w:p', NS):
-            table_paragraphs.add(paragraph)
-        table_type, header_row_index = identify_table_type_and_header_row(rows)
-        if table_type == "single_column":
-            single_column_data = process_single_column_table(rows)
-            if single_column_data:
-                table_data[f"table_{table_index}_single_column"] = single_column_data
-            continue
-        elif table_type == "buyer_seller":
-            buyer_seller_data = process_buyer_seller_table(rows[header_row_index:])
-            if buyer_seller_data:
-                table_data[f"table_{table_index}_buyer_seller"] = buyer_seller_data
-            continue
-        elif table_type == "summary":
-            summary_data = process_summary_table(rows[header_row_index:])
-            if summary_data:
-                table_data[f"table_{table_index}_summary"] = summary_data
-            continue
-        elif table_type == "long_table":
-            long_table_data = process_long_table(rows[header_row_index:])
-            if long_table_data:
-                table_data[f"long_table_{table_index}"] = long_table_data
-            continue
-        else:
-            # fallback: try to process as long table from first multi-column row
-            long_table_data = process_long_table(rows[header_row_index:])
-            if long_table_data:
-                table_data[f"long_table_{table_index}"] = long_table_data
-            continue
-    # # Print the first row's cell texts for debugging
-    # header_cells = rows[0].findall('.//w:tc', NS)
-    # header_texts = ["|".join(extract_text_from_cell(cell)) for cell in header_cells]
-    # print(f"Table {table_index} header: {header_texts}")
     return table_data, table_paragraphs
 # --- Non-Table Processing Functions ---
@@ -568,6 +589,11 @@ Contract data in JSON format:""" + f"""
             # Clean up JSON before validation
             contract_json = json.loads(contract_summary.strip())
             validated_data = ContractSummary.model_validate(contract_json)
             # Success! Return validated data
@@ -603,6 +629,11 @@ Contract data in JSON format:""" + f"""
     # If we get here, all attempts failed - return empty but valid model
     print("⚠️ All attempts failed, returning empty model")
     empty_data = ContractSummary().model_dump(by_alias=True)
     empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
     if save_json:
@@ -612,6 +643,39 @@ Contract data in JSON format:""" + f"""
     return json.dumps(empty_json, ensure_ascii=False, indent=4)
 def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
     """
     Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
@@ -697,7 +761,7 @@ def extract_price_list(price_list, save_json=False, json_name="price_list.json",
         # 单位 mappings
         "单位": ["单位 unit", "单位unit", "unit", "单位"],
         # 单价 mappings
-        "单价": ["单价 unit price (cny)", "单价unit price (cny)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price（cny）",
                "单价（元）", "单价(cny)", "单价 unit price (cny)", "单价（欧元） unit price（eur）", "单价", "单价（元） unit price(cny)", "单价（元）unit price（cny）", "单价（欧元） unit price（eur）",
                "价格 price", "价格price", "价格",
                "美元单价"],
@@ -1092,40 +1156,6 @@ def json_to_excel(contract_summary, json_data, excel_path):
         contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
         long_table.to_excel(writer, sheet_name="Price List", index=False)
-# Add this helper function near your other helpers
-def find_price_list_table(extracted_data, min_matches=3):
-    price_keywords = [
-        "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
-        "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
-        "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
-        "货描", "commodity",
-    ]
-    last_price_list_table = None
-    # Get all long tables and sort them by key to ensure we process them in order
-    long_tables = [(key, table) for key, table in extracted_data.items()
-                   if "long_table" in key and isinstance(table, list) and table]
-    long_tables.sort(key=lambda x: x[0])  # Sort by key to maintain order
-    for key, table in long_tables:
-        headers = list(table[0].keys())
-        match_count = 0
-        for header in headers:
-            header_lower = header.lower()
-            # Use fuzzy matching for keyword detection
-            for keyword in price_keywords:
-                if fuzz.partial_ratio(header_lower, keyword.lower()) >= 70:
-                    match_count += 1
-                    break  # Found a match for this header, move to next
-        if match_count >= min_matches:
-            last_price_list_table = table  # Keep the last table that meets criteria
-    return last_price_list_table
 #--- Handle Edge Cases ------------------------------
 def handle_weight_conversion_edge_case(transformed_data):
@@ -1320,7 +1350,7 @@ def extract_po(docx_path):
 # Example Usage
 # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
-# print(extract_po(r"UAT Contracts\修改后合同\GN-CGS202410-AMC-169BJ 柳工设备配件采购合同-广西柳工.docx"))
 # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管） PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根，SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价（元） Unit Price (CNY)': '106.00', '总额（元） Total Amount (CNY)': '1080.00', '几郎单价（元） Unit Price (GNF)': '16.21', '几郎总额（元） Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))

 from fuzzywuzzy import process
+HF_API_KEY = os.getenv("API_KEY")
 # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
 # base_url = "https://router.huggingface.co/novita"
 # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
 # Qwen 2.5 7B --------------------------------------------------------
+base_url = os.getenv("LLM_URL")
+model= os.getenv("MODEL_NAME")
 # Qwen 2.5 32B --------------------------------------------------------
 # base_url = "https://router.huggingface.co/novita/v3/openai"
     return "unknown", 0
 def extract_tables(root):
     """Collect structured data for the tables that sit directly in a DOCX body.

     The direct children of the document body are walked in order. A table is
     skipped when the closest non-empty paragraph among the three paragraphs
     most recently seen before it contains 'template' (case-insensitive), so
     blank paragraphs between a title and its table do not hide the title.
     Tables without any rows are skipped as well.

     Returns a tuple ``(table_data, table_paragraphs)``:
       * ``table_data`` maps generated keys (``table_<n>_single_column``,
         ``table_<n>_buyer_seller``, ``table_<n>_summary`` or
         ``long_table_<n>``) to the processed content; a table whose processed
         content is empty gets no entry. ``<n>`` counts only the tables that
         were not skipped.
       * ``table_paragraphs`` is the set of paragraph elements found inside
         the tables that were not skipped.
     """
     body = root.find('.//w:body', NS)
     container = root if body is None else body  # fall back to the root itself

     paragraph_tag = f'{{{NS["w"]}}}p'
     table_tag = f'{{{NS["w"]}}}tbl'

     table_data = {}
     table_paragraphs = set()
     next_index = 1
     recent_paragraphs = []  # rolling window holding at most the 3 latest paragraphs

     for node in list(container):
         if node.tag == paragraph_tag:
             recent_paragraphs.append(node)
             del recent_paragraphs[:-3]  # no-op until a fourth paragraph arrives
             continue
         if node.tag != table_tag:
             continue

         # The title is the text of the nearest preceding paragraph that is not blank.
         paragraph_texts = (
             ' '.join(t.text for t in para.findall('.//w:t', NS) if t.text).strip()
             for para in reversed(recent_paragraphs)
         )
         title = next((text for text in paragraph_texts if text), "")
         if 'template' in title.lower():
             continue

         rows = node.findall('.//w:tr', NS)
         if not rows:
             continue  # nothing to extract from a table without rows

         table_paragraphs.update(node.findall('.//w:p', NS))

         table_type, header_row_index = identify_table_type_and_header_row(rows)
         if table_type == "single_column":
             parsed = process_single_column_table(rows)
             key = f"table_{next_index}_single_column"
         elif table_type == "buyer_seller":
             parsed = process_buyer_seller_table(rows[header_row_index:])
             key = f"table_{next_index}_buyer_seller"
         elif table_type == "summary":
             parsed = process_summary_table(rows[header_row_index:])
             key = f"table_{next_index}_summary"
         else:
             # "long_table" and every unrecognised type go through the long-table parser.
             parsed = process_long_table(rows[header_row_index:])
             key = f"long_table_{next_index}"

         if parsed:
             table_data[key] = parsed
         next_index += 1

     return table_data, table_paragraphs
 # --- Non-Table Processing Functions ---
             # Clean up JSON before validation
             contract_json = json.loads(contract_summary.strip())
+            # Clean 合同编号 by removing all contents in brackets including the brackets themselves
+            if "合同编号" in contract_json and contract_json["合同编号"]:
+                contract_json["合同编号"] = re.sub(r'[\(（].*?[\)）]', '', contract_json["合同编号"]).strip()
             validated_data = ContractSummary.model_validate(contract_json)
             # Success! Return validated data
     # If we get here, all attempts failed - return empty but valid model
     print("⚠️ All attempts failed, returning empty model")
     empty_data = ContractSummary().model_dump(by_alias=True)
+    # Clean 合同编号 by removing all contents in brackets including the brackets themselves
+    if "合同编号" in empty_data and empty_data["合同编号"]:
+        empty_data["合同编号"] = re.sub(r'[\(（].*?[\)）]', '', empty_data["合同编号"]).strip()
     empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
     if save_json:
     return json.dumps(empty_json, ensure_ascii=False, indent=4)
def find_price_list_table(extracted_data, min_matches=3):
    """Return the last long table whose headers look like a price list.

    Args:
        extracted_data: Mapping of table keys to extracted table content.
            Only entries whose key contains ``"long_table"`` and whose value
            is a non-empty list are considered; the keys of the first element
            of each list are treated as the column headers.
        min_matches: Minimum number of headers that must fuzzily match one of
            the price-list keywords for a table to qualify.

    Returns:
        The qualifying table that comes last in table order, or ``None`` when
        no table qualifies.
    """
    price_keywords = [
        "名称", "name", "规格", "specification", "型号", "model", "所属机型", "applicable models",
        "单位", "unit", "数量", "quantity", "单价", "unit price", "总价", "amount",
        "几郎单价", "unit price(gnf)", "几郎总价", "amount(gnf)", "备注", "remarks", "计划来源", "plan no",
        "货描", "commodity",
    ]
    # Lowercase once instead of on every header/keyword comparison.
    keywords_lower = tuple(keyword.lower() for keyword in price_keywords)

    def table_order(item):
        # Order by the numeric suffix of the key: a plain string sort would
        # place "long_table_10" before "long_table_2" and pick the wrong
        # "last" table. Keys without a numeric suffix sort after the numbered
        # ones, alphabetically.
        key = item[0]
        suffix = key.rpartition("_")[2]
        if suffix.isdecimal():
            return (0, int(suffix), key)
        return (1, 0, key)

    long_tables = sorted(
        (
            (key, table)
            for key, table in extracted_data.items()
            if "long_table" in key and isinstance(table, list) and table
        ),
        key=table_order,
    )

    last_price_list_table = None
    for _key, table in long_tables:
        # Count headers that fuzzily match at least one keyword (one hit per header).
        match_count = sum(
            1
            for header in table[0].keys()
            if any(
                fuzz.partial_ratio(header.lower(), keyword) >= 70
                for keyword in keywords_lower
            )
        )
        if match_count >= min_matches:
            last_price_list_table = table  # later qualifying tables override earlier ones

    return last_price_list_table
 def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
     """
     Extracts structured price list by first using hardcoded mapping, then falling back to AI if needed.
         # 单位 mappings
         "单位": ["单位 unit", "单位unit", "unit", "单位"],
         # 单价 mappings
+        "单价": ["单价 unit price (cny)", "单价unit price (cny)", "单价（元）Unit Price (CNY)", "unit price (cny)", "单价unit price", "单价 unit price", "单价 unit price（cny）",
                "单价（元）", "单价(cny)", "单价 unit price (cny)", "单价（欧元） unit price（eur）", "单价", "单价（元） unit price(cny)", "单价（元）unit price（cny）", "单价（欧元） unit price（eur）",
                "价格 price", "价格price", "价格",
                "美元单价"],
         contract_summary_df.to_excel(writer, sheet_name="Contract Summary", index=False)
         long_table.to_excel(writer, sheet_name="Price List", index=False)
 #--- Handle Edge Cases ------------------------------
 def handle_weight_conversion_edge_case(transformed_data):
 # Example Usage
 # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))
+# print(extract_po(r"UAT Contracts\20250703\GN-WAPJS202405-297HG 1200R20轮胎采购合同-威海君乐-法务审批0515.docx"))
 # print(extract_price_list([{'序号 No.': '1', '名称 Name': 'PE波纹管(双壁波纹管） PE corrugated pipe (double wall corrugated pipe)', '规格 Specification': '内径600mm,6米/根，SN8 Inner diameter 600mm, 6 meters per piece, SN8', '单位 Unit': '米m', '数量 Quantity': '180', '单价（元） Unit Price (CNY)': '106.00', '总额（元） Total Amount (CNY)': '1080.00', '几郎单价（元） Unit Price (GNF)': '16.21', '几郎总额（元） Total Amount (GNF)': '22118.38', '品牌 Brand': '鹏洲PZ', '计划来源 Planned Source': 'SMB268-GNHY-0021-WJ-20250108'}]))