Spaces:

MikeMai
/

PO_Extractor_API

Sleeping

App Files Community

MikeMai committed on Jul 9, 2025

Commit

951ce76

verified ·

1 Parent(s): eec7e13

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -40

app.py CHANGED Viewed

@@ -139,9 +139,9 @@ def extract_key_value_pairs(text, target_dict=None):
 # --- Table Processing Functions ---
-def process_single_column_table(rows):
-    """Processes a single-column table and returns the extracted lines as a list."""
-    single_column_data = []
     for row in rows:
         cells = row.findall('.//w:tc', NS)
@@ -149,9 +149,9 @@ def process_single_column_table(rows):
             cell_lines = extract_text_from_cell(cells[0])  # Extract all lines from the cell
             # Append each line directly to the list without splitting
-            single_column_data.extend(cell_lines)
-    return single_column_data  # Return the list of extracted lines
 def process_buyer_seller_table(rows):
     """Processes a two-column buyer-seller table into a structured dictionary using the first row as keys."""
@@ -365,8 +365,6 @@ def process_long_table(rows):
             # If we couldn't find a serial number column, keep the row
             filtered_table_data.append(row)
-    print(f"Table process_long_table output: {filtered_table_data}")
     # Remove duplicate columns (ending with _2, _3, etc.)
     filtered_table_data = merge_duplicate_columns(filtered_table_data)
@@ -374,23 +372,25 @@ def process_long_table(rows):
 def identify_table_type_and_header_row(rows):
     """Identify table type and the index of the header row."""
-    header_keywords = ["名称", "Name", "规格", "Unit", "Quantity", "单价", "总价", "Remarks"]
     for i, row in enumerate(rows):
         num_cells = len(row.findall('.//w:tc', NS))
         if num_cells > 1:
-            cell_texts = " ".join([" ".join(extract_text_from_cell(cell)) for cell in row.findall('.//w:tc', NS)])
-            if any(keyword in cell_texts for keyword in header_keywords):
-                # Check for buyer-seller or summary table
-                if num_cells == 2:
-                    if all(len(r.findall('.//w:tc', NS)) == 2 for r in rows):
                         return "buyer_seller", i
                     else:
-                        return "summary", i
                 else:
-                    return "long_table", i
     # Fallbacks
-    if all(len(row.findall('.//w:tc', NS)) == 1 for row in rows):
-        return "single_column", 0
     return "unknown", 0
 def extract_tables(root):
@@ -431,10 +431,10 @@ def extract_tables(root):
             for paragraph in elem.findall('.//w:p', NS):
                 table_paragraphs.add(paragraph)
             table_type, header_row_index = identify_table_type_and_header_row(rows)
-            if table_type == "single_column":
-                single_column_data = process_single_column_table(rows)
-                if single_column_data:
-                    table_data[f"table_{table_index}_single_column"] = single_column_data
                 table_index += 1
                 continue
             elif table_type == "buyer_seller":
@@ -455,13 +455,7 @@ def extract_tables(root):
                     table_data[f"long_table_{table_index}"] = long_table_data
                 table_index += 1
                 continue
-            else:
-                # fallback: try to process as long table from first multi-column row
-                long_table_data = process_long_table(rows[header_row_index:])
-                if long_table_data:
-                    table_data[f"long_table_{table_index}"] = long_table_data
-                table_index += 1
-                continue
     return table_data, table_paragraphs
 # --- Non-Table Processing Functions ---
@@ -593,6 +587,8 @@ Contract data in JSON format:""" + f"""
             # Clean 合同编号 by removing all contents in brackets including the brackets themselves
             if "合同编号" in contract_json and contract_json["合同编号"]:
                 contract_json["合同编号"] = re.sub(r'[\(（].*?[\)）]', '', contract_json["合同编号"]).strip()
             validated_data = ContractSummary.model_validate(contract_json)
@@ -630,10 +626,6 @@ Contract data in JSON format:""" + f"""
     print("⚠️ All attempts failed, returning empty model")
     empty_data = ContractSummary().model_dump(by_alias=True)
-    # Clean 合同编号 by removing all contents in brackets including the brackets themselves
-    if "合同编号" in empty_data and empty_data["合同编号"]:
-        empty_data["合同编号"] = re.sub(r'[\(（].*?[\)）]', '', empty_data["合同编号"]).strip()
     empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
     if save_json:
@@ -651,6 +643,7 @@ def find_price_list_table(extracted_data, min_matches=3):
         "货描", "commodity",
     ]
     last_price_list_table = None
     # Get all long tables and sort them by key to ensure we process them in order
     long_tables = [(key, table) for key, table in extracted_data.items()
@@ -672,8 +665,9 @@ def find_price_list_table(extracted_data, min_matches=3):
         if match_count >= min_matches:
             last_price_list_table = table  # Keep the last table that meets criteria
-    return last_price_list_table
 def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
@@ -1295,14 +1289,13 @@ def extract_po(docx_path):
         # Find and rename the price list table before contract summary processing
         print("Identifying Price List table...")
         extracted_data_dict = json.loads(extracted_data)
-        price_list_table = find_price_list_table(extracted_data_dict)
         # Add the combined price list table to the extracted data
         if price_list_table:
-            # Remove all long_table keys that were used to create the price list
-            keys_to_remove = [key for key in extracted_data_dict.keys() if "long_table" in key]
-            for key in keys_to_remove:
-                del extracted_data_dict[key]
             # Add the combined price list table
             extracted_data_dict["price_list"] = price_list_table
@@ -1314,7 +1307,7 @@ def extract_po(docx_path):
             extracted_data_dict["price_list"] = []
             extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
-        print(f"✅ Extracted Data: {extracted_data}")
         # Create a copy of the data with only first row of price list for contract summary
         contract_summary_dict = json.loads(extracted_data)
@@ -1322,6 +1315,8 @@ def extract_po(docx_path):
             contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
         contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
         # Step 3: Process JSON with OpenAI to get structured output
         print("Processing Contract Summary data with AI...")
         contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
@@ -1346,7 +1341,7 @@ def extract_po(docx_path):
         # Ensure BytesIO is properly closed
         if 'docx_bytes' in locals():
             docx_bytes.close()
 # Example Usage
 # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))

 # --- Table Processing Functions ---
+def process_unknown_table(rows):
+    """Processes unknown tables and returns the extracted lines as a list."""
+    unknown_table_data = []
     for row in rows:
         cells = row.findall('.//w:tc', NS)
             cell_lines = extract_text_from_cell(cells[0])  # Extract all lines from the cell
             # Append each line directly to the list without splitting
+            unknown_table_data.extend(cell_lines)
+    return unknown_table_data  # Return the list of extracted lines
 def process_buyer_seller_table(rows):
     """Processes a two-column buyer-seller table into a structured dictionary using the first row as keys."""
             # If we couldn't find a serial number column, keep the row
             filtered_table_data.append(row)
     # Remove duplicate columns (ending with _2, _3, etc.)
     filtered_table_data = merge_duplicate_columns(filtered_table_data)
 def identify_table_type_and_header_row(rows):
     """Identify table type and the index of the header row."""
     for i, row in enumerate(rows):
         num_cells = len(row.findall('.//w:tc', NS))
         if num_cells > 1:
+            # Check for buyer-seller or summary table based on structure only
+            if num_cells == 2:
+                if all(len(r.findall('.//w:tc', NS)) == 2 for r in rows):
+                    # Check if it contains buyer/seller keywords
+                    cell_texts = " ".join([" ".join(extract_text_from_cell(cell)) for cell in row.findall('.//w:tc', NS)])
+                    buyer_seller_keywords = ["买方", "buyer", "卖方", "seller"]
+                    if any(keyword.lower() in cell_texts.lower() for keyword in buyer_seller_keywords):
                         return "buyer_seller", i
                     else:
+                        return "unknown", i
                 else:
+                    return "summary", i
+            else:
+                # For tables with more than 2 columns, process as long table
+                return "long_table", i
     # Fallbacks
     return "unknown", 0
 def extract_tables(root):
             for paragraph in elem.findall('.//w:p', NS):
                 table_paragraphs.add(paragraph)
             table_type, header_row_index = identify_table_type_and_header_row(rows)
+            if table_type == "unknown":
+                unknown_table_data = process_unknown_table(rows)
+                if unknown_table_data:
+                    table_data[f"table_{table_index}_unknown"] = unknown_table_data
                 table_index += 1
                 continue
             elif table_type == "buyer_seller":
                     table_data[f"long_table_{table_index}"] = long_table_data
                 table_index += 1
                 continue
     return table_data, table_paragraphs
 # --- Non-Table Processing Functions ---
             # Clean 合同编号 by removing all contents in brackets including the brackets themselves
             if "合同编号" in contract_json and contract_json["合同编号"]:
                 contract_json["合同编号"] = re.sub(r'[\(（].*?[\)）]', '', contract_json["合同编号"]).strip()
+                # Remove anything after "/" (including the "/" itself)
+                contract_json["合同编号"] = re.sub(r'/\s*.*$', '', contract_json["合同编号"]).strip()
             validated_data = ContractSummary.model_validate(contract_json)
     print("⚠️ All attempts failed, returning empty model")
     empty_data = ContractSummary().model_dump(by_alias=True)
     empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
     if save_json:
         "货描", "commodity",
     ]
     last_price_list_table = None
+    last_price_list_key = None
     # Get all long tables and sort them by key to ensure we process them in order
     long_tables = [(key, table) for key, table in extracted_data.items()
         if match_count >= min_matches:
             last_price_list_table = table  # Keep the last table that meets criteria
+            last_price_list_key = key  # Keep the key as well
+    return last_price_list_table, last_price_list_key
 def extract_price_list(price_list, save_json=False, json_name="price_list.json", fuzzy=False):
         # Find and rename the price list table before contract summary processing
         print("Identifying Price List table...")
         extracted_data_dict = json.loads(extracted_data)
+        price_list_table, price_list_key = find_price_list_table(extracted_data_dict)
         # Add the combined price list table to the extracted data
         if price_list_table:
+            # Remove only the specific long_table that was used to create the price list
+            if price_list_key:
+                del extracted_data_dict[price_list_key]
             # Add the combined price list table
             extracted_data_dict["price_list"] = price_list_table
             extracted_data_dict["price_list"] = []
             extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
+        # print(f"✅ Extracted Data: {extracted_data}")
         # Create a copy of the data with only first row of price list for contract summary
         contract_summary_dict = json.loads(extracted_data)
             contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
         contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
+        print(f"✅ Contract Summary Data: {contract_summary_data}")
         # Step 3: Process JSON with OpenAI to get structured output
         print("Processing Contract Summary data with AI...")
         contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
         # Ensure BytesIO is properly closed
         if 'docx_bytes' in locals():
             docx_bytes.close()
 # Example Usage
 # print(extract_po("test-contracts\GN-SMB268202501-042WJ SMB268波纹管采购合同-东营顺航.docx"))