MikeMai committed on
Commit
a4dc2f9
·
verified ·
1 Parent(s): 7556a1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -94
app.py CHANGED
@@ -14,8 +14,6 @@ from openai import OpenAI
14
 
15
  import re
16
 
17
- import logging
18
-
19
  from pydantic import BaseModel, Field, ValidationError, RootModel
20
  from typing import List, Optional
21
 
@@ -42,14 +40,6 @@ model="Qwen/Qwen2.5-7B-Instruct-Turbo"
42
  # base_url = "https://router.huggingface.co/sambanova/v1"
43
  # model="Qwen3-32B"
44
 
45
- # Configure logging to write to 'zaoju_logs.log' without using pickle
46
- logging.basicConfig(
47
- filename='extract_po_logs.log',
48
- level=logging.INFO,
49
- format='%(asctime)s - %(levelname)s - %(message)s',
50
- encoding='utf-8'
51
- )
52
-
53
  # Default Word XML namespace
54
  DEFAULT_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
55
  NS = None # Global variable to store the namespace
@@ -548,7 +538,6 @@ Contract data in JSON format:""" + f"""
548
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
549
  if think_text:
550
  print(f"🧠 Thought Process: {think_text}")
551
- logging.info(f"Think text: {think_text}")
552
 
553
  contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
554
  contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
@@ -569,14 +558,10 @@ Contract data in JSON format:""" + f"""
569
 
570
  except ValidationError as e:
571
  error_msg = f"Validation error: {e}"
572
- logging.error(f"{error_msg}")
573
- logging.error(f"Input data: {contract_summary}")
574
  print(f"❌ {error_msg}")
575
 
576
  except json.JSONDecodeError as e:
577
  error_msg = f"JSON decode error: {e}"
578
- logging.error(f"{error_msg}")
579
- logging.error(f"Input data: {contract_summary}")
580
  print(f"❌ {error_msg}")
581
 
582
  # Don't retry on the last attempt
@@ -867,7 +852,6 @@ Do not force map 名称(英文) to 单价
867
  think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
868
  if think_text:
869
  print(f"🧠 Thought Process: {think_text}")
870
- logging.info(f"Think text: {think_text}")
871
 
872
  raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
873
  # Remove any backticks or json tags
@@ -903,7 +887,6 @@ Do not force map 名称(英文) to 单价
903
 
904
  except Exception as e:
905
  error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
906
- logging.error(f"{error_msg}")
907
  print(f"❌ {error_msg}")
908
 
909
  if attempt < max_retries - 1:
@@ -1120,79 +1103,72 @@ def extract_po(docx_path):
1120
  with open(docx_path, "rb") as f:
1121
  docx_bytes = BytesIO(f.read())
1122
 
1123
- # Step 1: Extract XML content from DOCX
1124
- print("Extracting Docs data to XML...")
1125
- xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
1126
  try:
1127
- xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
1128
- get_namespace(ET.fromstring(xml_file))
1129
- except (zipfile.BadZipFile, KeyError):
1130
- raise ValueError(f"Invalid file: {docx_path}")
1131
-
1132
- # Step 2: Extract tables from DOCX and save JSON
1133
- print("Extracting XML data to JSON...")
1134
- json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1135
- extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
1136
-
1137
- # Find and rename the price list table before contract summary processing
1138
- print("Identifying Price List table...")
1139
- extracted_data_dict = json.loads(extracted_data)
1140
- price_list_table = find_price_list_table(extracted_data_dict)
1141
-
1142
- # Rename the price list table key
1143
- if price_list_table:
1144
- # Find and rename the key containing the price list table
1145
- for key in list(extracted_data_dict.keys()):
1146
- if "long_table" in key and extracted_data_dict[key] == price_list_table:
1147
- extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
1148
- break
1149
- # Update the extracted_data string with proper formatting
1150
- extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1151
- else:
1152
- print("⚠️ No suitable price list table found!")
1153
- extracted_data_dict["price_list"] = []
1154
- extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1155
-
1156
- print(f" Extracted Data: {extracted_data}")
1157
-
1158
- # Create a copy of the data with only first row of price list for contract summary
1159
- contract_summary_dict = json.loads(extracted_data)
1160
- if contract_summary_dict.get("price_list"):
1161
- contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
1162
- contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
1163
-
1164
- # Step 3: Process JSON with OpenAI to get structured output
1165
- print("Processing Contract Summary data with AI...")
1166
- contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1167
- contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1168
-
1169
- # Process the price list
1170
- print("Processing Price List data with AI...")
1171
- price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1172
- price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
 
 
 
 
1173
 
1174
- # Step 4: Combine contract summary and long table data into a single JSON object
1175
- print("Combining AI Generated JSON with Extracted Data...")
1176
-
1177
- combined_data = {
1178
- "contract_summary": json.loads(json.loads(contract_summary)),
1179
- "price_list": price_list
1180
- }
1181
 
1182
- # Logging
1183
- log = f"""Results:
1184
-
1185
- Contract Summary: {contract_summary},
1186
-
1187
- RAW Extracted Data: {extracted_data},
1188
-
1189
- Combined JSON: {json.dumps(combined_data, ensure_ascii=False, indent=4)}"""
1190
-
1191
- # print(log)
1192
- # print(f"🔄 Extracted Data: {combined_data}")
1193
- logging.info(f"""{log}""")
1194
-
1195
- return combined_data
1196
 
1197
  # Example Usage
1198
 
@@ -1206,13 +1182,6 @@ def extract_po(docx_path):
1206
  import gradio as gr
1207
  from gradio.themes.base import Base
1208
 
1209
- # def extract_po_api(docx_path):
1210
- # try:
1211
- # return extract_po(docx_path)
1212
- # except Exception as e:
1213
- # # Return error details in the API response
1214
- # return {"error":str(e)}
1215
-
1216
  interface = gr.Interface(
1217
  fn=extract_po,
1218
  title="PO Extractor 买卖合同数据提取",
 
14
 
15
  import re
16
 
 
 
17
  from pydantic import BaseModel, Field, ValidationError, RootModel
18
  from typing import List, Optional
19
 
 
40
  # base_url = "https://router.huggingface.co/sambanova/v1"
41
  # model="Qwen3-32B"
42
 
 
 
 
 
 
 
 
 
43
  # Default Word XML namespace
44
  DEFAULT_NS = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
45
  NS = None # Global variable to store the namespace
 
538
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
539
  if think_text:
540
  print(f"🧠 Thought Process: {think_text}")
 
541
 
542
  contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
543
  contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
 
558
 
559
  except ValidationError as e:
560
  error_msg = f"Validation error: {e}"
 
 
561
  print(f"❌ {error_msg}")
562
 
563
  except json.JSONDecodeError as e:
564
  error_msg = f"JSON decode error: {e}"
 
 
565
  print(f"❌ {error_msg}")
566
 
567
  # Don't retry on the last attempt
 
852
  think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
853
  if think_text:
854
  print(f"🧠 Thought Process: {think_text}")
 
855
 
856
  raw_mapping = re.sub(r"<think>.*?</think>\s*", "", raw_mapping, flags=re.DOTALL) # Remove think
857
  # Remove any backticks or json tags
 
887
 
888
  except Exception as e:
889
  error_msg = f"Error in AI mapping attempt {attempt + 1}: {e}"
 
890
  print(f"❌ {error_msg}")
891
 
892
  if attempt < max_retries - 1:
 
1103
  with open(docx_path, "rb") as f:
1104
  docx_bytes = BytesIO(f.read())
1105
 
 
 
 
1106
  try:
1107
+ # Step 1: Extract XML content from DOCX
1108
+ print("Extracting Docs data to XML...")
1109
+ xml_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_document.xml"
1110
+ try:
1111
+ xml_file = extract_docx_as_xml(docx_bytes, save_xml=False, xml_filename=xml_filename)
1112
+ get_namespace(ET.fromstring(xml_file))
1113
+ except (zipfile.BadZipFile, KeyError):
1114
+ raise ValueError(f"Invalid file: {docx_path}")
1115
+
1116
+ # Step 2: Extract tables from DOCX and save JSON
1117
+ print("Extracting XML data to JSON...")
1118
+ json_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_extracted_data.json"
1119
+ extracted_data = xml_to_json(xml_file, save_json=False, json_filename=json_filename)
1120
+
1121
+ # Find and rename the price list table before contract summary processing
1122
+ print("Identifying Price List table...")
1123
+ extracted_data_dict = json.loads(extracted_data)
1124
+ price_list_table = find_price_list_table(extracted_data_dict)
1125
+
1126
+ # Rename the price list table key
1127
+ if price_list_table:
1128
+ # Find and rename the key containing the price list table
1129
+ for key in list(extracted_data_dict.keys()):
1130
+ if "long_table" in key and extracted_data_dict[key] == price_list_table:
1131
+ extracted_data_dict["price_list"] = extracted_data_dict.pop(key)
1132
+ break
1133
+ # Update the extracted_data string with proper formatting
1134
+ extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1135
+ else:
1136
+ print("⚠️ No suitable price list table found!")
1137
+ extracted_data_dict["price_list"] = []
1138
+ extracted_data = json.dumps(extracted_data_dict, ensure_ascii=False, indent=4)
1139
+
1140
+ print(f"✅ Extracted Data: {extracted_data}")
1141
+
1142
+ # Create a copy of the data with only first row of price list for contract summary
1143
+ contract_summary_dict = json.loads(extracted_data)
1144
+ if contract_summary_dict.get("price_list"):
1145
+ contract_summary_dict["price_list"] = [contract_summary_dict["price_list"][0]] if contract_summary_dict["price_list"] else []
1146
+ contract_summary_data = json.dumps(contract_summary_dict, ensure_ascii=False, indent=4)
1147
+
1148
+ # Step 3: Process JSON with OpenAI to get structured output
1149
+ print("Processing Contract Summary data with AI...")
1150
+ contract_summary_filename = os.path.splitext(os.path.basename(docx_path))[0] + "_contract_summary.json"
1151
+ contract_summary = deepseek_extract_contract_summary(contract_summary_data, save_json=False, json_filename=contract_summary_filename)
1152
+
1153
+ # Process the price list
1154
+ print("Processing Price List data with AI...")
1155
+ price_list_filename = os.path.join(os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0] + "_price_list.json")
1156
+ price_list = extract_price_list(price_list_table, save_json=False, json_name=price_list_filename)
1157
 
1158
+ # Step 4: Combine contract summary and long table data into a single JSON object
1159
+ print("Combining AI Generated JSON with Extracted Data...")
1160
+
1161
+ combined_data = {
1162
+ "contract_summary": json.loads(json.loads(contract_summary)),
1163
+ "price_list": price_list
1164
+ }
1165
 
1166
+ return combined_data
1167
+
1168
+ finally:
1169
+ # Ensure BytesIO is properly closed
1170
+ if 'docx_bytes' in locals():
1171
+ docx_bytes.close()
 
 
 
 
 
 
 
 
1172
 
1173
  # Example Usage
1174
 
 
1182
  import gradio as gr
1183
  from gradio.themes.base import Base
1184
 
 
 
 
 
 
 
 
1185
  interface = gr.Interface(
1186
  fn=extract_po,
1187
  title="PO Extractor 买卖合同数据提取",