Spaces:

MikeMai
/

PO_Extractor_API

Running

App Files Community

MikeMai committed on Apr 14, 2025

Commit

acc27da

verified ·

1 Parent(s): a5e3028

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -34

app.py CHANGED Viewed

@@ -388,7 +388,18 @@ def deepseek_extract_contract_summary(json_data, save_json=False, json_filename=
     # Step 3: Convert back to JSON string (if needed)
     json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
-    prompt = """You are given a contract in JSON format. Extract the following information:
 # Response Format
 Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
@@ -410,7 +421,7 @@ Contract data in JSON format:""" + f"""
     messages = [
         {
             "role": "user",
-            "content": prompt
         }
     ]
@@ -420,26 +431,73 @@ Contract data in JSON format:""" + f"""
         api_key=HF_API_KEY,
     )
-    completion = client.chat.completions.create(
-        model="deepseek/deepseek-r1-distill-qwen-14b",
-        messages=messages,
-        temperature=0.5,
-    )
-    think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
-    if think_text:
-        print(f"Thought Process: {think_text}")
-        logging.info(f"Think text: {think_text}")
-    contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
-    contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
     if save_json:
         with open(json_filename, "w", encoding="utf-8") as f:
-            f.write(contract_summary)
-    return json.dumps(contract_summary, ensure_ascii=False, indent=4)
 def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
@@ -450,27 +508,28 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
     # Pydantic schema
     class PriceItem(BaseModel):
-        序号: str
-        名称: str
-        名称_英文: str = Field(..., alias="名称(英文)")
-        品牌: str
-        规格: str
-        所属机型: str
-        采购数量: str
-        单位: str
-        单价: str
-        总价: str
-        几郎单价: str
-        几郎总额: str
-        备注: str
-        计划来源: str
-        其他: dict = Field(default_factory=dict, alias="其他")
     class PriceListModel(BaseModel):
         items: List[PriceItem]
     base_prompt = f"""你会接收到一个采购清单列表，请你提取以下字段并重新输出为一个结构化的 JSON 格式。
-有时候第一行是表头，有时候是数据行，只输入数据行。请注意，输出的 JSON 需要符合以下格式要求：
 # 输出格式要求：
 每个条目输出以下字段：
@@ -512,6 +571,11 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
                 messages=messages,
             )
             raw = response.choices[0].message.content
             # Strip out LLM artifacts
             raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)

     # Step 3: Convert back to JSON string (if needed)
     json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
+    # Define Pydantic model for contract summary validation
+    class ContractSummary(BaseModel):
+        合同编号: Optional[str] = ""
+        接收人: Optional[str] = ""
+        Recipient: Optional[str] = ""
+        接收地: Optional[str] = ""
+        Place_of_receipt: Optional[str] = Field("", alias="Place of receipt")
+        供应商: Optional[str] = ""
+        币种: Optional[str] = ""
+        供货日期: Optional[str] = ""
+    base_prompt = """You are given a contract in JSON format. Extract the following information:
 # Response Format
 Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
     messages = [
         {
             "role": "user",
+            "content": base_prompt
         }
     ]
         api_key=HF_API_KEY,
     )
+    # Try up to 3 times with error feedback
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
+            completion = client.chat.completions.create(
+                model="deepseek/deepseek-r1-distill-qwen-14b",
+                messages=messages,
+                temperature=0.5,
+            )
+            think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
+            if think_text:
+                print(f"🧠 Thought Process: {think_text}")
+                logging.info(f"Think text: {think_text}")
+            contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
+            contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
+            # Clean up JSON before validation
+            contract_json = json.loads(contract_summary.strip())
+            validated_data = ContractSummary.model_validate(contract_json)
+            # Success! Return validated data
+            validated_json = json.dumps(validated_data.model_dump(by_alias=True), ensure_ascii=False, indent=4)
+            if save_json:
+                with open(json_filename, "w", encoding="utf-8") as f:
+                    f.write(validated_json)
+            print(f"✅ Successfully validated contract summary on attempt {attempt + 1}")
+            return json.dumps(validated_json, ensure_ascii=False, indent=4)
+        except ValidationError as e:
+            error_msg = f"Validation error: {e}"
+            logging.error(f"{error_msg}")
+            logging.error(f"Input data: {contract_summary}")
+            print(f"❌ {error_msg}")
+        except json.JSONDecodeError as e:
+            error_msg = f"JSON decode error: {e}"
+            logging.error(f"{error_msg}")
+            logging.error(f"Input data: {contract_summary}")
+            print(f"❌ {error_msg}")
+        # Don't retry on the last attempt
+        if attempt < max_retries - 1:
+            # Add error message to the conversation and retry
+            messages.append({
+                "role": "assistant",
+                "content": completion.choices[0].message.content
+            })
+            messages.append({
+                "role": "user",
+                "content": f"Your response had the following error: {error_msg}. Please fix the format and provide a valid JSON response with the required fields."
+            })
+    # If we get here, all attempts failed - return empty but valid model
+    print("⚠️ All attempts failed, returning empty model")
+    empty_data = ContractSummary().model_dump(by_alias=True)
+    empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
     if save_json:
         with open(json_filename, "w", encoding="utf-8") as f:
+            f.write(empty_json)
+    return json.dumps(empty_json, ensure_ascii=False, indent=4)
 def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
     # Pydantic schema
     class PriceItem(BaseModel):
+        序号: Optional[str] = ""
+        名称: Optional[str] = ""
+        名称_英文: Optional[str] = Field("", alias="名称(英文)")
+        品牌: Optional[str] = ""
+        规格: Optional[str] = ""
+        所属机型: Optional[str] = ""
+        采购数量: Optional[str] = ""
+        单位: Optional[str] = ""
+        单价: Optional[str] = ""
+        总价: Optional[str] = ""
+        几郎单价: Optional[str] = ""
+        几郎总额: Optional[str] = ""
+        备注: Optional[str] = ""
+        计划来源: Optional[str] = ""
+        其他: Optional[dict] = Field(default_factory=dict, alias="其他")
     class PriceListModel(BaseModel):
         items: List[PriceItem]
     base_prompt = f"""你会接收到一个采购清单列表，请你提取以下字段并重新输出为一个结构化的 JSON 格式。
+有时候第一行是表头，有时候是数据行，只输入数据行。
+请注意，输出的 JSON 需要符合以下格式要求：
 # 输出格式要求：
 每个条目输出以下字段：
                 messages=messages,
             )
             raw = response.choices[0].message.content
+            think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
+            if think_text:
+                print(f"🧠 Thought Process: {think_text}")
+                logging.info(f"Think text: {think_text}")
             # Strip out LLM artifacts
             raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)