Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -388,7 +388,18 @@ def deepseek_extract_contract_summary(json_data, save_json=False, json_filename=
|
|
| 388 |
# Step 3: Convert back to JSON string (if needed)
|
| 389 |
json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
|
| 390 |
|
| 391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
|
| 393 |
# Response Format
|
| 394 |
Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
|
|
@@ -410,7 +421,7 @@ Contract data in JSON format:""" + f"""
|
|
| 410 |
messages = [
|
| 411 |
{
|
| 412 |
"role": "user",
|
| 413 |
-
"content":
|
| 414 |
}
|
| 415 |
]
|
| 416 |
|
|
@@ -420,26 +431,73 @@ Contract data in JSON format:""" + f"""
|
|
| 420 |
api_key=HF_API_KEY,
|
| 421 |
)
|
| 422 |
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
|
| 438 |
if save_json:
|
| 439 |
with open(json_filename, "w", encoding="utf-8") as f:
|
| 440 |
-
f.write(
|
| 441 |
|
| 442 |
-
return json.dumps(
|
| 443 |
|
| 444 |
|
| 445 |
def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
|
|
@@ -450,27 +508,28 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
|
|
| 450 |
|
| 451 |
# Pydantic schema
|
| 452 |
class PriceItem(BaseModel):
|
| 453 |
-
序号: str
|
| 454 |
-
名称: str
|
| 455 |
-
名称_英文: str = Field(
|
| 456 |
-
品牌: str
|
| 457 |
-
规格: str
|
| 458 |
-
所属机型: str
|
| 459 |
-
采购数量: str
|
| 460 |
-
单位: str
|
| 461 |
-
单价: str
|
| 462 |
-
总价: str
|
| 463 |
-
几郎单价: str
|
| 464 |
-
几郎总额: str
|
| 465 |
-
备注: str
|
| 466 |
-
计划来源: str
|
| 467 |
-
其他: dict = Field(default_factory=dict, alias="其他")
|
| 468 |
|
| 469 |
class PriceListModel(BaseModel):
|
| 470 |
items: List[PriceItem]
|
| 471 |
|
| 472 |
base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
|
| 473 |
-
|
|
|
|
| 474 |
|
| 475 |
# 输出格式要求:
|
| 476 |
每个条目输出以下字段:
|
|
@@ -512,6 +571,11 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
|
|
| 512 |
messages=messages,
|
| 513 |
)
|
| 514 |
raw = response.choices[0].message.content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 515 |
|
| 516 |
# Strip out LLM artifacts
|
| 517 |
raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
|
|
|
|
| 388 |
# Step 3: Convert back to JSON string (if needed)
|
| 389 |
json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
|
| 390 |
|
| 391 |
+
# Define Pydantic model for contract summary validation
|
| 392 |
+
class ContractSummary(BaseModel):
|
| 393 |
+
合同编号: Optional[str] = ""
|
| 394 |
+
接收人: Optional[str] = ""
|
| 395 |
+
Recipient: Optional[str] = ""
|
| 396 |
+
接收地: Optional[str] = ""
|
| 397 |
+
Place_of_receipt: Optional[str] = Field("", alias="Place of receipt")
|
| 398 |
+
供应商: Optional[str] = ""
|
| 399 |
+
币种: Optional[str] = ""
|
| 400 |
+
供货日期: Optional[str] = ""
|
| 401 |
+
|
| 402 |
+
base_prompt = """You are given a contract in JSON format. Extract the following information:
|
| 403 |
|
| 404 |
# Response Format
|
| 405 |
Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
|
|
|
|
| 421 |
messages = [
|
| 422 |
{
|
| 423 |
"role": "user",
|
| 424 |
+
"content": base_prompt
|
| 425 |
}
|
| 426 |
]
|
| 427 |
|
|
|
|
| 431 |
api_key=HF_API_KEY,
|
| 432 |
)
|
| 433 |
|
| 434 |
+
# Try up to 3 times with error feedback
|
| 435 |
+
max_retries = 3
|
| 436 |
+
for attempt in range(max_retries):
|
| 437 |
+
try:
|
| 438 |
+
print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
|
| 439 |
+
completion = client.chat.completions.create(
|
| 440 |
+
model="deepseek/deepseek-r1-distill-qwen-14b",
|
| 441 |
+
messages=messages,
|
| 442 |
+
temperature=0.5,
|
| 443 |
+
)
|
| 444 |
+
|
| 445 |
+
think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
|
| 446 |
+
if think_text:
|
| 447 |
+
print(f"🧠 Thought Process: {think_text}")
|
| 448 |
+
logging.info(f"Think text: {think_text}")
|
| 449 |
+
|
| 450 |
+
contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
|
| 451 |
+
contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
|
| 452 |
+
|
| 453 |
+
# Clean up JSON before validation
|
| 454 |
+
contract_json = json.loads(contract_summary.strip())
|
| 455 |
+
validated_data = ContractSummary.model_validate(contract_json)
|
| 456 |
+
|
| 457 |
+
# Success! Return validated data
|
| 458 |
+
validated_json = json.dumps(validated_data.model_dump(by_alias=True), ensure_ascii=False, indent=4)
|
| 459 |
+
|
| 460 |
+
if save_json:
|
| 461 |
+
with open(json_filename, "w", encoding="utf-8") as f:
|
| 462 |
+
f.write(validated_json)
|
| 463 |
+
|
| 464 |
+
print(f"✅ Successfully validated contract summary on attempt {attempt + 1}")
|
| 465 |
+
return json.dumps(validated_json, ensure_ascii=False, indent=4)
|
| 466 |
+
|
| 467 |
+
except ValidationError as e:
|
| 468 |
+
error_msg = f"Validation error: {e}"
|
| 469 |
+
logging.error(f"{error_msg}")
|
| 470 |
+
logging.error(f"Input data: {contract_summary}")
|
| 471 |
+
print(f"❌ {error_msg}")
|
| 472 |
+
|
| 473 |
+
except json.JSONDecodeError as e:
|
| 474 |
+
error_msg = f"JSON decode error: {e}"
|
| 475 |
+
logging.error(f"{error_msg}")
|
| 476 |
+
logging.error(f"Input data: {contract_summary}")
|
| 477 |
+
print(f"❌ {error_msg}")
|
| 478 |
+
|
| 479 |
+
# Don't retry on the last attempt
|
| 480 |
+
if attempt < max_retries - 1:
|
| 481 |
+
# Add error message to the conversation and retry
|
| 482 |
+
messages.append({
|
| 483 |
+
"role": "assistant",
|
| 484 |
+
"content": completion.choices[0].message.content
|
| 485 |
+
})
|
| 486 |
+
messages.append({
|
| 487 |
+
"role": "user",
|
| 488 |
+
"content": f"Your response had the following error: {error_msg}. Please fix the format and provide a valid JSON response with the required fields."
|
| 489 |
+
})
|
| 490 |
+
|
| 491 |
+
# If we get here, all attempts failed - return empty but valid model
|
| 492 |
+
print("⚠️ All attempts failed, returning empty model")
|
| 493 |
+
empty_data = ContractSummary().model_dump(by_alias=True)
|
| 494 |
+
empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
|
| 495 |
|
| 496 |
if save_json:
|
| 497 |
with open(json_filename, "w", encoding="utf-8") as f:
|
| 498 |
+
f.write(empty_json)
|
| 499 |
|
| 500 |
+
return json.dumps(empty_json, ensure_ascii=False, indent=4)
|
| 501 |
|
| 502 |
|
| 503 |
def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
|
|
|
|
| 508 |
|
| 509 |
# Pydantic schema
|
| 510 |
class PriceItem(BaseModel):
|
| 511 |
+
序号: Optional[str] = ""
|
| 512 |
+
名称: Optional[str] = ""
|
| 513 |
+
名称_英文: Optional[str] = Field("", alias="名称(英文)")
|
| 514 |
+
品牌: Optional[str] = ""
|
| 515 |
+
规格: Optional[str] = ""
|
| 516 |
+
所属机型: Optional[str] = ""
|
| 517 |
+
采购数量: Optional[str] = ""
|
| 518 |
+
单位: Optional[str] = ""
|
| 519 |
+
单价: Optional[str] = ""
|
| 520 |
+
总价: Optional[str] = ""
|
| 521 |
+
几郎单价: Optional[str] = ""
|
| 522 |
+
几郎总额: Optional[str] = ""
|
| 523 |
+
备注: Optional[str] = ""
|
| 524 |
+
计划来源: Optional[str] = ""
|
| 525 |
+
其他: Optional[dict] = Field(default_factory=dict, alias="其他")
|
| 526 |
|
| 527 |
class PriceListModel(BaseModel):
|
| 528 |
items: List[PriceItem]
|
| 529 |
|
| 530 |
base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
|
| 531 |
+
有时候第一行是表头,有时候是数据行,只输入数据行。
|
| 532 |
+
请注意,输出的 JSON 需要符合以下格式要求:
|
| 533 |
|
| 534 |
# 输出格式要求:
|
| 535 |
每个条目输出以下字段:
|
|
|
|
| 571 |
messages=messages,
|
| 572 |
)
|
| 573 |
raw = response.choices[0].message.content
|
| 574 |
+
|
| 575 |
+
think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
|
| 576 |
+
if think_text:
|
| 577 |
+
print(f"🧠 Thought Process: {think_text}")
|
| 578 |
+
logging.info(f"Think text: {think_text}")
|
| 579 |
|
| 580 |
# Strip out LLM artifacts
|
| 581 |
raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
|