MikeMai committed on
Commit
acc27da
·
verified ·
1 Parent(s): a5e3028

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -34
app.py CHANGED
@@ -388,7 +388,18 @@ def deepseek_extract_contract_summary(json_data, save_json=False, json_filename=
388
  # Step 3: Convert back to JSON string (if needed)
389
  json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
390
 
391
- prompt = """You are given a contract in JSON format. Extract the following information:
 
 
 
 
 
 
 
 
 
 
 
392
 
393
  # Response Format
394
  Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
@@ -410,7 +421,7 @@ Contract data in JSON format:""" + f"""
410
  messages = [
411
  {
412
  "role": "user",
413
- "content": prompt
414
  }
415
  ]
416
 
@@ -420,26 +431,73 @@ Contract data in JSON format:""" + f"""
420
  api_key=HF_API_KEY,
421
  )
422
 
423
- completion = client.chat.completions.create(
424
- model="deepseek/deepseek-r1-distill-qwen-14b",
425
- messages=messages,
426
- temperature=0.5,
427
- )
428
-
429
- think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
430
- if think_text:
431
- print(f"Thought Process: {think_text}")
432
- logging.info(f"Think text: {think_text}")
433
-
434
- contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
435
-
436
- contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
 
438
  if save_json:
439
  with open(json_filename, "w", encoding="utf-8") as f:
440
- f.write(contract_summary)
441
 
442
- return json.dumps(contract_summary, ensure_ascii=False, indent=4)
443
 
444
 
445
  def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
@@ -450,27 +508,28 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
450
 
451
  # Pydantic schema
452
  class PriceItem(BaseModel):
453
- 序号: str
454
- 名称: str
455
- 名称_英文: str = Field(..., alias="名称(英文)")
456
- 品牌: str
457
- 规格: str
458
- 所属机型: str
459
- 采购数量: str
460
- 单位: str
461
- 单价: str
462
- 总价: str
463
- 几郎单价: str
464
- 几郎总额: str
465
- 备注: str
466
- 计划来源: str
467
- 其他: dict = Field(default_factory=dict, alias="其他")
468
 
469
  class PriceListModel(BaseModel):
470
  items: List[PriceItem]
471
 
472
  base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
473
- 有时候第一行是表头,有时候是数据行,只输入数据行。请注意,输出的 JSON 需要符合以下格式要求:
 
474
 
475
  # 输出格式要求:
476
  每个条目输出以下字段:
@@ -512,6 +571,11 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
512
  messages=messages,
513
  )
514
  raw = response.choices[0].message.content
 
 
 
 
 
515
 
516
  # Strip out LLM artifacts
517
  raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)
 
388
  # Step 3: Convert back to JSON string (if needed)
389
  json_output = json.dumps(contract_data, ensure_ascii=False, indent=4)
390
 
391
+ # Define Pydantic model for contract summary validation
392
+ class ContractSummary(BaseModel):
393
+ 合同编号: Optional[str] = ""
394
+ 接收人: Optional[str] = ""
395
+ Recipient: Optional[str] = ""
396
+ 接收地: Optional[str] = ""
397
+ Place_of_receipt: Optional[str] = Field("", alias="Place of receipt")
398
+ 供应商: Optional[str] = ""
399
+ 币种: Optional[str] = ""
400
+ 供货日期: Optional[str] = ""
401
+
402
+ base_prompt = """You are given a contract in JSON format. Extract the following information:
403
 
404
  # Response Format
405
  Return the extracted information as a structured JSON in the exact format shown below (Note: Do not repeat any keys, if unsure leave the value empty):
 
421
  messages = [
422
  {
423
  "role": "user",
424
+ "content": base_prompt
425
  }
426
  ]
427
 
 
431
  api_key=HF_API_KEY,
432
  )
433
 
434
+ # Try up to 3 times with error feedback
435
+ max_retries = 3
436
+ for attempt in range(max_retries):
437
+ try:
438
+ print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
439
+ completion = client.chat.completions.create(
440
+ model="deepseek/deepseek-r1-distill-qwen-14b",
441
+ messages=messages,
442
+ temperature=0.5,
443
+ )
444
+
445
+ think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
446
+ if think_text:
447
+ print(f"🧠 Thought Process: {think_text}")
448
+ logging.info(f"Think text: {think_text}")
449
+
450
+ contract_summary = re.sub(r"<think>.*?</think>\s*", "", completion.choices[0].message.content, flags=re.DOTALL) # Remove think
451
+ contract_summary = re.sub(r"^```json\n|```$", "", contract_summary, flags=re.DOTALL) # Remove ```
452
+
453
+ # Clean up JSON before validation
454
+ contract_json = json.loads(contract_summary.strip())
455
+ validated_data = ContractSummary.model_validate(contract_json)
456
+
457
+ # Success! Return validated data
458
+ validated_json = json.dumps(validated_data.model_dump(by_alias=True), ensure_ascii=False, indent=4)
459
+
460
+ if save_json:
461
+ with open(json_filename, "w", encoding="utf-8") as f:
462
+ f.write(validated_json)
463
+
464
+ print(f"✅ Successfully validated contract summary on attempt {attempt + 1}")
465
+ return json.dumps(validated_json, ensure_ascii=False, indent=4)
466
+
467
+ except ValidationError as e:
468
+ error_msg = f"Validation error: {e}"
469
+ logging.error(f"{error_msg}")
470
+ logging.error(f"Input data: {contract_summary}")
471
+ print(f"❌ {error_msg}")
472
+
473
+ except json.JSONDecodeError as e:
474
+ error_msg = f"JSON decode error: {e}"
475
+ logging.error(f"{error_msg}")
476
+ logging.error(f"Input data: {contract_summary}")
477
+ print(f"❌ {error_msg}")
478
+
479
+ # Don't retry on the last attempt
480
+ if attempt < max_retries - 1:
481
+ # Add error message to the conversation and retry
482
+ messages.append({
483
+ "role": "assistant",
484
+ "content": completion.choices[0].message.content
485
+ })
486
+ messages.append({
487
+ "role": "user",
488
+ "content": f"Your response had the following error: {error_msg}. Please fix the format and provide a valid JSON response with the required fields."
489
+ })
490
+
491
+ # If we get here, all attempts failed - return empty but valid model
492
+ print("⚠️ All attempts failed, returning empty model")
493
+ empty_data = ContractSummary().model_dump(by_alias=True)
494
+ empty_json = json.dumps(empty_data, ensure_ascii=False, indent=4)
495
 
496
  if save_json:
497
  with open(json_filename, "w", encoding="utf-8") as f:
498
+ f.write(empty_json)
499
 
500
+ return json.dumps(empty_json, ensure_ascii=False, indent=4)
501
 
502
 
503
  def deepseek_extract_price_list(price_list, save_json=False, json_name="price_list.json"):
 
508
 
509
  # Pydantic schema
510
  class PriceItem(BaseModel):
511
+ 序号: Optional[str] = ""
512
+ 名称: Optional[str] = ""
513
+ 名称_英文: Optional[str] = Field("", alias="名称(英文)")
514
+ 品牌: Optional[str] = ""
515
+ 规格: Optional[str] = ""
516
+ 所属机型: Optional[str] = ""
517
+ 采购数量: Optional[str] = ""
518
+ 单位: Optional[str] = ""
519
+ 单价: Optional[str] = ""
520
+ 总价: Optional[str] = ""
521
+ 几郎单价: Optional[str] = ""
522
+ 几郎总额: Optional[str] = ""
523
+ 备注: Optional[str] = ""
524
+ 计划来源: Optional[str] = ""
525
+ 其他: Optional[dict] = Field(default_factory=dict, alias="其他")
526
 
527
  class PriceListModel(BaseModel):
528
  items: List[PriceItem]
529
 
530
  base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
531
+ 有时候第一行是表头,有时候是数据行,只输入数据行。
532
+ 请注意,输出的 JSON 需要符合以下格式要求:
533
 
534
  # 输出格式要求:
535
  每个条目输出以下字段:
 
571
  messages=messages,
572
  )
573
  raw = response.choices[0].message.content
574
+
575
+ think_text = re.findall(r"<think>(.*?)</think>", response.choices[0].message.content, flags=re.DOTALL)
576
+ if think_text:
577
+ print(f"🧠 Thought Process: {think_text}")
578
+ logging.info(f"Think text: {think_text}")
579
 
580
  # Strip out LLM artifacts
581
  raw = re.sub(r"<think>.*?</think>\s*", "", raw, flags=re.DOTALL)