MikeMai committed on
Commit
0d9f3a7
·
verified ·
1 Parent(s): 9ed5434

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -5
app.py CHANGED
@@ -22,6 +22,22 @@ from typing import List, Optional
22
 
23
  HF_API_KEY = os.getenv("HF_API_KEY")
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # Configure logging to write to 'extract_po_logs.log' without using pickle
26
  logging.basicConfig(
27
  filename='extract_po_logs.log',
@@ -253,6 +269,16 @@ def process_long_table(rows):
253
  # Filter out rows where the "序号" column contains non-numeric values
254
  filtered_table_data = []
255
  for row in table_data:
 
 
 
 
 
 
 
 
 
 
256
  # Check potential serial number columns (use both Chinese and English variants)
257
  serial_number = None
258
  for column in row:
@@ -262,6 +288,10 @@ def process_long_table(rows):
262
 
263
  # If we found a serial number column, check if its value is numeric
264
  if serial_number is not None:
 
 
 
 
265
  # Strip any non-numeric characters and check if there's still a value
266
  # This keeps values like "1", "2." etc. but filters out "No." or other text
267
  cleaned_number = re.sub(r'[^\d]', '', serial_number)
@@ -427,7 +457,7 @@ Contract data in JSON format:""" + f"""
427
 
428
  # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
429
  client = OpenAI(
430
- base_url="https://router.huggingface.co/novita",
431
  api_key=HF_API_KEY,
432
  )
433
 
@@ -437,9 +467,9 @@ Contract data in JSON format:""" + f"""
437
  try:
438
  print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
439
  completion = client.chat.completions.create(
440
- model="deepseek/deepseek-r1-distill-qwen-14b",
441
  messages=messages,
442
- temperature=0.5,
443
  )
444
 
445
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
@@ -523,9 +553,21 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
523
  备注: Optional[str] = ""
524
  计划来源: Optional[str] = ""
525
  其他: Optional[dict] = Field(default_factory=dict, alias="其他")
 
 
 
 
 
 
526
 
527
  class PriceListModel(BaseModel):
528
  items: List[PriceItem]
 
 
 
 
 
 
529
 
530
  base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
531
  有时候第一行是表头,有时候是数据行,只输入数据行。
@@ -558,7 +600,7 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
558
  messages = [{"role": "user", "content": base_prompt}]
559
 
560
  client = OpenAI(
561
- base_url="https://router.huggingface.co/novita",
562
  api_key=HF_API_KEY,
563
  )
564
 
@@ -567,7 +609,7 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
567
 
568
  try:
569
  response = client.chat.completions.create(
570
- model="deepseek/deepseek-r1-distill-qwen-14b",
571
  messages=messages,
572
  )
573
  raw = response.choices[0].message.content
 
22
 
23
  HF_API_KEY = os.getenv("HF_API_KEY")
24
 
25
+ # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
26
+ base_url = "https://router.huggingface.co/novita"
27
+ model = "deepseek/deepseek-r1-distill-qwen-14b"
28
+
29
+ # Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
30
+ # base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
31
+ # model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
32
+
33
+ # Qwen 2.5 7B --------------------------------------------------------
34
+ # base_url = "https://router.huggingface.co/together/v1"
35
+ # model="Qwen/Qwen2.5-7B-Instruct-Turbo"
36
+
37
+ # Qwen 2.5 72B --------------------------------------------------------
38
+ # base_url = "https://router.huggingface.co/novita/v3/openai"
39
+ # model="qwen/qwen-2.5-72b-instruct"
40
+
41
  # Configure logging to write to 'extract_po_logs.log' without using pickle
42
  logging.basicConfig(
43
  filename='extract_po_logs.log',
 
269
  # Filter out rows where the "序号" column contains non-numeric values
270
  filtered_table_data = []
271
  for row in table_data:
272
+ # Check if any cell contains "合计" (total)
273
+ contains_total = False
274
+ for key, value in row.items():
275
+ if isinstance(value, str) and "合计" in value:
276
+ contains_total = True
277
+ break
278
+
279
+ if contains_total:
280
+ continue
281
+
282
  # Check potential serial number columns (use both Chinese and English variants)
283
  serial_number = None
284
  for column in row:
 
288
 
289
  # If we found a serial number column, check if its value is numeric
290
  if serial_number is not None:
291
+ # Skip if serial number is empty
292
+ if not serial_number.strip():
293
+ continue
294
+
295
  # Strip any non-numeric characters and check if there's still a value
296
  # This keeps values like "1", "2." etc. but filters out "No." or other text
297
  cleaned_number = re.sub(r'[^\d]', '', serial_number)
 
457
 
458
  # LLM client (endpoint and model are selected via base_url / model defined near the top of the file) ---
459
  client = OpenAI(
460
+ base_url=base_url,
461
  api_key=HF_API_KEY,
462
  )
463
 
 
467
  try:
468
  print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
469
  completion = client.chat.completions.create(
470
+ model=model,
471
  messages=messages,
472
+ temperature=0.1,
473
  )
474
 
475
  think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
 
553
  备注: Optional[str] = ""
554
  计划来源: Optional[str] = ""
555
  其他: Optional[dict] = Field(default_factory=dict, alias="其他")
556
+
557
+ class Config:
558
+ # Coerce numeric input values (int/float/Decimal) to str for str-typed fields instead of rejecting them
559
+ coerce_numbers_to_str = True
560
+ # Permit field types pydantic cannot validate natively (checked by isinstance only); unrelated to numeric parsing
561
+ arbitrary_types_allowed = True
562
 
563
  class PriceListModel(BaseModel):
564
  items: List[PriceItem]
565
+
566
+ class Config:
567
+ # Coerce numeric input values (int/float/Decimal) to str for str-typed fields instead of rejecting them
568
+ coerce_numbers_to_str = True
569
+ # Permit field types pydantic cannot validate natively (checked by isinstance only); unrelated to numeric parsing
570
+ arbitrary_types_allowed = True
571
 
572
  base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
573
  有时候第一行是表头,有时候是数据行,只输入数据行。
 
600
  messages = [{"role": "user", "content": base_prompt}]
601
 
602
  client = OpenAI(
603
+ base_url=base_url,
604
  api_key=HF_API_KEY,
605
  )
606
 
 
609
 
610
  try:
611
  response = client.chat.completions.create(
612
+ model=model,
613
  messages=messages,
614
  )
615
  raw = response.choices[0].message.content