Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -22,6 +22,22 @@ from typing import List, Optional
|
|
| 22 |
|
| 23 |
HF_API_KEY = os.getenv("HF_API_KEY")
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# Configure logging to write to 'extract_po_logs.log' without using pickle
|
| 26 |
logging.basicConfig(
|
| 27 |
filename='extract_po_logs.log',
|
|
@@ -253,6 +269,16 @@ def process_long_table(rows):
|
|
| 253 |
# Filter out rows where the "序号" column contains non-numeric values
|
| 254 |
filtered_table_data = []
|
| 255 |
for row in table_data:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
# Check potential serial number columns (use both Chinese and English variants)
|
| 257 |
serial_number = None
|
| 258 |
for column in row:
|
|
@@ -262,6 +288,10 @@ def process_long_table(rows):
|
|
| 262 |
|
| 263 |
# If we found a serial number column, check if its value is numeric
|
| 264 |
if serial_number is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
# Strip any non-numeric characters and check if there's still a value
|
| 266 |
# This keeps values like "1", "2." etc. but filters out "No." or other text
|
| 267 |
cleaned_number = re.sub(r'[^\d]', '', serial_number)
|
|
@@ -427,7 +457,7 @@ Contract data in JSON format:""" + f"""
|
|
| 427 |
|
| 428 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
| 429 |
client = OpenAI(
|
| 430 |
-
base_url=
|
| 431 |
api_key=HF_API_KEY,
|
| 432 |
)
|
| 433 |
|
|
@@ -437,9 +467,9 @@ Contract data in JSON format:""" + f"""
|
|
| 437 |
try:
|
| 438 |
print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
|
| 439 |
completion = client.chat.completions.create(
|
| 440 |
-
model=
|
| 441 |
messages=messages,
|
| 442 |
-
temperature=0.
|
| 443 |
)
|
| 444 |
|
| 445 |
think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
|
|
@@ -523,9 +553,21 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
|
|
| 523 |
备注: Optional[str] = ""
|
| 524 |
计划来源: Optional[str] = ""
|
| 525 |
其他: Optional[dict] = Field(default_factory=dict, alias="其他")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
|
| 527 |
class PriceListModel(BaseModel):
|
| 528 |
items: List[PriceItem]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 529 |
|
| 530 |
base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
|
| 531 |
有时候第一行是表头,有时候是数据行,只输入数据行。
|
|
@@ -558,7 +600,7 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
|
|
| 558 |
messages = [{"role": "user", "content": base_prompt}]
|
| 559 |
|
| 560 |
client = OpenAI(
|
| 561 |
-
base_url=
|
| 562 |
api_key=HF_API_KEY,
|
| 563 |
)
|
| 564 |
|
|
@@ -567,7 +609,7 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
|
|
| 567 |
|
| 568 |
try:
|
| 569 |
response = client.chat.completions.create(
|
| 570 |
-
model=
|
| 571 |
messages=messages,
|
| 572 |
)
|
| 573 |
raw = response.choices[0].message.content
|
|
|
|
| 22 |
|
| 23 |
HF_API_KEY = os.getenv("HF_API_KEY")
|
| 24 |
|
| 25 |
+
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
| 26 |
+
base_url = "https://router.huggingface.co/novita"
|
| 27 |
+
model = "deepseek/deepseek-r1-distill-qwen-14b"
|
| 28 |
+
|
| 29 |
+
# Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
|
| 30 |
+
# base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
|
| 31 |
+
# model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
|
| 32 |
+
|
| 33 |
+
# Qwen 2.5 7B --------------------------------------------------------
|
| 34 |
+
# base_url = "https://router.huggingface.co/together/v1"
|
| 35 |
+
# model="Qwen/Qwen2.5-7B-Instruct-Turbo"
|
| 36 |
+
|
| 37 |
+
# Qwen 2.5 72B --------------------------------------------------------
|
| 38 |
+
# base_url = "https://router.huggingface.co/novita/v3/openai"
|
| 39 |
+
# model="qwen/qwen-2.5-72b-instruct"
|
| 40 |
+
|
| 41 |
# Configure logging to write to 'extract_po_logs.log' without using pickle
|
| 42 |
logging.basicConfig(
|
| 43 |
filename='extract_po_logs.log',
|
|
|
|
| 269 |
# Filter out rows where the "序号" column contains non-numeric values
|
| 270 |
filtered_table_data = []
|
| 271 |
for row in table_data:
|
| 272 |
+
# Check if any cell contains "合计" (total)
|
| 273 |
+
contains_total = False
|
| 274 |
+
for key, value in row.items():
|
| 275 |
+
if isinstance(value, str) and "合计" in value:
|
| 276 |
+
contains_total = True
|
| 277 |
+
break
|
| 278 |
+
|
| 279 |
+
if contains_total:
|
| 280 |
+
continue
|
| 281 |
+
|
| 282 |
# Check potential serial number columns (use both Chinese and English variants)
|
| 283 |
serial_number = None
|
| 284 |
for column in row:
|
|
|
|
| 288 |
|
| 289 |
# If we found a serial number column, check if its value is numeric
|
| 290 |
if serial_number is not None:
|
| 291 |
+
# Skip if serial number is empty
|
| 292 |
+
if not serial_number.strip():
|
| 293 |
+
continue
|
| 294 |
+
|
| 295 |
# Strip any non-numeric characters and check if there's still a value
|
| 296 |
# This keeps values like "1", "2." etc. but filters out "No." or other text
|
| 297 |
cleaned_number = re.sub(r'[^\d]', '', serial_number)
|
|
|
|
| 457 |
|
| 458 |
# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
|
| 459 |
client = OpenAI(
|
| 460 |
+
base_url=base_url,
|
| 461 |
api_key=HF_API_KEY,
|
| 462 |
)
|
| 463 |
|
|
|
|
| 467 |
try:
|
| 468 |
print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
|
| 469 |
completion = client.chat.completions.create(
|
| 470 |
+
model=model,
|
| 471 |
messages=messages,
|
| 472 |
+
temperature=0.1,
|
| 473 |
)
|
| 474 |
|
| 475 |
think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
|
|
|
|
| 553 |
备注: Optional[str] = ""
|
| 554 |
计划来源: Optional[str] = ""
|
| 555 |
其他: Optional[dict] = Field(default_factory=dict, alias="其他")
|
| 556 |
+
|
| 557 |
+
class Config:
|
| 558 |
+
# Coerce numeric input values (int/float/Decimal) to str for str-typed fields instead of rejecting them
|
| 559 |
+
coerce_numbers_to_str = True
|
| 560 |
+
# Permit fields of arbitrary (non-Pydantic) types; this setting does not affect numeric-string parsing
|
| 561 |
+
arbitrary_types_allowed = True
|
| 562 |
|
| 563 |
class PriceListModel(BaseModel):
|
| 564 |
items: List[PriceItem]
|
| 565 |
+
|
| 566 |
+
class Config:
|
| 567 |
+
# Coerce numeric input values (int/float/Decimal) to str for str-typed fields instead of rejecting them
|
| 568 |
+
coerce_numbers_to_str = True
|
| 569 |
+
# Permit fields of arbitrary (non-Pydantic) types; this setting does not affect numeric-string parsing
|
| 570 |
+
arbitrary_types_allowed = True
|
| 571 |
|
| 572 |
base_prompt = f"""你会接收到一个采购清单列表,请你提取以下字段并重新输出为一个结构化的 JSON 格式。
|
| 573 |
有时候第一行是表头,有时候是数据行,只输入数据行。
|
|
|
|
| 600 |
messages = [{"role": "user", "content": base_prompt}]
|
| 601 |
|
| 602 |
client = OpenAI(
|
| 603 |
+
base_url=base_url,
|
| 604 |
api_key=HF_API_KEY,
|
| 605 |
)
|
| 606 |
|
|
|
|
| 609 |
|
| 610 |
try:
|
| 611 |
response = client.chat.completions.create(
|
| 612 |
+
model=model,
|
| 613 |
messages=messages,
|
| 614 |
)
|
| 615 |
raw = response.choices[0].message.content
|