Spaces:

MikeMai
/

PO_Extractor_API

Running

App Files Files Community

MikeMai committed on Apr 16, 2025

Commit

0d9f3a7

verified ·

1 Parent(s): 9ed5434

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -5

app.py CHANGED Viewed

@@ -22,6 +22,22 @@ from typing import List, Optional
 HF_API_KEY = os.getenv("HF_API_KEY")
 # Configure logging to write to 'extract_po_logs.log' without using pickle
 logging.basicConfig(
     filename='extract_po_logs.log',
@@ -253,6 +269,16 @@ def process_long_table(rows):
     # Filter out rows where the "序号" column contains non-numeric values
     filtered_table_data = []
     for row in table_data:
         # Check potential serial number columns (use both Chinese and English variants)
         serial_number = None
         for column in row:
@@ -262,6 +288,10 @@ def process_long_table(rows):
         # If we found a serial number column, check if its value is numeric
         if serial_number is not None:
             # Strip any non-numeric characters and check if there's still a value
             # This keeps values like "1", "2." etc. but filters out "No." or other text
             cleaned_number = re.sub(r'[^\d]', '', serial_number)
@@ -427,7 +457,7 @@ Contract data in JSON format:""" + f"""
     # Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
     client = OpenAI(
-        base_url="https://router.huggingface.co/novita",
         api_key=HF_API_KEY,
     )
@@ -437,9 +467,9 @@ Contract data in JSON format:""" + f"""
         try:
             print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
             completion = client.chat.completions.create(
-                model="deepseek/deepseek-r1-distill-qwen-14b",
                 messages=messages,
-                temperature=0.5,
             )
             think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
@@ -523,9 +553,21 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
         备注: Optional[str] = ""
         计划来源: Optional[str] = ""
         其他: Optional[dict] = Field(default_factory=dict, alias="其他")
     class PriceListModel(BaseModel):
         items: List[PriceItem]
     base_prompt = f"""你会接收到一个采购清单列表，请你提取以下字段并重新输出为一个结构化的 JSON 格式。
 有时候第一行是表头，有时候是数据行，只输入数据行。
@@ -558,7 +600,7 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
     messages = [{"role": "user", "content": base_prompt}]
     client = OpenAI(
-        base_url="https://router.huggingface.co/novita",
         api_key=HF_API_KEY,
     )
@@ -567,7 +609,7 @@ def deepseek_extract_price_list(price_list, save_json=False, json_name="price_li
         try:
             response = client.chat.completions.create(
-                model="deepseek/deepseek-r1-distill-qwen-14b",
                 messages=messages,
             )
             raw = response.choices[0].message.content

 HF_API_KEY = os.getenv("HF_API_KEY")
+# Deepseek R1 Distilled Qwen 2.5 14B --------------------------------
+base_url = "https://router.huggingface.co/novita"
+model = "deepseek/deepseek-r1-distill-qwen-14b"
+# Deepseek R1 Distilled Qwen 2.5 32B --------------------------------
+# base_url = "https://router.huggingface.co/hf-inference/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/v1"
+# model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+# Qwen 2.5 7B --------------------------------------------------------
+# base_url = "https://router.huggingface.co/together/v1"
+# model="Qwen/Qwen2.5-7B-Instruct-Turbo"
+# Qwen 2.5 72B --------------------------------------------------------
+# base_url = "https://router.huggingface.co/novita/v3/openai"
+# model="qwen/qwen-2.5-72b-instruct"
 # Configure logging to write to 'extract_po_logs.log' without using pickle
 logging.basicConfig(
     filename='extract_po_logs.log',
     # Filter out rows where the "序号" column contains non-numeric values
     filtered_table_data = []
     for row in table_data:
+        # Check if any cell contains "合计" (total)
+        contains_total = False
+        for key, value in row.items():
+            if isinstance(value, str) and "合计" in value:
+                contains_total = True
+                break
+        if contains_total:
+            continue
         # Check potential serial number columns (use both Chinese and English variants)
         serial_number = None
         for column in row:
         # If we found a serial number column, check if its value is numeric
         if serial_number is not None:
+            # Skip if serial number is empty
+            if not serial_number.strip():
+                continue
             # Strip any non-numeric characters and check if there's still a value
             # This keeps values like "1", "2." etc. but filters out "No." or other text
             cleaned_number = re.sub(r'[^\d]', '', serial_number)
     # LLM client for the endpoint selected by the module-level base_url / model settings ----
     client = OpenAI(
+        base_url=base_url,
         api_key=HF_API_KEY,
     )
         try:
             print(f"🔄 LLM attempt {attempt + 1} of {max_retries}")
             completion = client.chat.completions.create(
+                model=model,
                 messages=messages,
+                temperature=0.1,
             )
             think_text = re.findall(r"<think>(.*?)</think>", completion.choices[0].message.content, flags=re.DOTALL)
         备注: Optional[str] = ""
         计划来源: Optional[str] = ""
         其他: Optional[dict] = Field(default_factory=dict, alias="其他")
+        class Config:
+            # Accept numeric input (e.g. int/float) for str-typed fields by coercing it to str
+            coerce_numbers_to_str = True
+            # Permits fields annotated with arbitrary (non-pydantic) Python types; it does not affect numeric-string parsing
+            arbitrary_types_allowed = True
     class PriceListModel(BaseModel):
         items: List[PriceItem]
+        class Config:
+            # Coerce numeric input to str for str-typed fields. NOTE(review): this model only declares `items`; confirm the setting is needed here
+            coerce_numbers_to_str = True
+            # Permits fields annotated with arbitrary (non-pydantic) Python types; it does not affect numeric-string parsing
+            arbitrary_types_allowed = True
     base_prompt = f"""你会接收到一个采购清单列表，请你提取以下字段并重新输出为一个结构化的 JSON 格式。
 有时候第一行是表头，有时候是数据行，只输入数据行。
     messages = [{"role": "user", "content": base_prompt}]
     client = OpenAI(
+        base_url=base_url,
         api_key=HF_API_KEY,
     )
         try:
             response = client.chat.completions.create(
+                model=model,
                 messages=messages,
             )
             raw = response.choices[0].message.content