Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1617,7 +1617,6 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
|
| 1617 |
try:
|
| 1618 |
result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
|
| 1619 |
jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
|
| 1620 |
-
print(jsons)
|
| 1621 |
if not result:
|
| 1622 |
df = pd.DataFrame([{
|
| 1623 |
"text": None,
|
|
@@ -1628,33 +1627,54 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
|
| 1628 |
"System Message": "No headers were identified by the LLM."
|
| 1629 |
}])
|
| 1630 |
else:
|
| 1631 |
-
print('here')
|
| 1632 |
df = pd.DataFrame(result)
|
| 1633 |
|
| 1634 |
subject_body_map = {}
|
| 1635 |
|
| 1636 |
-
|
| 1637 |
-
|
| 1638 |
-
|
| 1639 |
-
|
| 1640 |
-
|
| 1641 |
-
|
| 1642 |
-
|
| 1643 |
-
|
| 1644 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1645 |
|
|
|
|
| 1646 |
output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 1647 |
df.to_excel(output_path, index=False, engine="openpyxl")
|
|
|
|
|
|
|
| 1648 |
print(df)
|
| 1649 |
|
| 1650 |
return output_path
|
| 1651 |
|
| 1652 |
except Exception as e:
|
| 1653 |
-
|
|
|
|
| 1654 |
return None
|
| 1655 |
-
|
| 1656 |
-
|
| 1657 |
-
|
| 1658 |
# Improved launch with debug mode enabled
|
| 1659 |
iface = gr.Interface(
|
| 1660 |
fn=identify_headers_and_save_excel,
|
|
|
|
| 1617 |
try:
|
| 1618 |
result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
|
| 1619 |
jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
|
|
|
|
| 1620 |
if not result:
|
| 1621 |
df = pd.DataFrame([{
|
| 1622 |
"text": None,
|
|
|
|
| 1627 |
"System Message": "No headers were identified by the LLM."
|
| 1628 |
}])
|
| 1629 |
else:
|
|
|
|
| 1630 |
df = pd.DataFrame(result)
|
| 1631 |
|
| 1632 |
subject_body_map = {}
|
| 1633 |
|
| 1634 |
+
# Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
|
| 1635 |
+
for pdf_level in jsons:
|
| 1636 |
+
if not isinstance(pdf_level, list):
|
| 1637 |
+
continue
|
| 1638 |
+
|
| 1639 |
+
for section_level in pdf_level:
|
| 1640 |
+
# If the LLM returns a list of dictionaries here
|
| 1641 |
+
if isinstance(section_level, list):
|
| 1642 |
+
for obj in section_level:
|
| 1643 |
+
if isinstance(obj, dict):
|
| 1644 |
+
subject = obj.get("Subject")
|
| 1645 |
+
body = obj.get("BodyText", [])
|
| 1646 |
+
if subject:
|
| 1647 |
+
# Join list bodies with spaces; any non-list body is converted with str()
|
| 1648 |
+
body_str = " ".join(body) if isinstance(body, list) else str(body)
|
| 1649 |
+
subject_body_map[subject.strip()] = body_str
|
| 1650 |
+
|
| 1651 |
+
# If the LLM returns a single dictionary here
|
| 1652 |
+
elif isinstance(section_level, dict):
|
| 1653 |
+
subject = section_level.get("Subject")
|
| 1654 |
+
body = section_level.get("BodyText", [])
|
| 1655 |
+
if subject:
|
| 1656 |
+
body_str = " ".join(body) if isinstance(body, list) else str(body)
|
| 1657 |
+
subject_body_map[subject.strip()] = body_str
|
| 1658 |
+
|
| 1659 |
+
# Look up each row's "text" value in subject_body_map and store the match in a new "body" column
|
| 1660 |
+
if "text" in df.columns:
|
| 1661 |
+
df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
|
| 1662 |
+
else:
|
| 1663 |
+
df["body"] = None
|
| 1664 |
|
| 1665 |
+
# Save to Excel
|
| 1666 |
output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 1667 |
df.to_excel(output_path, index=False, engine="openpyxl")
|
| 1668 |
+
|
| 1669 |
+
print("--- Processed DataFrame ---")
|
| 1670 |
print(df)
|
| 1671 |
|
| 1672 |
return output_path
|
| 1673 |
|
| 1674 |
except Exception as e:
|
| 1675 |
+
print(f"ERROR - Critical error in processing: {e}")
|
| 1676 |
+
# The error is only printed, not re-raised: the caller receives None instead of an output path
|
| 1677 |
return None
|
|
|
|
|
|
|
|
|
|
| 1678 |
# Improved launch with debug mode enabled
|
| 1679 |
iface = gr.Interface(
|
| 1680 |
fn=identify_headers_and_save_excel,
|