Marthee committed on
Commit
341b3ef
·
verified ·
1 Parent(s): c8a8b66

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -15
app.py CHANGED
@@ -1617,7 +1617,6 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
1617
  try:
1618
  result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
1619
  jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
1620
- print(jsons)
1621
  if not result:
1622
  df = pd.DataFrame([{
1623
  "text": None,
@@ -1628,33 +1627,54 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
1628
  "System Message": "No headers were identified by the LLM."
1629
  }])
1630
  else:
1631
- print('here')
1632
  df = pd.DataFrame(result)
1633
 
1634
  subject_body_map = {}
1635
 
1636
- for pdf_sections in jsons:
1637
- for obj in pdf_sections:
1638
- subject = obj.get("Subject")
1639
- body = obj.get("BodyText", [])
1640
-
1641
- if subject:
1642
- subject_body_map[subject.strip()] = " ".join(body)
1643
-
1644
- df["body"] = df["text"].map(subject_body_map)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1645
 
 
1646
  output_path = os.path.abspath("header_analysis_output.xlsx")
1647
  df.to_excel(output_path, index=False, engine="openpyxl")
 
 
1648
  print(df)
1649
 
1650
  return output_path
1651
 
1652
  except Exception as e:
1653
- logger.error(f"Critical error in processing: {str(e)}")
 
1654
  return None
1655
-
1656
-
1657
-
1658
  # Improved launch with debug mode enabled
1659
  iface = gr.Interface(
1660
  fn=identify_headers_and_save_excel,
 
1617
  try:
1618
  result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
1619
  jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
 
1620
  if not result:
1621
  df = pd.DataFrame([{
1622
  "text": None,
 
1627
  "System Message": "No headers were identified by the LLM."
1628
  }])
1629
  else:
 
1630
  df = pd.DataFrame(result)
1631
 
1632
  subject_body_map = {}
1633
 
1634
+ # Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
1635
+ for pdf_level in jsons:
1636
+ if not isinstance(pdf_level, list):
1637
+ continue
1638
+
1639
+ for section_level in pdf_level:
1640
+ # If the LLM returns a list of dictionaries here
1641
+ if isinstance(section_level, list):
1642
+ for obj in section_level:
1643
+ if isinstance(obj, dict):
1644
+ subject = obj.get("Subject")
1645
+ body = obj.get("BodyText", [])
1646
+ if subject:
1647
+ # Ensure body is a list before joining
1648
+ body_str = " ".join(body) if isinstance(body, list) else str(body)
1649
+ subject_body_map[subject.strip()] = body_str
1650
+
1651
+ # If the LLM returns a single dictionary here
1652
+ elif isinstance(section_level, dict):
1653
+ subject = section_level.get("Subject")
1654
+ body = section_level.get("BodyText", [])
1655
+ if subject:
1656
+ body_str = " ".join(body) if isinstance(body, list) else str(body)
1657
+ subject_body_map[subject.strip()] = body_str
1658
+
1659
+ # Map the extracted body text to the "text" column in your main DataFrame
1660
+ if "text" in df.columns:
1661
+ df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
1662
+ else:
1663
+ df["body"] = None
1664
 
1665
+ # Save to Excel
1666
  output_path = os.path.abspath("header_analysis_output.xlsx")
1667
  df.to_excel(output_path, index=False, engine="openpyxl")
1668
+
1669
+ print("--- Processed DataFrame ---")
1670
  print(df)
1671
 
1672
  return output_path
1673
 
1674
  except Exception as e:
1675
+ print(f"ERROR - Critical error in processing: {e}")
1676
+ # Re-raise or handle as needed
1677
  return None
 
 
 
1678
  # Improved launch with debug mode enabled
1679
  iface = gr.Interface(
1680
  fn=identify_headers_and_save_excel,