Spaces:

findConsole
/

PromptTesting

Running

App Files Community

Marthee committed 15 days ago

Commit

341b3ef

verified ·

1 Parent(s): c8a8b66

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -15

app.py CHANGED Viewed

@@ -1617,7 +1617,6 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
     try:
         result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
         jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
-        print(jsons)
         if not result:
             df = pd.DataFrame([{
                 "text": None,
@@ -1628,33 +1627,54 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
                 "System Message": "No headers were identified by the LLM."
             }])
         else:
-            print('here')
             df = pd.DataFrame(result)
             subject_body_map = {}
-            for pdf_sections in jsons:
-                for obj in pdf_sections:
-                    subject = obj.get("Subject")
-                    body = obj.get("BodyText", [])
-                    if subject:
-                        subject_body_map[subject.strip()] = " ".join(body)
-            df["body"] = df["text"].map(subject_body_map)
         output_path = os.path.abspath("header_analysis_output.xlsx")
         df.to_excel(output_path, index=False, engine="openpyxl")
         print(df)
         return output_path
     except Exception as e:
-        logger.error(f"Critical error in processing: {str(e)}")
         return None
 # Improved launch with debug mode enabled
 iface = gr.Interface(
     fn=identify_headers_and_save_excel,

     try:
         result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
         jsons = extract_section_under_header_tobebilledMultiplePDFS(pdf_path, model,result)
         if not result:
             df = pd.DataFrame([{
                 "text": None,
                 "System Message": "No headers were identified by the LLM."
             }])
         else:
             df = pd.DataFrame(result)
             subject_body_map = {}
+            # Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
+            for pdf_level in jsons:
+                if not isinstance(pdf_level, list):
+                    continue
+                for section_level in pdf_level:
+                    # If the LLM returns a list of dictionaries here
+                    if isinstance(section_level, list):
+                        for obj in section_level:
+                            if isinstance(obj, dict):
+                                subject = obj.get("Subject")
+                                body = obj.get("BodyText", [])
+                                if subject:
+                                    # Ensure body is a list before joining
+                                    body_str = " ".join(body) if isinstance(body, list) else str(body)
+                                    subject_body_map[subject.strip()] = body_str
+                    # If the LLM returns a single dictionary here
+                    elif isinstance(section_level, dict):
+                        subject = section_level.get("Subject")
+                        body = section_level.get("BodyText", [])
+                        if subject:
+                            body_str = " ".join(body) if isinstance(body, list) else str(body)
+                            subject_body_map[subject.strip()] = body_str
+            # Map the extracted body text to the "text" column in your main DataFrame
+            if "text" in df.columns:
+                df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
+            else:
+                df["body"] = None
+        # Save to Excel
         output_path = os.path.abspath("header_analysis_output.xlsx")
         df.to_excel(output_path, index=False, engine="openpyxl")
+        print("--- Processed DataFrame ---")
         print(df)
         return output_path
     except Exception as e:
+        print(f"ERROR - Critical error in processing: {e}")
+        # Re-raise or handle as needed
         return None
 # Improved launch with debug mode enabled
 iface = gr.Interface(
     fn=identify_headers_and_save_excel,