Marthee committed on
Commit
fe4d2e3
·
verified ·
1 Parent(s): d0e48f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -18
app.py CHANGED
@@ -2395,43 +2395,46 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2395
  # subject_body_map[subject.strip()] = " ".join(body)
2396
 
2397
  # df["body"] = df["text"].map(subject_body_map)
2398
-
2399
  subject_body_map = {}
2400
-
2401
  def process_obj(obj):
2402
  if not isinstance(obj, dict):
2403
  return
2404
-
2405
  subject = obj.get("Subject")
2406
  body = obj.get("BodyText", [])
2407
-
2408
  if subject:
2409
  if isinstance(body, list):
2410
  body_text = " ".join(body)
2411
  else:
2412
  body_text = str(body)
2413
-
2414
  subject_body_map[subject.strip()] = body_text
2415
-
2416
-
2417
  for item in jsons:
2418
-
2419
- # Case: flat list of dicts (like your example)
2420
  if isinstance(item, dict):
2421
  process_obj(item)
2422
-
2423
- # Case: nested list of dicts
2424
  elif isinstance(item, list):
2425
  for obj in item:
2426
  process_obj(obj)
2427
-
2428
- output_path = os.path.abspath("header_analysis_output.xlsx")
2429
- df.to_excel(output_path, index=False, engine="openpyxl")
2430
-
2431
- print("--- Processed DataFrame ---")
2432
- print(df)
2433
 
2434
- return output_path
 
 
 
 
 
 
 
 
 
 
2435
 
2436
  except Exception as e:
2437
  logger.error(f"Critical error in processing: {str(e)}")
 
2395
  # subject_body_map[subject.strip()] = " ".join(body)
2396
 
2397
  # df["body"] = df["text"].map(subject_body_map)
 
2398
  subject_body_map = {}
2399
+
2400
  def process_obj(obj):
2401
  if not isinstance(obj, dict):
2402
  return
2403
+
2404
  subject = obj.get("Subject")
2405
  body = obj.get("BodyText", [])
2406
+
2407
  if subject:
2408
  if isinstance(body, list):
2409
  body_text = " ".join(body)
2410
  else:
2411
  body_text = str(body)
2412
+
2413
  subject_body_map[subject.strip()] = body_text
2414
+
2415
+
2416
  for item in jsons:
2417
+
2418
+ # Case: flat list of dicts
2419
  if isinstance(item, dict):
2420
  process_obj(item)
2421
+
2422
+ # Case: nested list
2423
  elif isinstance(item, list):
2424
  for obj in item:
2425
  process_obj(obj)
 
 
 
 
 
 
2426
 
2427
+
2428
+ df["body"] = df["text"].map(subject_body_map).fillna("")
2429
+
2430
+ output_path = os.path.abspath("header_analysis_output.xlsx")
2431
+ df.to_excel(output_path, index=False, engine="openpyxl")
2432
+
2433
+ print("--- Processed DataFrame ---")
2434
+ print(df)
2435
+
2436
+ return output_path
2437
+
2438
 
2439
  except Exception as e:
2440
  logger.error(f"Critical error in processing: {str(e)}")