Marthee committed on
Commit
1a4314b
·
verified ·
1 Parent(s): fe4d2e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -46
app.py CHANGED
@@ -2368,9 +2368,8 @@ def build_subject_body_map(jsons):
2368
 
2369
  def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2370
  try:
2371
- # result = identify_headers_with_openrouterNEWW(pdf_path, model)
2372
- jsons,result = testFunction(pdf_path, model,LLM_prompt)
2373
- print('jsonssss',jsons)
2374
  if not result:
2375
  df = pd.DataFrame([{
2376
  "text": None,
@@ -2380,66 +2379,46 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2380
  "body": None,
2381
  "System Message": "No headers were identified by the LLM."
2382
  }])
 
2383
  else:
2384
  print('here')
2385
  df = pd.DataFrame(result)
2386
 
2387
- # subject_body_map = {}
 
 
2388
 
2389
- # for pdf_sections in jsons:
2390
- # for obj in pdf_sections:
2391
- # subject = obj.get("Subject")
2392
- # body = obj.get("BodyText", [])
2393
 
2394
- # if subject:
2395
- # subject_body_map[subject.strip()] = " ".join(body)
2396
 
2397
- # df["body"] = df["text"].map(subject_body_map)
2398
- subject_body_map = {}
2399
-
2400
- def process_obj(obj):
2401
  if not isinstance(obj, dict):
2402
- return
2403
-
2404
  subject = obj.get("Subject")
2405
  body = obj.get("BodyText", [])
2406
-
2407
  if subject:
2408
- if isinstance(body, list):
2409
- body_text = " ".join(body)
2410
- else:
2411
- body_text = str(body)
2412
-
2413
- subject_body_map[subject.strip()] = body_text
2414
-
2415
-
2416
- for item in jsons:
2417
-
2418
- # Case: flat list of dicts
2419
- if isinstance(item, dict):
2420
- process_obj(item)
2421
-
2422
- # Case: nested list
2423
- elif isinstance(item, list):
2424
- for obj in item:
2425
- process_obj(obj)
2426
-
2427
-
2428
  df["body"] = df["text"].map(subject_body_map).fillna("")
2429
-
2430
- output_path = os.path.abspath("header_analysis_output.xlsx")
2431
- df.to_excel(output_path, index=False, engine="openpyxl")
2432
-
2433
- print("--- Processed DataFrame ---")
2434
- print(df)
2435
-
2436
- return output_path
2437
 
 
 
 
 
 
 
 
 
2438
 
2439
  except Exception as e:
2440
  logger.error(f"Critical error in processing: {str(e)}")
2441
  return None
2442
-
 
2443
  # Improved launch with debug mode enabled
2444
  iface = gr.Interface(
2445
  fn=identify_headers_and_save_excel,
 
2368
 
2369
  def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2370
  try:
2371
+ jsons, result = testFunction(pdf_path, model,LLM_prompt)
2372
+
 
2373
  if not result:
2374
  df = pd.DataFrame([{
2375
  "text": None,
 
2379
  "body": None,
2380
  "System Message": "No headers were identified by the LLM."
2381
  }])
2382
+
2383
  else:
2384
  print('here')
2385
  df = pd.DataFrame(result)
2386
 
2387
+ # Convert JSON string to list if needed
2388
+ if isinstance(jsons, str):
2389
+ jsons = json.loads(jsons)
2390
 
2391
+ subject_body_map = {}
 
 
 
2392
 
2393
+ # ✅ jsons is a flat list of dicts
2394
+ for obj in jsons:
2395
 
 
 
 
 
2396
  if not isinstance(obj, dict):
2397
+ continue
2398
+
2399
  subject = obj.get("Subject")
2400
  body = obj.get("BodyText", [])
2401
+
2402
  if subject:
2403
+ subject_body_map[subject.strip()] = " ".join(body)
2404
+
2405
+ # ✅ Map body to dataframe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2406
  df["body"] = df["text"].map(subject_body_map).fillna("")
 
 
 
 
 
 
 
 
2407
 
2408
+ # ✅ Save once at end
2409
+ output_path = os.path.abspath("header_analysis_output.xlsx")
2410
+ df.to_excel(output_path, index=False, engine="openpyxl")
2411
+
2412
+ print("--- Processed DataFrame ---")
2413
+ print(df)
2414
+
2415
+ return output_path
2416
 
2417
  except Exception as e:
2418
  logger.error(f"Critical error in processing: {str(e)}")
2419
  return None
2420
+
2421
+
2422
  # Improved launch with debug mode enabled
2423
  iface = gr.Interface(
2424
  fn=identify_headers_and_save_excel,