Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2395,43 +2395,46 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
|
| 2395 |
# subject_body_map[subject.strip()] = " ".join(body)
|
| 2396 |
|
| 2397 |
# df["body"] = df["text"].map(subject_body_map)
|
| 2398 |
-
|
| 2399 |
subject_body_map = {}
|
| 2400 |
-
|
| 2401 |
def process_obj(obj):
|
| 2402 |
if not isinstance(obj, dict):
|
| 2403 |
return
|
| 2404 |
-
|
| 2405 |
subject = obj.get("Subject")
|
| 2406 |
body = obj.get("BodyText", [])
|
| 2407 |
-
|
| 2408 |
if subject:
|
| 2409 |
if isinstance(body, list):
|
| 2410 |
body_text = " ".join(body)
|
| 2411 |
else:
|
| 2412 |
body_text = str(body)
|
| 2413 |
-
|
| 2414 |
subject_body_map[subject.strip()] = body_text
|
| 2415 |
-
|
| 2416 |
-
|
| 2417 |
for item in jsons:
|
| 2418 |
-
|
| 2419 |
-
# Case: flat list of dicts
|
| 2420 |
if isinstance(item, dict):
|
| 2421 |
process_obj(item)
|
| 2422 |
-
|
| 2423 |
-
# Case: nested list
|
| 2424 |
elif isinstance(item, list):
|
| 2425 |
for obj in item:
|
| 2426 |
process_obj(obj)
|
| 2427 |
-
|
| 2428 |
-
output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 2429 |
-
df.to_excel(output_path, index=False, engine="openpyxl")
|
| 2430 |
-
|
| 2431 |
-
print("--- Processed DataFrame ---")
|
| 2432 |
-
print(df)
|
| 2433 |
|
| 2434 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2435 |
|
| 2436 |
except Exception as e:
|
| 2437 |
logger.error(f"Critical error in processing: {str(e)}")
|
|
|
|
| 2395 |
# subject_body_map[subject.strip()] = " ".join(body)
|
| 2396 |
|
| 2397 |
# df["body"] = df["text"].map(subject_body_map)
|
|
|
|
| 2398 |
subject_body_map = {}
|
| 2399 |
+
|
| 2400 |
def process_obj(obj):
|
| 2401 |
if not isinstance(obj, dict):
|
| 2402 |
return
|
| 2403 |
+
|
| 2404 |
subject = obj.get("Subject")
|
| 2405 |
body = obj.get("BodyText", [])
|
| 2406 |
+
|
| 2407 |
if subject:
|
| 2408 |
if isinstance(body, list):
|
| 2409 |
body_text = " ".join(body)
|
| 2410 |
else:
|
| 2411 |
body_text = str(body)
|
| 2412 |
+
|
| 2413 |
subject_body_map[subject.strip()] = body_text
|
| 2414 |
+
|
| 2415 |
+
|
| 2416 |
for item in jsons:
|
| 2417 |
+
|
| 2418 |
+
# Case: flat list of dicts
|
| 2419 |
if isinstance(item, dict):
|
| 2420 |
process_obj(item)
|
| 2421 |
+
|
| 2422 |
+
# Case: nested list
|
| 2423 |
elif isinstance(item, list):
|
| 2424 |
for obj in item:
|
| 2425 |
process_obj(obj)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2426 |
|
| 2427 |
+
|
| 2428 |
+
df["body"] = df["text"].map(subject_body_map).fillna("")
|
| 2429 |
+
|
| 2430 |
+
output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 2431 |
+
df.to_excel(output_path, index=False, engine="openpyxl")
|
| 2432 |
+
|
| 2433 |
+
print("--- Processed DataFrame ---")
|
| 2434 |
+
print(df)
|
| 2435 |
+
|
| 2436 |
+
return output_path
|
| 2437 |
+
|
| 2438 |
|
| 2439 |
except Exception as e:
|
| 2440 |
logger.error(f"Critical error in processing: {str(e)}")
|