Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -2368,9 +2368,8 @@ def build_subject_body_map(jsons):
|
|
| 2368 |
|
| 2369 |
def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
| 2370 |
try:
|
| 2371 |
-
|
| 2372 |
-
|
| 2373 |
-
print('jsonssss',jsons)
|
| 2374 |
if not result:
|
| 2375 |
df = pd.DataFrame([{
|
| 2376 |
"text": None,
|
|
@@ -2380,66 +2379,46 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
|
| 2380 |
"body": None,
|
| 2381 |
"System Message": "No headers were identified by the LLM."
|
| 2382 |
}])
|
|
|
|
| 2383 |
else:
|
| 2384 |
print('here')
|
| 2385 |
df = pd.DataFrame(result)
|
| 2386 |
|
| 2387 |
-
#
|
|
|
|
|
|
|
| 2388 |
|
| 2389 |
-
|
| 2390 |
-
# for obj in pdf_sections:
|
| 2391 |
-
# subject = obj.get("Subject")
|
| 2392 |
-
# body = obj.get("BodyText", [])
|
| 2393 |
|
| 2394 |
-
#
|
| 2395 |
-
|
| 2396 |
|
| 2397 |
-
# df["body"] = df["text"].map(subject_body_map)
|
| 2398 |
-
subject_body_map = {}
|
| 2399 |
-
|
| 2400 |
-
def process_obj(obj):
|
| 2401 |
if not isinstance(obj, dict):
|
| 2402 |
-
|
| 2403 |
-
|
| 2404 |
subject = obj.get("Subject")
|
| 2405 |
body = obj.get("BodyText", [])
|
| 2406 |
-
|
| 2407 |
if subject:
|
| 2408 |
-
|
| 2409 |
-
|
| 2410 |
-
|
| 2411 |
-
body_text = str(body)
|
| 2412 |
-
|
| 2413 |
-
subject_body_map[subject.strip()] = body_text
|
| 2414 |
-
|
| 2415 |
-
|
| 2416 |
-
for item in jsons:
|
| 2417 |
-
|
| 2418 |
-
# Case: flat list of dicts
|
| 2419 |
-
if isinstance(item, dict):
|
| 2420 |
-
process_obj(item)
|
| 2421 |
-
|
| 2422 |
-
# Case: nested list
|
| 2423 |
-
elif isinstance(item, list):
|
| 2424 |
-
for obj in item:
|
| 2425 |
-
process_obj(obj)
|
| 2426 |
-
|
| 2427 |
-
|
| 2428 |
df["body"] = df["text"].map(subject_body_map).fillna("")
|
| 2429 |
-
|
| 2430 |
-
output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 2431 |
-
df.to_excel(output_path, index=False, engine="openpyxl")
|
| 2432 |
-
|
| 2433 |
-
print("--- Processed DataFrame ---")
|
| 2434 |
-
print(df)
|
| 2435 |
-
|
| 2436 |
-
return output_path
|
| 2437 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2438 |
|
| 2439 |
except Exception as e:
|
| 2440 |
logger.error(f"Critical error in processing: {str(e)}")
|
| 2441 |
return None
|
| 2442 |
-
|
|
|
|
| 2443 |
# Improved launch with debug mode enabled
|
| 2444 |
iface = gr.Interface(
|
| 2445 |
fn=identify_headers_and_save_excel,
|
|
|
|
| 2368 |
|
| 2369 |
def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
| 2370 |
try:
|
| 2371 |
+
jsons, result = testFunction(pdf_path, model,LLM_prompt)
|
| 2372 |
+
|
|
|
|
| 2373 |
if not result:
|
| 2374 |
df = pd.DataFrame([{
|
| 2375 |
"text": None,
|
|
|
|
| 2379 |
"body": None,
|
| 2380 |
"System Message": "No headers were identified by the LLM."
|
| 2381 |
}])
|
| 2382 |
+
|
| 2383 |
else:
|
| 2384 |
print('here')
|
| 2385 |
df = pd.DataFrame(result)
|
| 2386 |
|
| 2387 |
+
# Convert JSON string to list if needed
|
| 2388 |
+
if isinstance(jsons, str):
|
| 2389 |
+
jsons = json.loads(jsons)
|
| 2390 |
|
| 2391 |
+
subject_body_map = {}
|
|
|
|
|
|
|
|
|
|
| 2392 |
|
| 2393 |
+
# ✅ jsons is a flat list of dicts
|
| 2394 |
+
for obj in jsons:
|
| 2395 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2396 |
if not isinstance(obj, dict):
|
| 2397 |
+
continue
|
| 2398 |
+
|
| 2399 |
subject = obj.get("Subject")
|
| 2400 |
body = obj.get("BodyText", [])
|
| 2401 |
+
|
| 2402 |
if subject:
|
| 2403 |
+
subject_body_map[subject.strip()] = " ".join(body)
|
| 2404 |
+
|
| 2405 |
+
# ✅ Map body to dataframe
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2406 |
df["body"] = df["text"].map(subject_body_map).fillna("")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2407 |
|
| 2408 |
+
# ✅ Save once at end
|
| 2409 |
+
output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 2410 |
+
df.to_excel(output_path, index=False, engine="openpyxl")
|
| 2411 |
+
|
| 2412 |
+
print("--- Processed DataFrame ---")
|
| 2413 |
+
print(df)
|
| 2414 |
+
|
| 2415 |
+
return output_path
|
| 2416 |
|
| 2417 |
except Exception as e:
|
| 2418 |
logger.error(f"Critical error in processing: {str(e)}")
|
| 2419 |
return None
|
| 2420 |
+
|
| 2421 |
+
|
| 2422 |
# Improved launch with debug mode enabled
|
| 2423 |
iface = gr.Interface(
|
| 2424 |
fn=identify_headers_and_save_excel,
|