Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import json
|
|
| 4 |
import requests
|
| 5 |
from io import BytesIO
|
| 6 |
from datetime import datetime
|
|
|
|
| 7 |
import pandas as pd
|
| 8 |
from io import BytesIO
|
| 9 |
import fitz # PyMuPDF
|
|
@@ -492,6 +493,20 @@ def get_toc_page_numbers(doc, max_pages_to_check=15):
|
|
| 492 |
logger.info("No TOC pages found")
|
| 493 |
return [] # Return empty list if nothing found
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
|
| 496 |
def openPDF(pdf_path):
|
| 497 |
logger.info(f"Opening PDF from URL: {pdf_path}")
|
|
@@ -1838,7 +1853,7 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 1838 |
heading_to_search = heading_to_searchDict['text']
|
| 1839 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 1840 |
paths=heading_to_searchDict['path']
|
| 1841 |
-
xloc=heading_to_searchDict['x']
|
| 1842 |
yloc=heading_to_searchDict['y']
|
| 1843 |
|
| 1844 |
# Initialize variables
|
|
@@ -2285,14 +2300,14 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 2285 |
|
| 2286 |
# If they match or the subject is inside the first line, remove it
|
| 2287 |
if subject in first_line or first_line in subject:
|
| 2288 |
-
entry["BodyText"] = entry["BodyText"][1:]
|
| 2289 |
-
|
| 2290 |
-
#
|
| 2291 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 2292 |
-
logger.info(f"Markups done!")
|
| 2293 |
logger.info(f"Uploaded and Readyy!")
|
| 2294 |
-
|
| 2295 |
-
|
| 2296 |
return json_output,identified_headers
|
| 2297 |
|
| 2298 |
|
|
@@ -2310,12 +2325,77 @@ def build_subject_body_map(jsons):
|
|
| 2310 |
|
| 2311 |
return subject_body
|
| 2312 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2313 |
def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
| 2314 |
try:
|
| 2315 |
-
# result = identify_headers_with_openrouterNEWW(pdf_path, model
|
| 2316 |
-
print('beginnging identify')
|
| 2317 |
jsons,result = testFunction(pdf_path, model,LLM_prompt)
|
| 2318 |
-
print('
|
| 2319 |
if not result:
|
| 2320 |
df = pd.DataFrame([{
|
| 2321 |
"text": None,
|
|
@@ -2326,54 +2406,62 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
|
| 2326 |
"System Message": "No headers were identified by the LLM."
|
| 2327 |
}])
|
| 2328 |
else:
|
|
|
|
| 2329 |
df = pd.DataFrame(result)
|
| 2330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2331 |
subject_body_map = {}
|
| 2332 |
|
| 2333 |
-
|
| 2334 |
-
|
| 2335 |
-
|
| 2336 |
-
continue
|
| 2337 |
-
|
| 2338 |
-
for section_level in pdf_level:
|
| 2339 |
-
# If the LLM returns a list of dictionaries here
|
| 2340 |
-
if isinstance(section_level, list):
|
| 2341 |
-
for obj in section_level:
|
| 2342 |
-
if isinstance(obj, dict):
|
| 2343 |
-
subject = obj.get("Subject")
|
| 2344 |
-
body = obj.get("BodyText", [])
|
| 2345 |
-
if subject:
|
| 2346 |
-
# Ensure body is a list before joining
|
| 2347 |
-
body_str = " ".join(body) if isinstance(body, list) else str(body)
|
| 2348 |
-
subject_body_map[subject.strip()] = body_str
|
| 2349 |
-
|
| 2350 |
-
# If the LLM returns a single dictionary here
|
| 2351 |
-
elif isinstance(section_level, dict):
|
| 2352 |
-
subject = section_level.get("Subject")
|
| 2353 |
-
body = section_level.get("BodyText", [])
|
| 2354 |
-
if subject:
|
| 2355 |
-
body_str = " ".join(body) if isinstance(body, list) else str(body)
|
| 2356 |
-
subject_body_map[subject.strip()] = body_str
|
| 2357 |
-
|
| 2358 |
-
# Map the extracted body text to the "text" column in your main DataFrame
|
| 2359 |
-
if "text" in df.columns:
|
| 2360 |
-
df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
|
| 2361 |
-
else:
|
| 2362 |
-
df["body"] = None
|
| 2363 |
|
| 2364 |
-
|
| 2365 |
-
|
| 2366 |
-
|
| 2367 |
-
|
| 2368 |
-
|
| 2369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2370 |
|
| 2371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2372 |
|
| 2373 |
except Exception as e:
|
| 2374 |
-
|
| 2375 |
-
# Re-raise or handle as needed
|
| 2376 |
return None
|
|
|
|
| 2377 |
# Improved launch with debug mode enabled
|
| 2378 |
iface = gr.Interface(
|
| 2379 |
fn=identify_headers_and_save_excel,
|
|
|
|
| 4 |
import requests
|
| 5 |
from io import BytesIO
|
| 6 |
from datetime import datetime
|
| 7 |
+
from difflib import SequenceMatcher
|
| 8 |
import pandas as pd
|
| 9 |
from io import BytesIO
|
| 10 |
import fitz # PyMuPDF
|
|
|
|
| 493 |
logger.info("No TOC pages found")
|
| 494 |
return [] # Return empty list if nothing found
|
| 495 |
|
| 496 |
+
def is_header(span, most_common_font_size, most_common_color, most_common_font, allheadersLLM):
    """Return whether a text span looks like a heading rather than body text.

    A span counts as a header when its font size is strictly larger than the
    document's most common font size, or when its font name differs
    (case-insensitively) from the document's most common font.

    Args:
        span: Mapping describing one text span; must provide ``"size"`` and
            ``"font"``. Presumably a PyMuPDF span dict -- confirm against
            the caller.
        most_common_font_size: Font size used by the bulk of the document.
        most_common_color: Accepted for interface compatibility; not used by
            the decision.
        most_common_font: Font name used by the bulk of the document.
        allheadersLLM: Accepted for interface compatibility; see the note
            below.

    Returns:
        True if the span is larger than, or set in a different font from,
        the dominant body text; False otherwise.

    Raises:
        KeyError: If ``span`` lacks ``"size"``, or lacks ``"font"`` when the
            size test alone does not decide the result.
    """
    # NOTE(review): earlier revisions also computed an "is bold" flag and an
    # "is in allheadersLLM" flag. The bold flag was never read, and the LLM
    # flag was only combined with the same size test already performed first,
    # so neither could change the outcome. They were removed as dead code;
    # if LLM-identified headers are meant to influence the result, that rule
    # still has to be defined.
    if span["size"] > most_common_font_size:
        return True
    return span["font"].lower() != most_common_font.lower()
|
| 510 |
|
| 511 |
def openPDF(pdf_path):
|
| 512 |
logger.info(f"Opening PDF from URL: {pdf_path}")
|
|
|
|
| 1853 |
heading_to_search = heading_to_searchDict['text']
|
| 1854 |
heading_to_searchPageNum = heading_to_searchDict['page']
|
| 1855 |
paths=heading_to_searchDict['path']
|
| 1856 |
+
# xloc=heading_to_searchDict['x']
|
| 1857 |
yloc=heading_to_searchDict['y']
|
| 1858 |
|
| 1859 |
# Initialize variables
|
|
|
|
| 2300 |
|
| 2301 |
# If they match or the subject is inside the first line, remove it
|
| 2302 |
if subject in first_line or first_line in subject:
|
| 2303 |
+
entry["BodyText"] = entry["BodyText"][1:]
|
| 2304 |
+
print('data_list_JSON',data_list_JSON)
|
| 2305 |
+
# json_output.append(data_list_JSON)
|
| 2306 |
json_output = json.dumps(data_list_JSON, indent=4)
|
| 2307 |
+
logger.info(f"Markups done! Uploading to dropbox")
|
| 2308 |
logger.info(f"Uploaded and Readyy!")
|
| 2309 |
+
|
| 2310 |
+
|
| 2311 |
return json_output,identified_headers
|
| 2312 |
|
| 2313 |
|
|
|
|
| 2325 |
|
| 2326 |
return subject_body
|
| 2327 |
|
| 2328 |
+
# def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
| 2329 |
+
# try:
|
| 2330 |
+
# # result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
|
| 2331 |
+
# print('beginnging identify')
|
| 2332 |
+
# jsons,result = testFunction(pdf_path, model,LLM_prompt)
|
| 2333 |
+
# print('done , will start dataframe',jsons,result)
|
| 2334 |
+
# if not result:
|
| 2335 |
+
# df = pd.DataFrame([{
|
| 2336 |
+
# "text": None,
|
| 2337 |
+
# "page": None,
|
| 2338 |
+
# "suggested_level": None,
|
| 2339 |
+
# "confidence": None,
|
| 2340 |
+
# "body": None,
|
| 2341 |
+
# "System Message": "No headers were identified by the LLM."
|
| 2342 |
+
# }])
|
| 2343 |
+
# else:
|
| 2344 |
+
# df = pd.DataFrame(result)
|
| 2345 |
+
|
| 2346 |
+
# subject_body_map = {}
|
| 2347 |
+
|
| 2348 |
+
# # Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
|
| 2349 |
+
# for pdf_level in jsons:
|
| 2350 |
+
# if not isinstance(pdf_level, list):
|
| 2351 |
+
# continue
|
| 2352 |
+
|
| 2353 |
+
# for section_level in pdf_level:
|
| 2354 |
+
# # If the LLM returns a list of dictionaries here
|
| 2355 |
+
# if isinstance(section_level, list):
|
| 2356 |
+
# for obj in section_level:
|
| 2357 |
+
# if isinstance(obj, dict):
|
| 2358 |
+
# subject = obj.get("Subject")
|
| 2359 |
+
# body = obj.get("BodyText", [])
|
| 2360 |
+
# if subject:
|
| 2361 |
+
# # Ensure body is a list before joining
|
| 2362 |
+
# body_str = " ".join(body) if isinstance(body, list) else str(body)
|
| 2363 |
+
# subject_body_map[subject.strip()] = body_str
|
| 2364 |
+
|
| 2365 |
+
# # If the LLM returns a single dictionary here
|
| 2366 |
+
# elif isinstance(section_level, dict):
|
| 2367 |
+
# subject = section_level.get("Subject")
|
| 2368 |
+
# body = section_level.get("BodyText", [])
|
| 2369 |
+
# if subject:
|
| 2370 |
+
# body_str = " ".join(body) if isinstance(body, list) else str(body)
|
| 2371 |
+
# subject_body_map[subject.strip()] = body_str
|
| 2372 |
+
|
| 2373 |
+
# # Map the extracted body text to the "text" column in your main DataFrame
|
| 2374 |
+
# if "text" in df.columns:
|
| 2375 |
+
# df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
|
| 2376 |
+
# else:
|
| 2377 |
+
# df["body"] = None
|
| 2378 |
+
|
| 2379 |
+
# # Save to Excel
|
| 2380 |
+
# output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 2381 |
+
# df.to_excel(output_path, index=False, engine="openpyxl")
|
| 2382 |
+
|
| 2383 |
+
# print("--- Processed DataFrame ---")
|
| 2384 |
+
# print(df)
|
| 2385 |
+
|
| 2386 |
+
# return output_path
|
| 2387 |
+
|
| 2388 |
+
# except Exception as e:
|
| 2389 |
+
# print(f"ERROR - Critical error in processing: {e}")
|
| 2390 |
+
# # Re-raise or handle as needed
|
| 2391 |
+
# return None
|
| 2392 |
+
|
| 2393 |
+
|
| 2394 |
def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
|
| 2395 |
try:
|
| 2396 |
+
# result = identify_headers_with_openrouterNEWW(pdf_path, model)
|
|
|
|
| 2397 |
jsons,result = testFunction(pdf_path, model,LLM_prompt)
|
| 2398 |
+
print('jsonssss',jsons)
|
| 2399 |
if not result:
|
| 2400 |
df = pd.DataFrame([{
|
| 2401 |
"text": None,
|
|
|
|
| 2406 |
"System Message": "No headers were identified by the LLM."
|
| 2407 |
}])
|
| 2408 |
else:
|
| 2409 |
+
print('here')
|
| 2410 |
df = pd.DataFrame(result)
|
| 2411 |
|
| 2412 |
+
# subject_body_map = {}
|
| 2413 |
+
|
| 2414 |
+
# for pdf_sections in jsons:
|
| 2415 |
+
# for obj in pdf_sections:
|
| 2416 |
+
# subject = obj.get("Subject")
|
| 2417 |
+
# body = obj.get("BodyText", [])
|
| 2418 |
+
|
| 2419 |
+
# if subject:
|
| 2420 |
+
# subject_body_map[subject.strip()] = " ".join(body)
|
| 2421 |
+
|
| 2422 |
+
# df["body"] = df["text"].map(subject_body_map)
|
| 2423 |
+
|
| 2424 |
subject_body_map = {}
|
| 2425 |
|
| 2426 |
+
def process_obj(obj):
|
| 2427 |
+
if not isinstance(obj, dict):
|
| 2428 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2429 |
|
| 2430 |
+
subject = obj.get("Subject")
|
| 2431 |
+
body = obj.get("BodyText", [])
|
| 2432 |
+
|
| 2433 |
+
if subject:
|
| 2434 |
+
if isinstance(body, list):
|
| 2435 |
+
body_text = " ".join(body)
|
| 2436 |
+
else:
|
| 2437 |
+
body_text = str(body)
|
| 2438 |
+
|
| 2439 |
+
subject_body_map[subject.strip()] = body_text
|
| 2440 |
+
|
| 2441 |
+
|
| 2442 |
+
for item in jsons:
|
| 2443 |
|
| 2444 |
+
# Case: flat list of dicts (like your example)
|
| 2445 |
+
if isinstance(item, dict):
|
| 2446 |
+
process_obj(item)
|
| 2447 |
+
|
| 2448 |
+
# Case: nested list of dicts
|
| 2449 |
+
elif isinstance(item, list):
|
| 2450 |
+
for obj in item:
|
| 2451 |
+
process_obj(obj)
|
| 2452 |
+
|
| 2453 |
+
output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 2454 |
+
df.to_excel(output_path, index=False, engine="openpyxl")
|
| 2455 |
+
|
| 2456 |
+
print("--- Processed DataFrame ---")
|
| 2457 |
+
print(df)
|
| 2458 |
+
|
| 2459 |
+
return output_path
|
| 2460 |
|
| 2461 |
except Exception as e:
|
| 2462 |
+
logger.error(f"Critical error in processing: {str(e)}")
|
|
|
|
| 2463 |
return None
|
| 2464 |
+
|
| 2465 |
# Improved launch with debug mode enabled
|
| 2466 |
iface = gr.Interface(
|
| 2467 |
fn=identify_headers_and_save_excel,
|