Marthee committed on
Commit
7f5d965
·
verified ·
1 Parent(s): b13a7a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -49
app.py CHANGED
@@ -4,6 +4,7 @@ import json
4
  import requests
5
  from io import BytesIO
6
  from datetime import datetime
 
7
  import pandas as pd
8
  from io import BytesIO
9
  import fitz # PyMuPDF
@@ -492,6 +493,20 @@ def get_toc_page_numbers(doc, max_pages_to_check=15):
492
  logger.info("No TOC pages found")
493
  return [] # Return empty list if nothing found
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
  def openPDF(pdf_path):
497
  logger.info(f"Opening PDF from URL: {pdf_path}")
@@ -1838,7 +1853,7 @@ def testFunction(pdf_path, model,LLM_prompt):
1838
  heading_to_search = heading_to_searchDict['text']
1839
  heading_to_searchPageNum = heading_to_searchDict['page']
1840
  paths=heading_to_searchDict['path']
1841
- xloc=heading_to_searchDict['x']
1842
  yloc=heading_to_searchDict['y']
1843
 
1844
  # Initialize variables
@@ -2285,14 +2300,14 @@ def testFunction(pdf_path, model,LLM_prompt):
2285
 
2286
  # If they match or the subject is inside the first line, remove it
2287
  if subject in first_line or first_line in subject:
2288
- entry["BodyText"] = entry["BodyText"][1:]
2289
-
2290
- # jsons.append(data_list_JSON)
2291
  json_output = json.dumps(data_list_JSON, indent=4)
2292
- logger.info(f"Markups done!")
2293
  logger.info(f"Uploaded and Readyy!")
2294
-
2295
-
2296
  return json_output,identified_headers
2297
 
2298
 
@@ -2310,12 +2325,77 @@ def build_subject_body_map(jsons):
2310
 
2311
  return subject_body
2312
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2313
  def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2314
  try:
2315
- # result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
2316
- print('beginnging identify')
2317
  jsons,result = testFunction(pdf_path, model,LLM_prompt)
2318
- print('done , will start dataframe',jsons,result)
2319
  if not result:
2320
  df = pd.DataFrame([{
2321
  "text": None,
@@ -2326,54 +2406,62 @@ def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2326
  "System Message": "No headers were identified by the LLM."
2327
  }])
2328
  else:
 
2329
  df = pd.DataFrame(result)
2330
 
 
 
 
 
 
 
 
 
 
 
 
 
2331
  subject_body_map = {}
2332
 
2333
- # Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
2334
- for pdf_level in jsons:
2335
- if not isinstance(pdf_level, list):
2336
- continue
2337
-
2338
- for section_level in pdf_level:
2339
- # If the LLM returns a list of dictionaries here
2340
- if isinstance(section_level, list):
2341
- for obj in section_level:
2342
- if isinstance(obj, dict):
2343
- subject = obj.get("Subject")
2344
- body = obj.get("BodyText", [])
2345
- if subject:
2346
- # Ensure body is a list before joining
2347
- body_str = " ".join(body) if isinstance(body, list) else str(body)
2348
- subject_body_map[subject.strip()] = body_str
2349
-
2350
- # If the LLM returns a single dictionary here
2351
- elif isinstance(section_level, dict):
2352
- subject = section_level.get("Subject")
2353
- body = section_level.get("BodyText", [])
2354
- if subject:
2355
- body_str = " ".join(body) if isinstance(body, list) else str(body)
2356
- subject_body_map[subject.strip()] = body_str
2357
-
2358
- # Map the extracted body text to the "text" column in your main DataFrame
2359
- if "text" in df.columns:
2360
- df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
2361
- else:
2362
- df["body"] = None
2363
 
2364
- # Save to Excel
2365
- output_path = os.path.abspath("header_analysis_output.xlsx")
2366
- df.to_excel(output_path, index=False, engine="openpyxl")
2367
-
2368
- print("--- Processed DataFrame ---")
2369
- print(df)
 
 
 
 
 
 
 
2370
 
2371
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2372
 
2373
  except Exception as e:
2374
- print(f"ERROR - Critical error in processing: {e}")
2375
- # Re-raise or handle as needed
2376
  return None
 
2377
  # Improved launch with debug mode enabled
2378
  iface = gr.Interface(
2379
  fn=identify_headers_and_save_excel,
 
4
  import requests
5
  from io import BytesIO
6
  from datetime import datetime
7
+ from difflib import SequenceMatcher
8
  import pandas as pd
9
  from io import BytesIO
10
  import fitz # PyMuPDF
 
493
  logger.info("No TOC pages found")
494
  return [] # Return empty list if nothing found
495
 
496
def is_header(span, most_common_font_size, most_common_color, most_common_font, allheadersLLM) -> bool:
    """Return True when a text span looks like a header rather than body text.

    A span is treated as a header when it is larger than the dominant font
    size, or when its font name differs (case-insensitively) from the
    dominant font.

    Args:
        span: Mapping describing one text span. Must contain a numeric
            ``"size"``; ``"font"`` is read when present and treated as an
            empty name when absent.
        most_common_font_size: Dominant font size to compare against.
        most_common_color: Accepted for call compatibility; not consulted.
        most_common_font: Dominant font name to compare against.
        allheadersLLM: Accepted for call compatibility; not consulted
            (see the review note in the body).

    Returns:
        True if the span is larger than, or set in a different font from,
        the dominant body style; False otherwise.
    """
    # NOTE(review): membership of span["text"] in allheadersLLM is not part of
    # the decision. It was only ever combined with the size test below, which
    # already settles the outcome on its own, so it could not change the
    # result. TODO confirm whether LLM-identified headers set at body size
    # should also count as headers.
    if span["size"] > most_common_font_size:
        return True

    # Read the font name defensively so a span without a "font" entry is
    # compared as an empty name instead of raising KeyError.
    font_name = span.get("font", "").lower()
    return font_name != most_common_font.lower()
510
 
511
  def openPDF(pdf_path):
512
  logger.info(f"Opening PDF from URL: {pdf_path}")
 
1853
  heading_to_search = heading_to_searchDict['text']
1854
  heading_to_searchPageNum = heading_to_searchDict['page']
1855
  paths=heading_to_searchDict['path']
1856
+ # xloc=heading_to_searchDict['x']
1857
  yloc=heading_to_searchDict['y']
1858
 
1859
  # Initialize variables
 
2300
 
2301
  # If they match or the subject is inside the first line, remove it
2302
  if subject in first_line or first_line in subject:
2303
+ entry["BodyText"] = entry["BodyText"][1:]
2304
+ print('data_list_JSON',data_list_JSON)
2305
+ # json_output.append(data_list_JSON)
2306
  json_output = json.dumps(data_list_JSON, indent=4)
2307
+ logger.info(f"Markups done! Uploading to dropbox")
2308
  logger.info(f"Uploaded and Readyy!")
2309
+
2310
+
2311
  return json_output,identified_headers
2312
 
2313
 
 
2325
 
2326
  return subject_body
2327
 
2328
+ # def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2329
+ # try:
2330
+ # # result = identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt)
2331
+ # print('beginnging identify')
2332
+ # jsons,result = testFunction(pdf_path, model,LLM_prompt)
2333
+ # print('done , will start dataframe',jsons,result)
2334
+ # if not result:
2335
+ # df = pd.DataFrame([{
2336
+ # "text": None,
2337
+ # "page": None,
2338
+ # "suggested_level": None,
2339
+ # "confidence": None,
2340
+ # "body": None,
2341
+ # "System Message": "No headers were identified by the LLM."
2342
+ # }])
2343
+ # else:
2344
+ # df = pd.DataFrame(result)
2345
+
2346
+ # subject_body_map = {}
2347
+
2348
+ # # Safely navigate the nested structure: [ [ [ {dict}, {dict} ] ] ]
2349
+ # for pdf_level in jsons:
2350
+ # if not isinstance(pdf_level, list):
2351
+ # continue
2352
+
2353
+ # for section_level in pdf_level:
2354
+ # # If the LLM returns a list of dictionaries here
2355
+ # if isinstance(section_level, list):
2356
+ # for obj in section_level:
2357
+ # if isinstance(obj, dict):
2358
+ # subject = obj.get("Subject")
2359
+ # body = obj.get("BodyText", [])
2360
+ # if subject:
2361
+ # # Ensure body is a list before joining
2362
+ # body_str = " ".join(body) if isinstance(body, list) else str(body)
2363
+ # subject_body_map[subject.strip()] = body_str
2364
+
2365
+ # # If the LLM returns a single dictionary here
2366
+ # elif isinstance(section_level, dict):
2367
+ # subject = section_level.get("Subject")
2368
+ # body = section_level.get("BodyText", [])
2369
+ # if subject:
2370
+ # body_str = " ".join(body) if isinstance(body, list) else str(body)
2371
+ # subject_body_map[subject.strip()] = body_str
2372
+
2373
+ # # Map the extracted body text to the "text" column in your main DataFrame
2374
+ # if "text" in df.columns:
2375
+ # df["body"] = df["text"].map(lambda x: subject_body_map.get(str(x).strip()) if x else None)
2376
+ # else:
2377
+ # df["body"] = None
2378
+
2379
+ # # Save to Excel
2380
+ # output_path = os.path.abspath("header_analysis_output.xlsx")
2381
+ # df.to_excel(output_path, index=False, engine="openpyxl")
2382
+
2383
+ # print("--- Processed DataFrame ---")
2384
+ # print(df)
2385
+
2386
+ # return output_path
2387
+
2388
+ # except Exception as e:
2389
+ # print(f"ERROR - Critical error in processing: {e}")
2390
+ # # Re-raise or handle as needed
2391
+ # return None
2392
+
2393
+
2394
  def identify_headers_and_save_excel(pdf_path, model,LLM_prompt):
2395
  try:
2396
+ # result = identify_headers_with_openrouterNEWW(pdf_path, model)
 
2397
  jsons,result = testFunction(pdf_path, model,LLM_prompt)
2398
+ print('jsonssss',jsons)
2399
  if not result:
2400
  df = pd.DataFrame([{
2401
  "text": None,
 
2406
  "System Message": "No headers were identified by the LLM."
2407
  }])
2408
  else:
2409
+ print('here')
2410
  df = pd.DataFrame(result)
2411
 
2412
+ # subject_body_map = {}
2413
+
2414
+ # for pdf_sections in jsons:
2415
+ # for obj in pdf_sections:
2416
+ # subject = obj.get("Subject")
2417
+ # body = obj.get("BodyText", [])
2418
+
2419
+ # if subject:
2420
+ # subject_body_map[subject.strip()] = " ".join(body)
2421
+
2422
+ # df["body"] = df["text"].map(subject_body_map)
2423
+
2424
  subject_body_map = {}
2425
 
2426
+ def process_obj(obj):
2427
+ if not isinstance(obj, dict):
2428
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2429
 
2430
+ subject = obj.get("Subject")
2431
+ body = obj.get("BodyText", [])
2432
+
2433
+ if subject:
2434
+ if isinstance(body, list):
2435
+ body_text = " ".join(body)
2436
+ else:
2437
+ body_text = str(body)
2438
+
2439
+ subject_body_map[subject.strip()] = body_text
2440
+
2441
+
2442
+ for item in jsons:
2443
 
2444
+ # Case: flat list of dicts (like your example)
2445
+ if isinstance(item, dict):
2446
+ process_obj(item)
2447
+
2448
+ # Case: nested list of dicts
2449
+ elif isinstance(item, list):
2450
+ for obj in item:
2451
+ process_obj(obj)
2452
+
2453
+ output_path = os.path.abspath("header_analysis_output.xlsx")
2454
+ df.to_excel(output_path, index=False, engine="openpyxl")
2455
+
2456
+ print("--- Processed DataFrame ---")
2457
+ print(df)
2458
+
2459
+ return output_path
2460
 
2461
  except Exception as e:
2462
+ logger.error(f"Critical error in processing: {str(e)}")
 
2463
  return None
2464
+
2465
  # Improved launch with debug mode enabled
2466
  iface = gr.Interface(
2467
  fn=identify_headers_and_save_excel,