Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -484,325 +484,6 @@ def openPDF(pdf_path):
|
|
| 484 |
logger.info(f"PDF opened successfully, {len(doc)} pages")
|
| 485 |
return doc
|
| 486 |
|
| 487 |
-
# def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
|
| 488 |
-
# """Ask an LLM (OpenRouter) to identify headers in the document.
|
| 489 |
-
# Returns a list of dicts: {text, page, suggested_level, confidence}.
|
| 490 |
-
# The function sends plain page-line strings to the LLM (including page numbers)
|
| 491 |
-
# and asks for a JSON array containing only header lines with suggested levels.
|
| 492 |
-
# """
|
| 493 |
-
# logger.info("=" * 80)
|
| 494 |
-
# logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
|
| 495 |
-
# logger.info(f"PDF Path: {pdf_path}")
|
| 496 |
-
# logger.info(f"Model: {model}")
|
| 497 |
-
# logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
|
| 498 |
-
|
| 499 |
-
# doc = openPDF(pdf_path)
|
| 500 |
-
# api_key = '<REDACTED_OPENROUTER_API_KEY>'
|
| 501 |
-
# if api_key is None:
|
| 502 |
-
# api_key = os.getenv("OPENROUTER_API_KEY") or None
|
| 503 |
-
# model = str(model)
|
| 504 |
-
# # toc_pages = get_toc_page_numbers(doc)
|
| 505 |
-
# lines_for_prompt = []
|
| 506 |
-
# pgestoRun=20
|
| 507 |
-
# # logger.info(f"TOC pages to skip: {toc_pages}")
|
| 508 |
-
# logger.info(f"Total pages in document: {pgestoRun}")
|
| 509 |
-
|
| 510 |
-
# # Collect text lines from pages (skip TOC pages)
|
| 511 |
-
# total_lines = 0
|
| 512 |
-
# for pno in range(len(doc)):
|
| 513 |
-
# # if pages_to_check and pno not in pages_to_check:
|
| 514 |
-
# # continue
|
| 515 |
-
# # if pno in toc_pages:
|
| 516 |
-
# # logger.debug(f"Skipping TOC page {pno}")
|
| 517 |
-
# # continue
|
| 518 |
-
# page = doc.load_page(pno)
|
| 519 |
-
# page_height = page.rect.height
|
| 520 |
-
|
| 521 |
-
# text_dict = page.get_text("dict")
|
| 522 |
-
# lines_for_prompt = []
|
| 523 |
-
# lines_on_page = 0
|
| 524 |
-
|
| 525 |
-
# for block in text_dict.get("blocks", []):
|
| 526 |
-
# if block.get("type") != 0: # text blocks only
|
| 527 |
-
# continue
|
| 528 |
-
|
| 529 |
-
# for line in block.get("lines", []):
|
| 530 |
-
# spans = line.get("spans", [])
|
| 531 |
-
# if not spans:
|
| 532 |
-
# continue
|
| 533 |
-
|
| 534 |
-
# # Use first span to check vertical position
|
| 535 |
-
# y0 = spans[0]["bbox"][1]
|
| 536 |
-
# y1 = spans[0]['bbox'][3]
|
| 537 |
-
# # if y0 < top_margin or y1 > (page_height - bottom_margin):
|
| 538 |
-
# # continue
|
| 539 |
-
# text = " ".join(s.get('text','') for s in spans).strip()
|
| 540 |
-
# if text:
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
# # prefix with page for easier mapping back
|
| 544 |
-
# lines_for_prompt.append(f"PAGE {pno+1}: {text}")
|
| 545 |
-
# lines_on_page += 1
|
| 546 |
-
|
| 547 |
-
# # if lines_on_page > 0:
|
| 548 |
-
|
| 549 |
-
# # page = doc.load_page(pno)
|
| 550 |
-
# # page_height = page.rect.height
|
| 551 |
-
# # lines_on_page = 0
|
| 552 |
-
# # text_dict = page.get_text("dict")
|
| 553 |
-
# # lines = []
|
| 554 |
-
# # y_tolerance = 0.2 # tweak if needed (1–3 usually works)
|
| 555 |
-
# # for block in page.get_text("dict").get('blocks', []):
|
| 556 |
-
# # if block.get('type') != 0:
|
| 557 |
-
# # continue
|
| 558 |
-
# # for line in block.get('lines', []):
|
| 559 |
-
# # spans = line.get('spans', [])
|
| 560 |
-
# # if not spans:
|
| 561 |
-
# # continue
|
| 562 |
-
# # y0 = spans[0]['bbox'][1]
|
| 563 |
-
# # y1 = spans[0]['bbox'][3]
|
| 564 |
-
# # if y0 < top_margin or y1 > (page_height - bottom_margin):
|
| 565 |
-
# # continue
|
| 566 |
-
# # for s in spans:
|
| 567 |
-
# # # text,font,size,flags,color
|
| 568 |
-
# # # ArrayofTextWithFormat={'Font':s.get('font')},{'Size':s.get('size')},{'Flags':s.get('flags')},{'Color':s.get('color')},{'Text':s.get('text')}
|
| 569 |
-
|
| 570 |
-
# # # prefix with page for easier mapping back
|
| 571 |
-
# # text = s["text"].strip()
|
| 572 |
-
# # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
|
| 573 |
-
|
| 574 |
-
# # # if not lines_for_prompt:
|
| 575 |
-
# # # return []
|
| 576 |
-
|
| 577 |
-
# # if text:
|
| 578 |
-
# # # prefix with page for easier mapping back
|
| 579 |
-
# # # lines_for_prompt.append(f"PAGE {pno+1}: {line}")
|
| 580 |
-
# # lines_on_page += 1
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
# if lines_on_page > 0:
|
| 584 |
-
# logger.debug(f"Page {pno}: collected {lines_on_page} lines")
|
| 585 |
-
# total_lines += lines_on_page
|
| 586 |
-
|
| 587 |
-
# logger.info(f"Total lines collected for LLM: {total_lines}")
|
| 588 |
-
|
| 589 |
-
# if not lines_for_prompt:
|
| 590 |
-
# logger.warning("No lines collected for prompt")
|
| 591 |
-
# return []
|
| 592 |
-
|
| 593 |
-
# # Log sample of lines
|
| 594 |
-
# logger.info("Sample lines (first 10):")
|
| 595 |
-
# for i, line in enumerate(lines_for_prompt[:10]):
|
| 596 |
-
# logger.info(f" {i}: {line}")
|
| 597 |
-
|
| 598 |
-
# prompt = LLM_prompt+"\n\nLines:\n" + "\n".join(lines_for_prompt)
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
# logger.debug(f"Full prompt length: {len(prompt)} characters")
|
| 602 |
-
# # Changed: Print entire prompt, not truncated
|
| 603 |
-
# print("=" * 80)
|
| 604 |
-
# print("FULL LLM PROMPT:")
|
| 605 |
-
# print(prompt)
|
| 606 |
-
# print("=" * 80)
|
| 607 |
-
|
| 608 |
-
# # Also log to file
|
| 609 |
-
# # try:
|
| 610 |
-
# # with open("full_prompt.txt", "w", encoding="utf-8") as f:
|
| 611 |
-
# # f.write(prompt)
|
| 612 |
-
# # logger.info("Full prompt saved to full_prompt.txt")
|
| 613 |
-
# # except Exception as e:
|
| 614 |
-
# # logger.error(f"Could not save prompt to file: {e}")
|
| 615 |
-
|
| 616 |
-
# if not api_key:
|
| 617 |
-
# # No API key: return empty so caller can fallback to heuristics
|
| 618 |
-
# logger.error("No API key provided")
|
| 619 |
-
# return []
|
| 620 |
-
|
| 621 |
-
# url = "https://openrouter.ai/api/v1/chat/completions"
|
| 622 |
-
|
| 623 |
-
# # Build headers following the OpenRouter example
|
| 624 |
-
# headers = {
|
| 625 |
-
# "Authorization": f"Bearer {api_key}",
|
| 626 |
-
# "Content-Type": "application/json",
|
| 627 |
-
# "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
|
| 628 |
-
# "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
|
| 629 |
-
# }
|
| 630 |
-
|
| 631 |
-
# # Log request details (without exposing full API key)
|
| 632 |
-
# logger.info(f"Making request to OpenRouter with model: {model}")
|
| 633 |
-
# logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
|
| 634 |
-
|
| 635 |
-
# # Wrap the prompt as the example 'content' array expected by OpenRouter
|
| 636 |
-
# body = {
|
| 637 |
-
# "model": model,
|
| 638 |
-
# "messages": [
|
| 639 |
-
# {
|
| 640 |
-
# "role": "user",
|
| 641 |
-
# "content": [
|
| 642 |
-
# {"type": "text", "text": prompt}
|
| 643 |
-
# ]
|
| 644 |
-
# }
|
| 645 |
-
# ]
|
| 646 |
-
# }
|
| 647 |
-
|
| 648 |
-
# # Debug: log request body (truncated) and write raw response for inspection
|
| 649 |
-
# try:
|
| 650 |
-
# # Changed: Log full body (excluding prompt text which is already logged)
|
| 651 |
-
# logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
|
| 652 |
-
|
| 653 |
-
# # Removed timeout parameter
|
| 654 |
-
# resp = requests.post(
|
| 655 |
-
# url=url,
|
| 656 |
-
# headers=headers,
|
| 657 |
-
# data=json.dumps(body)
|
| 658 |
-
# )
|
| 659 |
-
|
| 660 |
-
# logger.info(f"HTTP Response Status: {resp.status_code}")
|
| 661 |
-
# resp.raise_for_status()
|
| 662 |
-
|
| 663 |
-
# resp_text = resp.text
|
| 664 |
-
# # Changed: Print entire response
|
| 665 |
-
# print("=" * 80)
|
| 666 |
-
# print("FULL LLM RESPONSE:")
|
| 667 |
-
# print(resp_text)
|
| 668 |
-
# print("=" * 80)
|
| 669 |
-
|
| 670 |
-
# logger.info(f"LLM raw response length: {len(resp_text)}")
|
| 671 |
-
|
| 672 |
-
# # Save raw response for offline inspection
|
| 673 |
-
# try:
|
| 674 |
-
# with open("llm_debug.json", "w", encoding="utf-8") as fh:
|
| 675 |
-
# fh.write(resp_text)
|
| 676 |
-
# logger.info("Raw response saved to llm_debug.json")
|
| 677 |
-
# except Exception as e:
|
| 678 |
-
# logger.error(f"Warning: could not write llm_debug.json: {e}")
|
| 679 |
-
|
| 680 |
-
# rj = resp.json()
|
| 681 |
-
# logger.info(f"LLM parsed response type: {type(rj)}")
|
| 682 |
-
# if isinstance(rj, dict):
|
| 683 |
-
# logger.debug(f"Response keys: {list(rj.keys())}")
|
| 684 |
-
|
| 685 |
-
# except requests.exceptions.RequestException as e:
|
| 686 |
-
# logger.error(f"HTTP request failed: {repr(e)}")
|
| 687 |
-
# return []
|
| 688 |
-
# except Exception as e:
|
| 689 |
-
# logger.error(f"LLM call failed: {repr(e)}")
|
| 690 |
-
# return []
|
| 691 |
-
|
| 692 |
-
# # Extract textual reply robustly
|
| 693 |
-
# text_reply = None
|
| 694 |
-
# if isinstance(rj, dict):
|
| 695 |
-
# choices = rj.get('choices') or []
|
| 696 |
-
# logger.debug(f"Number of choices in response: {len(choices)}")
|
| 697 |
-
|
| 698 |
-
# if choices:
|
| 699 |
-
# for i, c in enumerate(choices):
|
| 700 |
-
# logger.debug(f"Choice {i}: {c}")
|
| 701 |
-
|
| 702 |
-
# c0 = choices[0]
|
| 703 |
-
# msg = c0.get('message') or c0.get('delta') or {}
|
| 704 |
-
# content = msg.get('content')
|
| 705 |
-
|
| 706 |
-
# if isinstance(content, list):
|
| 707 |
-
# logger.debug(f"Content is a list with {len(content)} items")
|
| 708 |
-
# for idx, c in enumerate(content):
|
| 709 |
-
# if c.get('type') == 'text' and c.get('text'):
|
| 710 |
-
# text_reply = c.get('text')
|
| 711 |
-
# logger.debug(f"Found text reply in content[{idx}], length: {len(text_reply)}")
|
| 712 |
-
# break
|
| 713 |
-
# elif isinstance(content, str):
|
| 714 |
-
# text_reply = content
|
| 715 |
-
# logger.debug(f"Content is string, length: {len(text_reply)}")
|
| 716 |
-
# elif isinstance(msg, dict) and msg.get('content') and isinstance(msg.get('content'), dict):
|
| 717 |
-
# text_reply = msg.get('content').get('text')
|
| 718 |
-
# logger.debug(f"Found text in nested content dict")
|
| 719 |
-
|
| 720 |
-
# # Fallback extraction
|
| 721 |
-
# if not text_reply:
|
| 722 |
-
# logger.debug("Trying fallback extraction from choices")
|
| 723 |
-
# for c in rj.get('choices', []):
|
| 724 |
-
# if isinstance(c.get('text'), str):
|
| 725 |
-
# text_reply = c.get('text')
|
| 726 |
-
# logger.debug(f"Found text reply in choice.text, length: {len(text_reply)}")
|
| 727 |
-
# break
|
| 728 |
-
|
| 729 |
-
# if not text_reply:
|
| 730 |
-
# logger.error("Could not extract text reply from response")
|
| 731 |
-
# # Changed: Print the entire response structure for debugging
|
| 732 |
-
# print("=" * 80)
|
| 733 |
-
# print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
|
| 734 |
-
# print(json.dumps(rj, indent=2))
|
| 735 |
-
# print("=" * 80)
|
| 736 |
-
# return []
|
| 737 |
-
|
| 738 |
-
# # Changed: Print the extracted text reply
|
| 739 |
-
# print("=" * 80)
|
| 740 |
-
# print("EXTRACTED TEXT REPLY:")
|
| 741 |
-
# print(text_reply)
|
| 742 |
-
# print("=" * 80)
|
| 743 |
-
|
| 744 |
-
# logger.info(f"Extracted text reply length: {len(text_reply)}")
|
| 745 |
-
# logger.debug(f"First 500 chars of reply: {text_reply[:500]}...")
|
| 746 |
-
|
| 747 |
-
# s = text_reply.strip()
|
| 748 |
-
# start = s.find('[')
|
| 749 |
-
# end = s.rfind(']')
|
| 750 |
-
# js = s[start:end+1] if start != -1 and end != -1 else s
|
| 751 |
-
|
| 752 |
-
# logger.debug(f"Looking for JSON array: start={start}, end={end}")
|
| 753 |
-
# logger.debug(f"Extracted JSON string (first 500 chars): {js[:500]}...")
|
| 754 |
-
|
| 755 |
-
# try:
|
| 756 |
-
# parsed = json.loads(js)
|
| 757 |
-
# logger.info(f"Successfully parsed JSON, got {len(parsed)} items")
|
| 758 |
-
# except json.JSONDecodeError as e:
|
| 759 |
-
# logger.error(f"Failed to parse JSON: {e}")
|
| 760 |
-
# logger.error(f"JSON string that failed to parse: {js[:1000]}")
|
| 761 |
-
# # Try to find any JSON-like structure
|
| 762 |
-
# try:
|
| 763 |
-
# # Try to extract any JSON array
|
| 764 |
-
# import re
|
| 765 |
-
# json_pattern = r'\[\s*\{.*?\}\s*\]'
|
| 766 |
-
# matches = re.findall(json_pattern, text_reply, re.DOTALL)
|
| 767 |
-
# if matches:
|
| 768 |
-
# logger.info(f"Found {len(matches)} potential JSON arrays via regex")
|
| 769 |
-
# for i, match in enumerate(matches):
|
| 770 |
-
# try:
|
| 771 |
-
# parsed = json.loads(match)
|
| 772 |
-
# logger.info(f"Successfully parsed regex match {i} with {len(parsed)} items")
|
| 773 |
-
# break
|
| 774 |
-
# except json.JSONDecodeError as e2:
|
| 775 |
-
# logger.debug(f"Regex match {i} also failed: {e2}")
|
| 776 |
-
# continue
|
| 777 |
-
# else:
|
| 778 |
-
# logger.error("All regex matches failed to parse")
|
| 779 |
-
# return []
|
| 780 |
-
# else:
|
| 781 |
-
# logger.error("No JSON-like pattern found via regex")
|
| 782 |
-
# return []
|
| 783 |
-
# except Exception as e2:
|
| 784 |
-
# logger.error(f"Regex extraction also failed: {e2}")
|
| 785 |
-
# return []
|
| 786 |
-
|
| 787 |
-
# # Log parsed results
|
| 788 |
-
# logger.info(f"Parsed {len(parsed)} header items:")
|
| 789 |
-
# for i, obj in enumerate(parsed[:10]): # Log first 10 items
|
| 790 |
-
# logger.info(f" Item {i}: {obj}")
|
| 791 |
-
|
| 792 |
-
# # Normalize parsed entries and return
|
| 793 |
-
# out = []
|
| 794 |
-
# for obj in parsed:
|
| 795 |
-
# t = obj.get('text')
|
| 796 |
-
# page = int(obj.get('page')) if obj.get('page') else None
|
| 797 |
-
# level = obj.get('suggested_level')
|
| 798 |
-
# conf = float(obj.get('confidence') or 0)
|
| 799 |
-
# if t and page is not None:
|
| 800 |
-
# out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
|
| 801 |
-
|
| 802 |
-
# logger.info(f"Returning {len(out)} valid header entries")
|
| 803 |
-
# return out
|
| 804 |
-
|
| 805 |
-
|
| 806 |
|
| 807 |
def process_document_in_chunks(
|
| 808 |
lengthofDoc,
|
|
@@ -874,39 +555,6 @@ def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_ch
|
|
| 874 |
|
| 875 |
for pno in range(start_page, end_page):
|
| 876 |
page = doc.load_page(pno)
|
| 877 |
-
# # Collect text lines from pages (skip TOC pages)
|
| 878 |
-
# total_lines = 0
|
| 879 |
-
# for pno in range(len(doc)):
|
| 880 |
-
# if pages_to_check and pno not in pages_to_check:
|
| 881 |
-
# continue
|
| 882 |
-
# if pno in toc_pages:
|
| 883 |
-
# logger.debug(f"Skipping TOC page {pno}")
|
| 884 |
-
# continue
|
| 885 |
-
|
| 886 |
-
# page = doc.load_page(pno)
|
| 887 |
-
# page_height = page.rect.height
|
| 888 |
-
# lines_on_page = 0
|
| 889 |
-
# text_dict = page.get_text("dict")
|
| 890 |
-
# lines = []
|
| 891 |
-
# # y_tolerance = 0.2 # tweak if needed (1–3 usually works)
|
| 892 |
-
# for block in text_dict["blocks"]:
|
| 893 |
-
# if block["type"] != 0:
|
| 894 |
-
# continue
|
| 895 |
-
# for line in block["lines"]:
|
| 896 |
-
# for span in line["spans"]:
|
| 897 |
-
# text = span["text"].strip()
|
| 898 |
-
# if not text:
|
| 899 |
-
# continue
|
| 900 |
-
# if text:
|
| 901 |
-
# # prefix with page for easier mapping back
|
| 902 |
-
# lines_for_prompt.append(f"PAGE {pno+1}: {text}")
|
| 903 |
-
# lines_on_page += 1
|
| 904 |
-
|
| 905 |
-
# if lines_on_page > 0:
|
| 906 |
-
# logger.debug(f"Page {pno}: collected {lines_on_page} lines")
|
| 907 |
-
# total_lines += lines_on_page
|
| 908 |
-
|
| 909 |
-
# logger.info(f"Total lines collected for LLM: {total_lines}")
|
| 910 |
page_height = page.rect.height
|
| 911 |
lines_on_page = 0
|
| 912 |
text_dict = page.get_text("dict")
|
|
@@ -1220,32 +868,6 @@ def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_ch
|
|
| 1220 |
logger.info(f"Returning {len(out)} valid header entries")
|
| 1221 |
return out
|
| 1222 |
|
| 1223 |
-
# def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
|
| 1224 |
-
# try:
|
| 1225 |
-
# # 1. Get the result from your LLM function
|
| 1226 |
-
# result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
|
| 1227 |
-
|
| 1228 |
-
# # 2. Safety Check: If LLM failed or returned nothing
|
| 1229 |
-
# if not result:
|
| 1230 |
-
# logger.warning("No headers found or LLM failed. Creating an empty report.")
|
| 1231 |
-
# df = pd.DataFrame([{"System Message": "No headers were identified by the LLM."}])
|
| 1232 |
-
# else:
|
| 1233 |
-
# df = pd.DataFrame(result)
|
| 1234 |
-
|
| 1235 |
-
# # 3. Use an Absolute Path for the output
|
| 1236 |
-
# # This ensures Gradio knows exactly where the file is
|
| 1237 |
-
# output_path = os.path.abspath("header_analysis_output.xlsx")
|
| 1238 |
-
|
| 1239 |
-
# # 4. Save using the engine explicitly
|
| 1240 |
-
# df.to_excel(output_path, index=False, engine='openpyxl')
|
| 1241 |
-
|
| 1242 |
-
# logger.info(f"File successfully saved to {output_path}")
|
| 1243 |
-
# return output_path
|
| 1244 |
-
|
| 1245 |
-
# except Exception as e:
|
| 1246 |
-
# logger.error(f"Critical error in processing: {str(e)}")
|
| 1247 |
-
# # Return None or a custom error message to Gradio
|
| 1248 |
-
# return None
|
| 1249 |
|
| 1250 |
def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
|
| 1251 |
logger.debug(f"Starting function")
|
|
@@ -1504,9 +1126,6 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
|
|
| 1504 |
# Construct the final encoded link
|
| 1505 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1506 |
|
| 1507 |
-
# Correctly construct the final URL with page and zoom
|
| 1508 |
-
# final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 1509 |
-
|
| 1510 |
# Get current date and time
|
| 1511 |
now = datetime.now()
|
| 1512 |
|
|
@@ -1608,9 +1227,6 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
|
|
| 1608 |
# Construct the final encoded link
|
| 1609 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1610 |
|
| 1611 |
-
# Correctly construct the final URL with page and zoom
|
| 1612 |
-
# final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 1613 |
-
|
| 1614 |
# Get current date and time
|
| 1615 |
now = datetime.now()
|
| 1616 |
|
|
@@ -1975,9 +1591,6 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 1975 |
# Construct the final encoded link
|
| 1976 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1977 |
|
| 1978 |
-
# Correctly construct the final URL with page and zoom
|
| 1979 |
-
final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 1980 |
-
|
| 1981 |
# Get current date and time
|
| 1982 |
now = datetime.now()
|
| 1983 |
|
|
@@ -2080,9 +1693,6 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 2080 |
# Construct the final encoded link
|
| 2081 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 2082 |
|
| 2083 |
-
# Correctly construct the final URL with page and zoom
|
| 2084 |
-
final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
|
| 2085 |
-
|
| 2086 |
# Get current date and time
|
| 2087 |
now = datetime.now()
|
| 2088 |
|
|
@@ -2160,10 +1770,7 @@ def testFunction(pdf_path, model,LLM_prompt):
|
|
| 2160 |
# for header in allheaders_LLM
|
| 2161 |
# )
|
| 2162 |
|
| 2163 |
-
# # ✅ FINAL header
|
| 2164 |
-
# line_is_header = text_matches_header and max_font_size > 11
|
| 2165 |
-
|
| 2166 |
-
|
| 2167 |
if line_is_header:
|
| 2168 |
header_font_size = max(span["size"] for span in spans)
|
| 2169 |
is_probably_real_header = (
|
|
|
|
| 484 |
logger.info(f"PDF opened successfully, {len(doc)} pages")
|
| 485 |
return doc
|
| 486 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 487 |
|
| 488 |
def process_document_in_chunks(
|
| 489 |
lengthofDoc,
|
|
|
|
| 555 |
|
| 556 |
for pno in range(start_page, end_page):
|
| 557 |
page = doc.load_page(pno)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 558 |
page_height = page.rect.height
|
| 559 |
lines_on_page = 0
|
| 560 |
text_dict = page.get_text("dict")
|
|
|
|
| 868 |
logger.info(f"Returning {len(out)} valid header entries")
|
| 869 |
return out
|
| 870 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 871 |
|
| 872 |
def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
|
| 873 |
logger.debug(f"Starting function")
|
|
|
|
| 1126 |
# Construct the final encoded link
|
| 1127 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1128 |
|
|
|
|
|
|
|
|
|
|
| 1129 |
# Get current date and time
|
| 1130 |
now = datetime.now()
|
| 1131 |
|
|
|
|
| 1227 |
# Construct the final encoded link
|
| 1228 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1229 |
|
|
|
|
|
|
|
|
|
|
| 1230 |
# Get current date and time
|
| 1231 |
now = datetime.now()
|
| 1232 |
|
|
|
|
| 1591 |
# Construct the final encoded link
|
| 1592 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1593 |
|
|
|
|
|
|
|
|
|
|
| 1594 |
# Get current date and time
|
| 1595 |
now = datetime.now()
|
| 1596 |
|
|
|
|
| 1693 |
# Construct the final encoded link
|
| 1694 |
encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
|
| 1695 |
|
|
|
|
|
|
|
|
|
|
| 1696 |
# Get current date and time
|
| 1697 |
now = datetime.now()
|
| 1698 |
|
|
|
|
| 1770 |
# for header in allheaders_LLM
|
| 1771 |
# )
|
| 1772 |
|
| 1773 |
+
# # ✅ FINAL header
|
|
|
|
|
|
|
|
|
|
| 1774 |
if line_is_header:
|
| 1775 |
header_font_size = max(span["size"] for span in spans)
|
| 1776 |
is_probably_real_header = (
|