rawanessam committed on
Commit
c2dc4a5
·
verified ·
1 Parent(s): cd2c25f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -394
app.py CHANGED
@@ -484,325 +484,6 @@ def openPDF(pdf_path):
484
  logger.info(f"PDF opened successfully, {len(doc)} pages")
485
  return doc
486
 
487
- # def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
488
- # """Ask an LLM (OpenRouter) to identify headers in the document.
489
- # Returns a list of dicts: {text, page, suggested_level, confidence}.
490
- # The function sends plain page-line strings to the LLM (including page numbers)
491
- # and asks for a JSON array containing only header lines with suggested levels.
492
- # """
493
- # logger.info("=" * 80)
494
- # logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
495
- # logger.info(f"PDF Path: {pdf_path}")
496
- # logger.info(f"Model: {model}")
497
- # logger.info(f"LLM Prompt: {LLM_prompt[:200]}..." if len(LLM_prompt) > 200 else f"LLM Prompt: {LLM_prompt}")
498
-
499
- # doc = openPDF(pdf_path)
500
- # api_key = 'sk-or-v1-***REDACTED***'  # SECURITY: a real OpenRouter API key was hard-coded here; it is exposed in git history and this diff — revoke the key and load it from the OPENROUTER_API_KEY environment variable instead
501
- # if api_key is None:
502
- # api_key = os.getenv("OPENROUTER_API_KEY") or None
503
- # model = str(model)
504
- # # toc_pages = get_toc_page_numbers(doc)
505
- # lines_for_prompt = []
506
- # pgestoRun=20
507
- # # logger.info(f"TOC pages to skip: {toc_pages}")
508
- # logger.info(f"Total pages in document: {pgestoRun}")
509
-
510
- # # Collect text lines from pages (skip TOC pages)
511
- # total_lines = 0
512
- # for pno in range(len(doc)):
513
- # # if pages_to_check and pno not in pages_to_check:
514
- # # continue
515
- # # if pno in toc_pages:
516
- # # logger.debug(f"Skipping TOC page {pno}")
517
- # # continue
518
- # page = doc.load_page(pno)
519
- # page_height = page.rect.height
520
-
521
- # text_dict = page.get_text("dict")
522
- # lines_for_prompt = []
523
- # lines_on_page = 0
524
-
525
- # for block in text_dict.get("blocks", []):
526
- # if block.get("type") != 0: # text blocks only
527
- # continue
528
-
529
- # for line in block.get("lines", []):
530
- # spans = line.get("spans", [])
531
- # if not spans:
532
- # continue
533
-
534
- # # Use first span to check vertical position
535
- # y0 = spans[0]["bbox"][1]
536
- # y1 = spans[0]['bbox'][3]
537
- # # if y0 < top_margin or y1 > (page_height - bottom_margin):
538
- # # continue
539
- # text = " ".join(s.get('text','') for s in spans).strip()
540
- # if text:
541
-
542
-
543
- # # prefix with page for easier mapping back
544
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
545
- # lines_on_page += 1
546
-
547
- # # if lines_on_page > 0:
548
-
549
- # # page = doc.load_page(pno)
550
- # # page_height = page.rect.height
551
- # # lines_on_page = 0
552
- # # text_dict = page.get_text("dict")
553
- # # lines = []
554
- # # y_tolerance = 0.2 # tweak if needed (1–3 usually works)
555
- # # for block in page.get_text("dict").get('blocks', []):
556
- # # if block.get('type') != 0:
557
- # # continue
558
- # # for line in block.get('lines', []):
559
- # # spans = line.get('spans', [])
560
- # # if not spans:
561
- # # continue
562
- # # y0 = spans[0]['bbox'][1]
563
- # # y1 = spans[0]['bbox'][3]
564
- # # if y0 < top_margin or y1 > (page_height - bottom_margin):
565
- # # continue
566
- # # for s in spans:
567
- # # # text,font,size,flags,color
568
- # # # ArrayofTextWithFormat={'Font':s.get('font')},{'Size':s.get('size')},{'Flags':s.get('flags')},{'Color':s.get('color')},{'Text':s.get('text')}
569
-
570
- # # # prefix with page for easier mapping back
571
- # # text = s["text"].strip()
572
- # # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
573
-
574
- # # # if not lines_for_prompt:
575
- # # # return []
576
-
577
- # # if text:
578
- # # # prefix with page for easier mapping back
579
- # # # lines_for_prompt.append(f"PAGE {pno+1}: {line}")
580
- # # lines_on_page += 1
581
-
582
-
583
- # if lines_on_page > 0:
584
- # logger.debug(f"Page {pno}: collected {lines_on_page} lines")
585
- # total_lines += lines_on_page
586
-
587
- # logger.info(f"Total lines collected for LLM: {total_lines}")
588
-
589
- # if not lines_for_prompt:
590
- # logger.warning("No lines collected for prompt")
591
- # return []
592
-
593
- # # Log sample of lines
594
- # logger.info("Sample lines (first 10):")
595
- # for i, line in enumerate(lines_for_prompt[:10]):
596
- # logger.info(f" {i}: {line}")
597
-
598
- # prompt = LLM_prompt+"\n\nLines:\n" + "\n".join(lines_for_prompt)
599
-
600
-
601
- # logger.debug(f"Full prompt length: {len(prompt)} characters")
602
- # # Changed: Print entire prompt, not truncated
603
- # print("=" * 80)
604
- # print("FULL LLM PROMPT:")
605
- # print(prompt)
606
- # print("=" * 80)
607
-
608
- # # Also log to file
609
- # # try:
610
- # # with open("full_prompt.txt", "w", encoding="utf-8") as f:
611
- # # f.write(prompt)
612
- # # logger.info("Full prompt saved to full_prompt.txt")
613
- # # except Exception as e:
614
- # # logger.error(f"Could not save prompt to file: {e}")
615
-
616
- # if not api_key:
617
- # # No API key: return empty so caller can fallback to heuristics
618
- # logger.error("No API key provided")
619
- # return []
620
-
621
- # url = "https://openrouter.ai/api/v1/chat/completions"
622
-
623
- # # Build headers following the OpenRouter example
624
- # headers = {
625
- # "Authorization": f"Bearer {api_key}",
626
- # "Content-Type": "application/json",
627
- # "HTTP-Referer": os.getenv("OPENROUTER_REFERER", ""),
628
- # "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
629
- # }
630
-
631
- # # Log request details (without exposing full API key)
632
- # logger.info(f"Making request to OpenRouter with model: {model}")
633
- # logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
634
-
635
- # # Wrap the prompt as the example 'content' array expected by OpenRouter
636
- # body = {
637
- # "model": model,
638
- # "messages": [
639
- # {
640
- # "role": "user",
641
- # "content": [
642
- # {"type": "text", "text": prompt}
643
- # ]
644
- # }
645
- # ]
646
- # }
647
-
648
- # # Debug: log request body (truncated) and write raw response for inspection
649
- # try:
650
- # # Changed: Log full body (excluding prompt text which is already logged)
651
- # logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
652
-
653
- # # Removed timeout parameter
654
- # resp = requests.post(
655
- # url=url,
656
- # headers=headers,
657
- # data=json.dumps(body)
658
- # )
659
-
660
- # logger.info(f"HTTP Response Status: {resp.status_code}")
661
- # resp.raise_for_status()
662
-
663
- # resp_text = resp.text
664
- # # Changed: Print entire response
665
- # print("=" * 80)
666
- # print("FULL LLM RESPONSE:")
667
- # print(resp_text)
668
- # print("=" * 80)
669
-
670
- # logger.info(f"LLM raw response length: {len(resp_text)}")
671
-
672
- # # Save raw response for offline inspection
673
- # try:
674
- # with open("llm_debug.json", "w", encoding="utf-8") as fh:
675
- # fh.write(resp_text)
676
- # logger.info("Raw response saved to llm_debug.json")
677
- # except Exception as e:
678
- # logger.error(f"Warning: could not write llm_debug.json: {e}")
679
-
680
- # rj = resp.json()
681
- # logger.info(f"LLM parsed response type: {type(rj)}")
682
- # if isinstance(rj, dict):
683
- # logger.debug(f"Response keys: {list(rj.keys())}")
684
-
685
- # except requests.exceptions.RequestException as e:
686
- # logger.error(f"HTTP request failed: {repr(e)}")
687
- # return []
688
- # except Exception as e:
689
- # logger.error(f"LLM call failed: {repr(e)}")
690
- # return []
691
-
692
- # # Extract textual reply robustly
693
- # text_reply = None
694
- # if isinstance(rj, dict):
695
- # choices = rj.get('choices') or []
696
- # logger.debug(f"Number of choices in response: {len(choices)}")
697
-
698
- # if choices:
699
- # for i, c in enumerate(choices):
700
- # logger.debug(f"Choice {i}: {c}")
701
-
702
- # c0 = choices[0]
703
- # msg = c0.get('message') or c0.get('delta') or {}
704
- # content = msg.get('content')
705
-
706
- # if isinstance(content, list):
707
- # logger.debug(f"Content is a list with {len(content)} items")
708
- # for idx, c in enumerate(content):
709
- # if c.get('type') == 'text' and c.get('text'):
710
- # text_reply = c.get('text')
711
- # logger.debug(f"Found text reply in content[{idx}], length: {len(text_reply)}")
712
- # break
713
- # elif isinstance(content, str):
714
- # text_reply = content
715
- # logger.debug(f"Content is string, length: {len(text_reply)}")
716
- # elif isinstance(msg, dict) and msg.get('content') and isinstance(msg.get('content'), dict):
717
- # text_reply = msg.get('content').get('text')
718
- # logger.debug(f"Found text in nested content dict")
719
-
720
- # # Fallback extraction
721
- # if not text_reply:
722
- # logger.debug("Trying fallback extraction from choices")
723
- # for c in rj.get('choices', []):
724
- # if isinstance(c.get('text'), str):
725
- # text_reply = c.get('text')
726
- # logger.debug(f"Found text reply in choice.text, length: {len(text_reply)}")
727
- # break
728
-
729
- # if not text_reply:
730
- # logger.error("Could not extract text reply from response")
731
- # # Changed: Print the entire response structure for debugging
732
- # print("=" * 80)
733
- # print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
734
- # print(json.dumps(rj, indent=2))
735
- # print("=" * 80)
736
- # return []
737
-
738
- # # Changed: Print the extracted text reply
739
- # print("=" * 80)
740
- # print("EXTRACTED TEXT REPLY:")
741
- # print(text_reply)
742
- # print("=" * 80)
743
-
744
- # logger.info(f"Extracted text reply length: {len(text_reply)}")
745
- # logger.debug(f"First 500 chars of reply: {text_reply[:500]}...")
746
-
747
- # s = text_reply.strip()
748
- # start = s.find('[')
749
- # end = s.rfind(']')
750
- # js = s[start:end+1] if start != -1 and end != -1 else s
751
-
752
- # logger.debug(f"Looking for JSON array: start={start}, end={end}")
753
- # logger.debug(f"Extracted JSON string (first 500 chars): {js[:500]}...")
754
-
755
- # try:
756
- # parsed = json.loads(js)
757
- # logger.info(f"Successfully parsed JSON, got {len(parsed)} items")
758
- # except json.JSONDecodeError as e:
759
- # logger.error(f"Failed to parse JSON: {e}")
760
- # logger.error(f"JSON string that failed to parse: {js[:1000]}")
761
- # # Try to find any JSON-like structure
762
- # try:
763
- # # Try to extract any JSON array
764
- # import re
765
- # json_pattern = r'\[\s*\{.*?\}\s*\]'
766
- # matches = re.findall(json_pattern, text_reply, re.DOTALL)
767
- # if matches:
768
- # logger.info(f"Found {len(matches)} potential JSON arrays via regex")
769
- # for i, match in enumerate(matches):
770
- # try:
771
- # parsed = json.loads(match)
772
- # logger.info(f"Successfully parsed regex match {i} with {len(parsed)} items")
773
- # break
774
- # except json.JSONDecodeError as e2:
775
- # logger.debug(f"Regex match {i} also failed: {e2}")
776
- # continue
777
- # else:
778
- # logger.error("All regex matches failed to parse")
779
- # return []
780
- # else:
781
- # logger.error("No JSON-like pattern found via regex")
782
- # return []
783
- # except Exception as e2:
784
- # logger.error(f"Regex extraction also failed: {e2}")
785
- # return []
786
-
787
- # # Log parsed results
788
- # logger.info(f"Parsed {len(parsed)} header items:")
789
- # for i, obj in enumerate(parsed[:10]): # Log first 10 items
790
- # logger.info(f" Item {i}: {obj}")
791
-
792
- # # Normalize parsed entries and return
793
- # out = []
794
- # for obj in parsed:
795
- # t = obj.get('text')
796
- # page = int(obj.get('page')) if obj.get('page') else None
797
- # level = obj.get('suggested_level')
798
- # conf = float(obj.get('confidence') or 0)
799
- # if t and page is not None:
800
- # out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
801
-
802
- # logger.info(f"Returning {len(out)} valid header entries")
803
- # return out
804
-
805
-
806
 
807
  def process_document_in_chunks(
808
  lengthofDoc,
@@ -874,39 +555,6 @@ def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_ch
874
 
875
  for pno in range(start_page, end_page):
876
  page = doc.load_page(pno)
877
- # # Collect text lines from pages (skip TOC pages)
878
- # total_lines = 0
879
- # for pno in range(len(doc)):
880
- # if pages_to_check and pno not in pages_to_check:
881
- # continue
882
- # if pno in toc_pages:
883
- # logger.debug(f"Skipping TOC page {pno}")
884
- # continue
885
-
886
- # page = doc.load_page(pno)
887
- # page_height = page.rect.height
888
- # lines_on_page = 0
889
- # text_dict = page.get_text("dict")
890
- # lines = []
891
- # # y_tolerance = 0.2 # tweak if needed (1–3 usually works)
892
- # for block in text_dict["blocks"]:
893
- # if block["type"] != 0:
894
- # continue
895
- # for line in block["lines"]:
896
- # for span in line["spans"]:
897
- # text = span["text"].strip()
898
- # if not text:
899
- # continue
900
- # if text:
901
- # # prefix with page for easier mapping back
902
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
903
- # lines_on_page += 1
904
-
905
- # if lines_on_page > 0:
906
- # logger.debug(f"Page {pno}: collected {lines_on_page} lines")
907
- # total_lines += lines_on_page
908
-
909
- # logger.info(f"Total lines collected for LLM: {total_lines}")
910
  page_height = page.rect.height
911
  lines_on_page = 0
912
  text_dict = page.get_text("dict")
@@ -1220,32 +868,6 @@ def identify_headers_with_openrouterNEWW(pdf_path, model,LLM_prompt, pages_to_ch
1220
  logger.info(f"Returning {len(out)} valid header entries")
1221
  return out
1222
 
1223
- # def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
1224
- # try:
1225
- # # 1. Get the result from your LLM function
1226
- # result = identify_headers_with_openrouter(pdf_path, model, llm_prompt)
1227
-
1228
- # # 2. Safety Check: If LLM failed or returned nothing
1229
- # if not result:
1230
- # logger.warning("No headers found or LLM failed. Creating an empty report.")
1231
- # df = pd.DataFrame([{"System Message": "No headers were identified by the LLM."}])
1232
- # else:
1233
- # df = pd.DataFrame(result)
1234
-
1235
- # # 3. Use an Absolute Path for the output
1236
- # # This ensures Gradio knows exactly where the file is
1237
- # output_path = os.path.abspath("header_analysis_output.xlsx")
1238
-
1239
- # # 4. Save using the engine explicitly
1240
- # df.to_excel(output_path, index=False, engine='openpyxl')
1241
-
1242
- # logger.info(f"File successfully saved to {output_path}")
1243
- # return output_path
1244
-
1245
- # except Exception as e:
1246
- # logger.error(f"Critical error in processing: {str(e)}")
1247
- # # Return None or a custom error message to Gradio
1248
- # return None
1249
 
1250
  def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
1251
  logger.debug(f"Starting function")
@@ -1504,9 +1126,6 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1504
  # Construct the final encoded link
1505
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1506
 
1507
- # Correctly construct the final URL with page and zoom
1508
- # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1509
-
1510
  # Get current date and time
1511
  now = datetime.now()
1512
 
@@ -1608,9 +1227,6 @@ def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,
1608
  # Construct the final encoded link
1609
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1610
 
1611
- # Correctly construct the final URL with page and zoom
1612
- # final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1613
-
1614
  # Get current date and time
1615
  now = datetime.now()
1616
 
@@ -1975,9 +1591,6 @@ def testFunction(pdf_path, model,LLM_prompt):
1975
  # Construct the final encoded link
1976
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1977
 
1978
- # Correctly construct the final URL with page and zoom
1979
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
1980
-
1981
  # Get current date and time
1982
  now = datetime.now()
1983
 
@@ -2080,9 +1693,6 @@ def testFunction(pdf_path, model,LLM_prompt):
2080
  # Construct the final encoded link
2081
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
2082
 
2083
- # Correctly construct the final URL with page and zoom
2084
- final_url = f"{baselink}{encoded_link}#page={str(pageNumberFound)}&zoom={zoom_str}"
2085
-
2086
  # Get current date and time
2087
  now = datetime.now()
2088
 
@@ -2160,10 +1770,7 @@ def testFunction(pdf_path, model,LLM_prompt):
2160
  # for header in allheaders_LLM
2161
  # )
2162
 
2163
- # # ✅ FINAL header condition
2164
- # line_is_header = text_matches_header and max_font_size > 11
2165
-
2166
-
2167
  if line_is_header:
2168
  header_font_size = max(span["size"] for span in spans)
2169
  is_probably_real_header = (
 
484
  logger.info(f"PDF opened successfully, {len(doc)} pages")
485
  return doc
486
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  def process_document_in_chunks(
489
  lengthofDoc,
 
555
 
556
  for pno in range(start_page, end_page):
557
  page = doc.load_page(pno)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
558
  page_height = page.rect.height
559
  lines_on_page = 0
560
  text_dict = page.get_text("dict")
 
868
  logger.info(f"Returning {len(out)} valid header entries")
869
  return out
870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
871
 
872
  def extract_section_under_header_tobebilledMultiplePDFS(multiplePDF_Paths,model,identified_headers):
873
  logger.debug(f"Starting function")
 
1126
  # Construct the final encoded link
1127
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1128
 
 
 
 
1129
  # Get current date and time
1130
  now = datetime.now()
1131
 
 
1227
  # Construct the final encoded link
1228
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1229
 
 
 
 
1230
  # Get current date and time
1231
  now = datetime.now()
1232
 
 
1591
  # Construct the final encoded link
1592
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1593
 
 
 
 
1594
  # Get current date and time
1595
  now = datetime.now()
1596
 
 
1693
  # Construct the final encoded link
1694
  encoded_link = '&'.join([f"{key}={value}" for key, value in encoded_params.items()])
1695
 
 
 
 
1696
  # Get current date and time
1697
  now = datetime.now()
1698
 
 
1770
  # for header in allheaders_LLM
1771
  # )
1772
 
1773
+ # # ✅ FINAL header
 
 
 
1774
  if line_is_header:
1775
  header_font_size = max(span["size"] for span in spans)
1776
  is_probably_real_header = (