rawanessam committed on
Commit
b13a7a5
·
verified ·
1 Parent(s): 9322e01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -17
app.py CHANGED
@@ -102,19 +102,27 @@ def headers_with_location(doc, llm_headers):
102
  Always include all headers, even if location not found.
103
  """
104
  headersJson = []
105
-
106
- for h in llm_headers:
107
- text = h["text"]
108
- llm_page = h["page"]
109
-
 
 
 
 
 
110
  # Attempt to locate the header on the page
111
- locations = getLocation_of_header(doc, text,llm_page)
112
-
 
 
113
  if locations:
114
  for loc in locations:
115
  page = doc.load_page(loc["page"])
116
  fontsize = None
117
-
 
118
  for block in page.get_text("dict")["blocks"]:
119
  if block.get("type") != 0:
120
  continue
@@ -125,16 +133,32 @@ def headers_with_location(doc, llm_headers):
125
  break
126
  if fontsize:
127
  break
 
128
  entry = [
129
  text,
130
  fontsize,
131
  loc["page"],
132
  loc["y"],
133
- h["suggested_level"],
134
-
135
  ]
136
  if entry not in headersJson:
137
  headersJson.append(entry)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  return headersJson
139
 
140
 
@@ -809,26 +833,30 @@ def process_document_in_chunks(
809
  LLM_prompt,
810
  model,
811
  chunk_size=15,
812
-
813
  ):
814
  total_pages = lengthofDoc
815
  all_results = []
816
-
 
 
817
  for start in range(0, total_pages, chunk_size):
818
  end = start + chunk_size
819
-
820
- logger.info(f"Processing pages {start + 1} → {min(end, total_pages)}")
821
-
822
  result = identify_headers_with_openrouterNEWW(
823
  pdf_path=pdf_path,
824
  model=model,
825
  LLM_prompt=LLM_prompt,
826
  pages_to_check=(start, end)
827
  )
828
-
 
829
  if result:
 
830
  all_results.extend(result)
831
-
 
832
  return all_results
833
 
834
 
 
102
  Always include all headers, even if location not found.
103
  """
104
  headersJson = []
105
+ print(f"DEBUG: Processing {len(llm_headers)} LLM headers in headers_with_location")
106
+
107
+ for i, h in enumerate(llm_headers):
108
+ text = h.get("text", "")
109
+ llm_page = h.get("page", 0)
110
+ suggested_level = h.get("suggested_level")
111
+ confidence = h.get("confidence", 1.0)
112
+
113
+ print(f"DEBUG: Header {i}: '{text}' on page {llm_page}")
114
+
115
  # Attempt to locate the header on the page
116
+ locations = getLocation_of_header(doc, text, llm_page)
117
+
118
+ print(f"DEBUG: Found {len(locations)} locations for '{text}'")
119
+
120
  if locations:
121
  for loc in locations:
122
  page = doc.load_page(loc["page"])
123
  fontsize = None
124
+
125
+ # Try to find fontsize
126
  for block in page.get_text("dict")["blocks"]:
127
  if block.get("type") != 0:
128
  continue
 
133
  break
134
  if fontsize:
135
  break
136
+
137
  entry = [
138
  text,
139
  fontsize,
140
  loc["page"],
141
  loc["y"],
142
+ suggested_level,
143
+ confidence
144
  ]
145
  if entry not in headersJson:
146
  headersJson.append(entry)
147
+ print(f"DEBUG: Added header with location: page={loc['page']}, y={loc['y']}")
148
+ else:
149
+ # If header not found, still include it with placeholder values
150
+ print(f"DEBUG: Header '{text}' not found on page {llm_page}, using placeholders")
151
+ entry = [
152
+ text,
153
+ None, # fontsize
154
+ llm_page,
155
+ None, # y coordinate
156
+ suggested_level,
157
+ confidence
158
+ ]
159
+ headersJson.append(entry)
160
+
161
+ print(f"DEBUG: headers_with_location returning {len(headersJson)} headers")
162
  return headersJson
163
 
164
 
 
833
  LLM_prompt,
834
  model,
835
  chunk_size=15,
 
836
  ):
837
  total_pages = lengthofDoc
838
  all_results = []
839
+
840
+ print(f"DEBUG: process_document_in_chunks - Total pages: {total_pages}")
841
+
842
  for start in range(0, total_pages, chunk_size):
843
  end = start + chunk_size
844
+
845
+ print(f"DEBUG: Processing pages {start + 1} → {min(end, total_pages)}")
846
+
847
  result = identify_headers_with_openrouterNEWW(
848
  pdf_path=pdf_path,
849
  model=model,
850
  LLM_prompt=LLM_prompt,
851
  pages_to_check=(start, end)
852
  )
853
+
854
+ print(f"DEBUG: Chunk returned {len(result) if result else 0} headers")
855
  if result:
856
+ print(f"DEBUG: Sample header from chunk: {result[0]}")
857
  all_results.extend(result)
858
+
859
+ print(f"DEBUG: Total headers collected: {len(all_results)}")
860
  return all_results
861
 
862