Marthee committed on
Commit
da33c89
·
verified ·
1 Parent(s): dbb038b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -24
app.py CHANGED
@@ -140,39 +140,69 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
140
  # if pno in toc_pages:
141
  # logger.debug(f"Skipping TOC page {pno}")
142
  # continue
143
-
144
  page = doc.load_page(pno)
145
  page_height = page.rect.height
146
- lines_on_page = 0
147
  text_dict = page.get_text("dict")
148
- lines = []
149
- y_tolerance = 0.2 # tweak if needed (1–3 usually works)
150
- for block in page.get_text("dict").get('blocks', []):
151
- if block.get('type') != 0:
 
152
  continue
153
- for line in block.get('lines', []):
154
- spans = line.get('spans', [])
 
155
  if not spans:
156
  continue
157
- y0 = spans[0]['bbox'][1]
158
- y1 = spans[0]['bbox'][3]
159
- if y0 < top_margin or y1 > (page_height - bottom_margin):
160
- continue
161
- for s in spans:
162
- # text,font,size,flags,color
163
- # ArrayofTextWithFormat={'Font':s.get('font')},{'Size':s.get('size')},{'Flags':s.get('flags')},{'Color':s.get('color')},{'Text':s.get('text')}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- # prefix with page for easier mapping back
166
- text = s["text"].strip()
167
- lines_for_prompt.append(f"PAGE {pno+1}: {text}")
168
 
169
- # if not lines_for_prompt:
170
- # return []
171
 
172
- if text:
173
- # prefix with page for easier mapping back
174
- # lines_for_prompt.append(f"PAGE {pno+1}: {line}")
175
- lines_on_page += 1
176
 
177
 
178
  if lines_on_page > 0:
 
140
  # if pno in toc_pages:
141
  # logger.debug(f"Skipping TOC page {pno}")
142
  # continue
 
143
  page = doc.load_page(pno)
144
  page_height = page.rect.height
145
+
146
  text_dict = page.get_text("dict")
147
+ lines_for_prompt = []
148
+ lines_on_page = 0
149
+
150
+ for block in text_dict.get("blocks", []):
151
+ if block.get("type") != 0: # text blocks only
152
  continue
153
+
154
+ for line in block.get("lines", []):
155
+ spans = line.get("spans", [])
156
  if not spans:
157
  continue
158
+
159
+ # Use first span to check vertical position
160
+ y0 = spans[0]["bbox"][1]
161
+ y1 = spans[0]["bbox"][3]
162
+
163
+ # if y0 < top_margin or y1 > (page_height - bottom_margin):
164
+ # continue
165
+
166
+ for span in spans:
167
+ text = span.get("text", "").strip()
168
+ if not text:
169
+ continue
170
+
171
+ lines_for_prompt.append(f"PAGE {pno + 1}: {text}")
172
+ lines_on_page += 1
173
+
174
+ # page = doc.load_page(pno)
175
+ # page_height = page.rect.height
176
+ # lines_on_page = 0
177
+ # text_dict = page.get_text("dict")
178
+ # lines = []
179
+ # y_tolerance = 0.2 # tweak if needed (1–3 usually works)
180
+ # for block in page.get_text("dict").get('blocks', []):
181
+ # if block.get('type') != 0:
182
+ # continue
183
+ # for line in block.get('lines', []):
184
+ # spans = line.get('spans', [])
185
+ # if not spans:
186
+ # continue
187
+ # y0 = spans[0]['bbox'][1]
188
+ # y1 = spans[0]['bbox'][3]
189
+ # if y0 < top_margin or y1 > (page_height - bottom_margin):
190
+ # continue
191
+ # for s in spans:
192
+ # # text,font,size,flags,color
193
+ # # ArrayofTextWithFormat={'Font':s.get('font')},{'Size':s.get('size')},{'Flags':s.get('flags')},{'Color':s.get('color')},{'Text':s.get('text')}
194
 
195
+ # # prefix with page for easier mapping back
196
+ # text = s["text"].strip()
197
+ # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
198
 
199
+ # # if not lines_for_prompt:
200
+ # # return []
201
 
202
+ # if text:
203
+ # # prefix with page for easier mapping back
204
+ # # lines_for_prompt.append(f"PAGE {pno+1}: {line}")
205
+ # lines_on_page += 1
206
 
207
 
208
  if lines_on_page > 0: