Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -140,39 +140,69 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
|
|
| 140 |
# if pno in toc_pages:
|
| 141 |
# logger.debug(f"Skipping TOC page {pno}")
|
| 142 |
# continue
|
| 143 |
-
|
| 144 |
page = doc.load_page(pno)
|
| 145 |
page_height = page.rect.height
|
| 146 |
-
|
| 147 |
text_dict = page.get_text("dict")
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
|
|
|
| 152 |
continue
|
| 153 |
-
|
| 154 |
-
|
|
|
|
| 155 |
if not spans:
|
| 156 |
continue
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
|
| 177 |
|
| 178 |
if lines_on_page > 0:
|
|
|
|
| 140 |
# if pno in toc_pages:
|
| 141 |
# logger.debug(f"Skipping TOC page {pno}")
|
| 142 |
# continue
|
|
|
|
| 143 |
page = doc.load_page(pno)
|
| 144 |
page_height = page.rect.height
|
| 145 |
+
|
| 146 |
text_dict = page.get_text("dict")
|
| 147 |
+
lines_for_prompt = []
|
| 148 |
+
lines_on_page = 0
|
| 149 |
+
|
| 150 |
+
for block in text_dict.get("blocks", []):
|
| 151 |
+
if block.get("type") != 0: # text blocks only
|
| 152 |
continue
|
| 153 |
+
|
| 154 |
+
for line in block.get("lines", []):
|
| 155 |
+
spans = line.get("spans", [])
|
| 156 |
if not spans:
|
| 157 |
continue
|
| 158 |
+
|
| 159 |
+
# Use first span to check vertical position
|
| 160 |
+
y0 = spans[0]["bbox"][1]
|
| 161 |
+
y1 = spans[0]["bbox"][3]
|
| 162 |
+
|
| 163 |
+
# if y0 < top_margin or y1 > (page_height - bottom_margin):
|
| 164 |
+
# continue
|
| 165 |
+
|
| 166 |
+
for span in spans:
|
| 167 |
+
text = span.get("text", "").strip()
|
| 168 |
+
if not text:
|
| 169 |
+
continue
|
| 170 |
+
|
| 171 |
+
lines_for_prompt.append(f"PAGE {pno + 1}: {text}")
|
| 172 |
+
lines_on_page += 1
|
| 173 |
+
|
| 174 |
+
# page = doc.load_page(pno)
|
| 175 |
+
# page_height = page.rect.height
|
| 176 |
+
# lines_on_page = 0
|
| 177 |
+
# text_dict = page.get_text("dict")
|
| 178 |
+
# lines = []
|
| 179 |
+
# y_tolerance = 0.2 # tweak if needed (1–3 usually works)
|
| 180 |
+
# for block in page.get_text("dict").get('blocks', []):
|
| 181 |
+
# if block.get('type') != 0:
|
| 182 |
+
# continue
|
| 183 |
+
# for line in block.get('lines', []):
|
| 184 |
+
# spans = line.get('spans', [])
|
| 185 |
+
# if not spans:
|
| 186 |
+
# continue
|
| 187 |
+
# y0 = spans[0]['bbox'][1]
|
| 188 |
+
# y1 = spans[0]['bbox'][3]
|
| 189 |
+
# if y0 < top_margin or y1 > (page_height - bottom_margin):
|
| 190 |
+
# continue
|
| 191 |
+
# for s in spans:
|
| 192 |
+
# # text,font,size,flags,color
|
| 193 |
+
# # ArrayofTextWithFormat={'Font':s.get('font')},{'Size':s.get('size')},{'Flags':s.get('flags')},{'Color':s.get('color')},{'Text':s.get('text')}
|
| 194 |
|
| 195 |
+
# # prefix with page for easier mapping back
|
| 196 |
+
# text = s["text"].strip()
|
| 197 |
+
# lines_for_prompt.append(f"PAGE {pno+1}: {text}")
|
| 198 |
|
| 199 |
+
# # if not lines_for_prompt:
|
| 200 |
+
# # return []
|
| 201 |
|
| 202 |
+
# if text:
|
| 203 |
+
# # prefix with page for easier mapping back
|
| 204 |
+
# # lines_for_prompt.append(f"PAGE {pno+1}: {line}")
|
| 205 |
+
# lines_on_page += 1
|
| 206 |
|
| 207 |
|
| 208 |
if lines_on_page > 0:
|