Marthee committed on
Commit
37cab2d
·
verified ·
1 Parent(s): 5db2be4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -29
app.py CHANGED
@@ -147,41 +147,31 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
147
  text_dict = page.get_text("dict")
148
  lines = []
149
  y_tolerance = 0.2 # tweak if needed (1–3 usually works)
150
- for block in text_dict["blocks"]:
151
- if block["type"] != 0:
152
  continue
153
- for line in block["lines"]:
154
- for span in line["spans"]:
 
 
 
 
 
 
 
 
 
 
 
155
  text = span["text"].strip()
156
- if not text:
157
- continue
158
- x0, y0, x1, y1 = span["bbox"]
159
- matched = False
160
- for l in lines:
161
- if abs(l["y"] - y0) <= y_tolerance:
162
- l["spans"].append((x0, text))
163
- matched = True
164
- break
165
- if not matched:
166
- lines.append({
167
- "y": y0,
168
- "spans": [(x0, text)]
169
- })
170
- lines.sort(key=lambda l: l["y"])
171
-
172
- # Join text inside each line
173
- final_lines = []
174
- for l in lines:
175
- l["spans"].sort(key=lambda s: s[0]) # left → right
176
- line_text = " ".join(text for _, text in l["spans"])
177
- final_lines.append(line_text)
178
 
179
- # Result
180
- for line in final_lines:
181
 
182
  if text:
183
  # prefix with page for easier mapping back
184
- lines_for_prompt.append(f"PAGE {pno+1}: {line}")
185
  lines_on_page += 1
186
 
187
 
 
147
  text_dict = page.get_text("dict")
148
  lines = []
149
  y_tolerance = 0.2 # tweak if needed (1–3 usually works)
150
+ for block in page.get_text("dict").get('blocks', []):
151
+ if block.get('type') != 0:
152
  continue
153
+ for line in block.get('lines', []):
154
+ spans = line.get('spans', [])
155
+ if not spans:
156
+ continue
157
+ y0 = spans[0]['bbox'][1]
158
+ y1 = spans[0]['bbox'][3]
159
+ if y0 < top_margin or y1 > (page_height - bottom_margin):
160
+ continue
161
+ for s in spans:
162
+ # text,font,size,flags,color
163
+ # ArrayofTextWithFormat={'Font':s.get('font')},{'Size':s.get('size')},{'Flags':s.get('flags')},{'Color':s.get('color')},{'Text':s.get('text')}
164
+
165
+ # prefix with page for easier mapping back
166
  text = span["text"].strip()
167
+ lines_for_prompt.append(f"PAGE {pno+1}: {text}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
+ # if not lines_for_prompt:
170
+ # return []
171
 
172
  if text:
173
  # prefix with page for easier mapping back
174
+ # lines_for_prompt.append(f"PAGE {pno+1}: {line}")
175
  lines_on_page += 1
176
 
177