Marthee committed on
Commit
de76b1b
·
verified ·
1 Parent(s): 22e26c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -35
app.py CHANGED
@@ -103,27 +103,19 @@ def headers_with_location(doc, llm_headers):
103
  Always include all headers, even if location not found.
104
  """
105
  headersJson = []
106
- print(f"DEBUG: Processing {len(llm_headers)} LLM headers in headers_with_location")
107
-
108
- for i, h in enumerate(llm_headers):
109
- text = h.get("text", "")
110
- llm_page = h.get("page", 0)
111
- suggested_level = h.get("suggested_level")
112
- confidence = h.get("confidence", 1.0)
113
-
114
- print(f"DEBUG: Header {i}: '{text}' on page {llm_page}")
115
-
116
  # Attempt to locate the header on the page
117
- locations = getLocation_of_header(doc, text, llm_page)
118
-
119
- print(f"DEBUG: Found {len(locations)} locations for '{text}'")
120
-
121
  if locations:
122
  for loc in locations:
123
  page = doc.load_page(loc["page"])
124
  fontsize = None
125
-
126
- # Try to find fontsize
127
  for block in page.get_text("dict")["blocks"]:
128
  if block.get("type") != 0:
129
  continue
@@ -134,36 +126,19 @@ def headers_with_location(doc, llm_headers):
134
  break
135
  if fontsize:
136
  break
137
-
138
  entry = [
139
  text,
140
  fontsize,
141
  loc["page"],
142
  loc["y"],
143
- suggested_level,
144
- confidence
145
  ]
146
  if entry not in headersJson:
147
  headersJson.append(entry)
148
- print(f"DEBUG: Added header with location: page={loc['page']}, y={loc['y']}")
149
- else:
150
- # If header not found, still include it with placeholder values
151
- print(f"DEBUG: Header '{text}' not found on page {llm_page}, using placeholders")
152
- entry = [
153
- text,
154
- None, # fontsize
155
- llm_page,
156
- None, # y coordinate
157
- suggested_level,
158
- confidence
159
- ]
160
- headersJson.append(entry)
161
-
162
- print(f"DEBUG: headers_with_location returning {len(headersJson)} headers")
163
  return headersJson
164
 
165
 
166
-
167
  def build_hierarchy_from_llm(headers):
168
  nodes = []
169
 
 
103
  Always include all headers, even if location not found.
104
  """
105
  headersJson = []
106
+
107
+ for h in llm_headers:
108
+ text = h["text"]
109
+ llm_page = h["page"]
110
+
 
 
 
 
 
111
  # Attempt to locate the header on the page
112
+ locations = getLocation_of_header(doc, text,llm_page)
113
+
 
 
114
  if locations:
115
  for loc in locations:
116
  page = doc.load_page(loc["page"])
117
  fontsize = None
118
+
 
119
  for block in page.get_text("dict")["blocks"]:
120
  if block.get("type") != 0:
121
  continue
 
126
  break
127
  if fontsize:
128
  break
 
129
  entry = [
130
  text,
131
  fontsize,
132
  loc["page"],
133
  loc["y"],
134
+ h["suggested_level"],
135
+
136
  ]
137
  if entry not in headersJson:
138
  headersJson.append(entry)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  return headersJson
140
 
141
 
 
142
  def build_hierarchy_from_llm(headers):
143
  nodes = []
144