Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -103,27 +103,19 @@ def headers_with_location(doc, llm_headers):
|
|
| 103 |
Always include all headers, even if location not found.
|
| 104 |
"""
|
| 105 |
headersJson = []
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
suggested_level = h.get("suggested_level")
|
| 112 |
-
confidence = h.get("confidence", 1.0)
|
| 113 |
-
|
| 114 |
-
print(f"DEBUG: Header {i}: '{text}' on page {llm_page}")
|
| 115 |
-
|
| 116 |
# Attempt to locate the header on the page
|
| 117 |
-
locations = getLocation_of_header(doc, text,
|
| 118 |
-
|
| 119 |
-
print(f"DEBUG: Found {len(locations)} locations for '{text}'")
|
| 120 |
-
|
| 121 |
if locations:
|
| 122 |
for loc in locations:
|
| 123 |
page = doc.load_page(loc["page"])
|
| 124 |
fontsize = None
|
| 125 |
-
|
| 126 |
-
# Try to find fontsize
|
| 127 |
for block in page.get_text("dict")["blocks"]:
|
| 128 |
if block.get("type") != 0:
|
| 129 |
continue
|
|
@@ -134,36 +126,19 @@ def headers_with_location(doc, llm_headers):
|
|
| 134 |
break
|
| 135 |
if fontsize:
|
| 136 |
break
|
| 137 |
-
|
| 138 |
entry = [
|
| 139 |
text,
|
| 140 |
fontsize,
|
| 141 |
loc["page"],
|
| 142 |
loc["y"],
|
| 143 |
-
suggested_level,
|
| 144 |
-
|
| 145 |
]
|
| 146 |
if entry not in headersJson:
|
| 147 |
headersJson.append(entry)
|
| 148 |
-
print(f"DEBUG: Added header with location: page={loc['page']}, y={loc['y']}")
|
| 149 |
-
else:
|
| 150 |
-
# If header not found, still include it with placeholder values
|
| 151 |
-
print(f"DEBUG: Header '{text}' not found on page {llm_page}, using placeholders")
|
| 152 |
-
entry = [
|
| 153 |
-
text,
|
| 154 |
-
None, # fontsize
|
| 155 |
-
llm_page,
|
| 156 |
-
None, # y coordinate
|
| 157 |
-
suggested_level,
|
| 158 |
-
confidence
|
| 159 |
-
]
|
| 160 |
-
headersJson.append(entry)
|
| 161 |
-
|
| 162 |
-
print(f"DEBUG: headers_with_location returning {len(headersJson)} headers")
|
| 163 |
return headersJson
|
| 164 |
|
| 165 |
|
| 166 |
-
|
| 167 |
def build_hierarchy_from_llm(headers):
|
| 168 |
nodes = []
|
| 169 |
|
|
|
|
| 103 |
Always include all headers, even if location not found.
|
| 104 |
"""
|
| 105 |
headersJson = []
|
| 106 |
+
|
| 107 |
+
for h in llm_headers:
|
| 108 |
+
text = h["text"]
|
| 109 |
+
llm_page = h["page"]
|
| 110 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
# Attempt to locate the header on the page
|
| 112 |
+
locations = getLocation_of_header(doc, text,llm_page)
|
| 113 |
+
|
|
|
|
|
|
|
| 114 |
if locations:
|
| 115 |
for loc in locations:
|
| 116 |
page = doc.load_page(loc["page"])
|
| 117 |
fontsize = None
|
| 118 |
+
|
|
|
|
| 119 |
for block in page.get_text("dict")["blocks"]:
|
| 120 |
if block.get("type") != 0:
|
| 121 |
continue
|
|
|
|
| 126 |
break
|
| 127 |
if fontsize:
|
| 128 |
break
|
|
|
|
| 129 |
entry = [
|
| 130 |
text,
|
| 131 |
fontsize,
|
| 132 |
loc["page"],
|
| 133 |
loc["y"],
|
| 134 |
+
h["suggested_level"],
|
| 135 |
+
|
| 136 |
]
|
| 137 |
if entry not in headersJson:
|
| 138 |
headersJson.append(entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
return headersJson
|
| 140 |
|
| 141 |
|
|
|
|
| 142 |
def build_hierarchy_from_llm(headers):
|
| 143 |
nodes = []
|
| 144 |
|