Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -102,19 +102,27 @@ def headers_with_location(doc, llm_headers):
|
|
| 102 |
Always include all headers, even if location not found.
|
| 103 |
"""
|
| 104 |
headersJson = []
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# Attempt to locate the header on the page
|
| 111 |
-
locations = getLocation_of_header(doc, text,llm_page)
|
| 112 |
-
|
|
|
|
|
|
|
| 113 |
if locations:
|
| 114 |
for loc in locations:
|
| 115 |
page = doc.load_page(loc["page"])
|
| 116 |
fontsize = None
|
| 117 |
-
|
|
|
|
| 118 |
for block in page.get_text("dict")["blocks"]:
|
| 119 |
if block.get("type") != 0:
|
| 120 |
continue
|
|
@@ -125,16 +133,32 @@ def headers_with_location(doc, llm_headers):
|
|
| 125 |
break
|
| 126 |
if fontsize:
|
| 127 |
break
|
|
|
|
| 128 |
entry = [
|
| 129 |
text,
|
| 130 |
fontsize,
|
| 131 |
loc["page"],
|
| 132 |
loc["y"],
|
| 133 |
-
|
| 134 |
-
|
| 135 |
]
|
| 136 |
if entry not in headersJson:
|
| 137 |
headersJson.append(entry)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return headersJson
|
| 139 |
|
| 140 |
|
|
@@ -809,26 +833,30 @@ def process_document_in_chunks(
|
|
| 809 |
LLM_prompt,
|
| 810 |
model,
|
| 811 |
chunk_size=15,
|
| 812 |
-
|
| 813 |
):
|
| 814 |
total_pages = lengthofDoc
|
| 815 |
all_results = []
|
| 816 |
-
|
|
|
|
|
|
|
| 817 |
for start in range(0, total_pages, chunk_size):
|
| 818 |
end = start + chunk_size
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
result = identify_headers_with_openrouterNEWW(
|
| 823 |
pdf_path=pdf_path,
|
| 824 |
model=model,
|
| 825 |
LLM_prompt=LLM_prompt,
|
| 826 |
pages_to_check=(start, end)
|
| 827 |
)
|
| 828 |
-
|
|
|
|
| 829 |
if result:
|
|
|
|
| 830 |
all_results.extend(result)
|
| 831 |
-
|
|
|
|
| 832 |
return all_results
|
| 833 |
|
| 834 |
|
|
|
|
| 102 |
Always include all headers, even if location not found.
|
| 103 |
"""
|
| 104 |
headersJson = []
|
| 105 |
+
print(f"DEBUG: Processing {len(llm_headers)} LLM headers in headers_with_location")
|
| 106 |
+
|
| 107 |
+
for i, h in enumerate(llm_headers):
|
| 108 |
+
text = h.get("text", "")
|
| 109 |
+
llm_page = h.get("page", 0)
|
| 110 |
+
suggested_level = h.get("suggested_level")
|
| 111 |
+
confidence = h.get("confidence", 1.0)
|
| 112 |
+
|
| 113 |
+
print(f"DEBUG: Header {i}: '{text}' on page {llm_page}")
|
| 114 |
+
|
| 115 |
# Attempt to locate the header on the page
|
| 116 |
+
locations = getLocation_of_header(doc, text, llm_page)
|
| 117 |
+
|
| 118 |
+
print(f"DEBUG: Found {len(locations)} locations for '{text}'")
|
| 119 |
+
|
| 120 |
if locations:
|
| 121 |
for loc in locations:
|
| 122 |
page = doc.load_page(loc["page"])
|
| 123 |
fontsize = None
|
| 124 |
+
|
| 125 |
+
# Try to find fontsize
|
| 126 |
for block in page.get_text("dict")["blocks"]:
|
| 127 |
if block.get("type") != 0:
|
| 128 |
continue
|
|
|
|
| 133 |
break
|
| 134 |
if fontsize:
|
| 135 |
break
|
| 136 |
+
|
| 137 |
entry = [
|
| 138 |
text,
|
| 139 |
fontsize,
|
| 140 |
loc["page"],
|
| 141 |
loc["y"],
|
| 142 |
+
suggested_level,
|
| 143 |
+
confidence
|
| 144 |
]
|
| 145 |
if entry not in headersJson:
|
| 146 |
headersJson.append(entry)
|
| 147 |
+
print(f"DEBUG: Added header with location: page={loc['page']}, y={loc['y']}")
|
| 148 |
+
else:
|
| 149 |
+
# If header not found, still include it with placeholder values
|
| 150 |
+
print(f"DEBUG: Header '{text}' not found on page {llm_page}, using placeholders")
|
| 151 |
+
entry = [
|
| 152 |
+
text,
|
| 153 |
+
None, # fontsize
|
| 154 |
+
llm_page,
|
| 155 |
+
None, # y coordinate
|
| 156 |
+
suggested_level,
|
| 157 |
+
confidence
|
| 158 |
+
]
|
| 159 |
+
headersJson.append(entry)
|
| 160 |
+
|
| 161 |
+
print(f"DEBUG: headers_with_location returning {len(headersJson)} headers")
|
| 162 |
return headersJson
|
| 163 |
|
| 164 |
|
|
|
|
| 833 |
LLM_prompt,
|
| 834 |
model,
|
| 835 |
chunk_size=15,
|
|
|
|
| 836 |
):
|
| 837 |
total_pages = lengthofDoc
|
| 838 |
all_results = []
|
| 839 |
+
|
| 840 |
+
print(f"DEBUG: process_document_in_chunks - Total pages: {total_pages}")
|
| 841 |
+
|
| 842 |
for start in range(0, total_pages, chunk_size):
|
| 843 |
end = start + chunk_size
|
| 844 |
+
|
| 845 |
+
print(f"DEBUG: Processing pages {start + 1} → {min(end, total_pages)}")
|
| 846 |
+
|
| 847 |
result = identify_headers_with_openrouterNEWW(
|
| 848 |
pdf_path=pdf_path,
|
| 849 |
model=model,
|
| 850 |
LLM_prompt=LLM_prompt,
|
| 851 |
pages_to_check=(start, end)
|
| 852 |
)
|
| 853 |
+
|
| 854 |
+
print(f"DEBUG: Chunk returned {len(result) if result else 0} headers")
|
| 855 |
if result:
|
| 856 |
+
print(f"DEBUG: Sample header from chunk: {result[0]}")
|
| 857 |
all_results.extend(result)
|
| 858 |
+
|
| 859 |
+
print(f"DEBUG: Total headers collected: {len(all_results)}")
|
| 860 |
return all_results
|
| 861 |
|
| 862 |
|