Spaces:

findConsole
/

PromptTesting

Sleeping

App Files Files Community

rawanessam committed 7 days ago

Commit

b13a7a5

verified ·

1 Parent(s): 9322e01

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -17

app.py CHANGED Viewed

@@ -102,19 +102,27 @@ def headers_with_location(doc, llm_headers):
     Always include all headers, even if location not found.
     """
     headersJson = []
-    for h in llm_headers:
-        text = h["text"]
-        llm_page = h["page"]
         # Attempt to locate the header on the page
-        locations = getLocation_of_header(doc, text,llm_page)
         if locations:
             for loc in locations:
                 page = doc.load_page(loc["page"])
                 fontsize = None
                 for block in page.get_text("dict")["blocks"]:
                     if block.get("type") != 0:
                         continue
@@ -125,16 +133,32 @@ def headers_with_location(doc, llm_headers):
                             break
                     if fontsize:
                         break
                 entry = [
                     text,
                     fontsize,
                     loc["page"],
                     loc["y"],
-                    h["suggested_level"],
                 ]
                 if entry not in headersJson:
                     headersJson.append(entry)
     return headersJson
@@ -809,26 +833,30 @@ def process_document_in_chunks(
     LLM_prompt,
     model,
     chunk_size=15,
 ):
     total_pages = lengthofDoc
     all_results = []
     for start in range(0, total_pages, chunk_size):
         end = start + chunk_size
-        logger.info(f"Processing pages {start + 1} → {min(end, total_pages)}")
         result = identify_headers_with_openrouterNEWW(
             pdf_path=pdf_path,
             model=model,
             LLM_prompt=LLM_prompt,
             pages_to_check=(start, end)
         )
         if result:
             all_results.extend(result)
     return all_results

     Always include all headers, even if location not found.
     """
     headersJson = []
+    print(f"DEBUG: Processing {len(llm_headers)} LLM headers in headers_with_location")
+    for i, h in enumerate(llm_headers):
+        text = h.get("text", "")
+        llm_page = h.get("page", 0)
+        suggested_level = h.get("suggested_level")
+        confidence = h.get("confidence", 1.0)
+        print(f"DEBUG: Header {i}: '{text}' on page {llm_page}")
         # Attempt to locate the header on the page
+        locations = getLocation_of_header(doc, text, llm_page)
+        print(f"DEBUG: Found {len(locations)} locations for '{text}'")
         if locations:
             for loc in locations:
                 page = doc.load_page(loc["page"])
                 fontsize = None
+                # Try to find fontsize
                 for block in page.get_text("dict")["blocks"]:
                     if block.get("type") != 0:
                         continue
                             break
                     if fontsize:
                         break
                 entry = [
                     text,
                     fontsize,
                     loc["page"],
                     loc["y"],
+                    suggested_level,
+                    confidence
                 ]
                 if entry not in headersJson:
                     headersJson.append(entry)
+                    print(f"DEBUG: Added header with location: page={loc['page']}, y={loc['y']}")
+        else:
+            # If header not found, still include it with placeholder values
+            print(f"DEBUG: Header '{text}' not found on page {llm_page}, using placeholders")
+            entry = [
+                text,
+                None,  # fontsize
+                llm_page,
+                None,  # y coordinate
+                suggested_level,
+                confidence
+            ]
+            headersJson.append(entry)
+    print(f"DEBUG: headers_with_location returning {len(headersJson)} headers")
     return headersJson
     LLM_prompt,
     model,
     chunk_size=15,
 ):
     total_pages = lengthofDoc
     all_results = []
+    print(f"DEBUG: process_document_in_chunks - Total pages: {total_pages}")
     for start in range(0, total_pages, chunk_size):
         end = start + chunk_size
+        print(f"DEBUG: Processing pages {start + 1} → {min(end, total_pages)}")
         result = identify_headers_with_openrouterNEWW(
             pdf_path=pdf_path,
             model=model,
             LLM_prompt=LLM_prompt,
             pages_to_check=(start, end)
         )
+        print(f"DEBUG: Chunk returned {len(result) if result else 0} headers")
         if result:
+            print(f"DEBUG: Sample header from chunk: {result[0]}")
             all_results.extend(result)
+    print(f"DEBUG: Total headers collected: {len(all_results)}")
     return all_results