rawanessam committed on
Commit
0bd12fb
·
verified ·
1 Parent(s): 0ff391f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -47
app.py CHANGED
@@ -3,26 +3,10 @@ import os
3
  import json
4
  import requests
5
  from io import BytesIO
6
- import gradio as gr
7
  import pandas as pd
8
- from io import BytesIO
9
  import fitz # PyMuPDF
10
-
11
  from urllib.parse import urlparse, unquote
12
- import os
13
- from io import BytesIO
14
- import re
15
- import requests
16
- import pandas as pd
17
- import fitz # PyMuPDF
18
  import re
19
- import urllib.parse
20
- import difflib
21
- from fuzzywuzzy import fuzz
22
- import copy
23
- # import tsadropboxretrieval
24
-
25
- import urllib.parse
26
  import logging
27
 
28
  # Set up logging to see everything
@@ -111,9 +95,9 @@ def openPDF(pdf_path):
111
 
112
  def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
113
  """Ask an LLM (OpenRouter) to identify headers in the document.
114
- Returns a list of dicts: {text, page, suggested_level, confidence}.
115
  The function sends plain page-line strings to the LLM (including page numbers)
116
- and asks for a JSON array containing only header lines with suggested levels.
117
  """
118
  logger.info("=" * 80)
119
  logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
@@ -157,17 +141,10 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
157
  y1 = spans[0]['bbox'][3]
158
  # if y0 < top_margin or y1 > (page_height - bottom_margin):
159
  # continue
160
- for s in spans:
161
- # text,font,size,flags,color
162
- ArrayofTextWithFormat={s.get('text')}
163
-
164
  # prefix with page for easier mapping back
165
- lines_for_prompt.append(f"PAGE {pno+1}: {ArrayofTextWithFormat}")
166
-
167
- # text = " ".join(s.get('text','') for s in spans).strip()
168
- # if text:
169
- # # prefix with page for easier mapping back
170
- # lines_for_prompt.append(f"PAGE {pno+1}: {text}")
171
  lines_on_page += 1
172
 
173
  if lines_on_page > 0:
@@ -188,7 +165,6 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
188
  prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
189
 
190
  logger.debug(f"Full prompt length: {len(prompt)} characters")
191
- # Changed: Print entire prompt, not truncated
192
  print("=" * 80)
193
  print("FULL LLM PROMPT:")
194
  print(prompt)
@@ -203,13 +179,11 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
203
  logger.error(f"Could not save prompt to file: {e}")
204
 
205
  if not api_key:
206
- # No API key: return empty so caller can fallback to heuristics
207
  logger.error("No API key provided")
208
  return []
209
 
210
  url = "https://openrouter.ai/api/v1/chat/completions"
211
 
212
- # Build headers following the OpenRouter example
213
  headers = {
214
  "Authorization": f"Bearer {api_key}",
215
  "Content-Type": "application/json",
@@ -217,11 +191,9 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
217
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
218
  }
219
 
220
- # Log request details (without exposing full API key)
221
  logger.info(f"Making request to OpenRouter with model: {model}")
222
  logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
223
 
224
- # Wrap the prompt as the example 'content' array expected by OpenRouter
225
  body = {
226
  "model": model,
227
  "messages": [
@@ -234,12 +206,9 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
234
  ]
235
  }
236
 
237
- # Debug: log request body (truncated) and write raw response for inspection
238
  try:
239
- # Changed: Log full body (excluding prompt text which is already logged)
240
  logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
241
 
242
- # Removed timeout parameter
243
  resp = requests.post(
244
  url=url,
245
  headers=headers,
@@ -250,7 +219,6 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
250
  resp.raise_for_status()
251
 
252
  resp_text = resp.text
253
- # Changed: Print entire response
254
  print("=" * 80)
255
  print("FULL LLM RESPONSE:")
256
  print(resp_text)
@@ -258,7 +226,6 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
258
 
259
  logger.info(f"LLM raw response length: {len(resp_text)}")
260
 
261
- # Save raw response for offline inspection
262
  try:
263
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
264
  fh.write(resp_text)
@@ -317,14 +284,12 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
317
 
318
  if not text_reply:
319
  logger.error("Could not extract text reply from response")
320
- # Changed: Print the entire response structure for debugging
321
  print("=" * 80)
322
  print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
323
  print(json.dumps(rj, indent=2))
324
  print("=" * 80)
325
  return []
326
 
327
- # Changed: Print the extracted text reply
328
  print("=" * 80)
329
  print("EXTRACTED TEXT REPLY:")
330
  print(text_reply)
@@ -347,9 +312,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
347
  except json.JSONDecodeError as e:
348
  logger.error(f"Failed to parse JSON: {e}")
349
  logger.error(f"JSON string that failed to parse: {js[:1000]}")
350
- # Try to find any JSON-like structure
351
  try:
352
- # Try to extract any JSON array
353
  import re
354
  json_pattern = r'\[\s*\{.*?\}\s*\]'
355
  matches = re.findall(json_pattern, text_reply, re.DOTALL)
@@ -375,7 +338,7 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
375
 
376
  # Log parsed results
377
  logger.info(f"Parsed {len(parsed)} header items:")
378
- for i, obj in enumerate(parsed[:10]): # Log first 10 items
379
  logger.info(f" Item {i}: {obj}")
380
 
381
  # Normalize parsed entries and return
@@ -385,10 +348,24 @@ def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check
385
  page = int(obj.get('page')) if obj.get('page') else None
386
  level = obj.get('suggested_level')
387
  conf = float(obj.get('confidence') or 0)
 
 
388
  if t and page is not None:
389
- out.append({'text': t, 'page': page-1, 'suggested_level': level, 'confidence': conf})
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
- logger.info(f"Returning {len(out)} valid header entries")
392
  return out
393
 
394
 
@@ -413,13 +390,17 @@ def identify_headers_and_save_excel(pdf_path, model, llm_prompt):
413
  logger.info("DataFrame head:")
414
  logger.info(df.head().to_string())
415
 
 
 
 
 
 
416
  # Save Excel to a file on disk
417
  output_path = "output.xlsx"
418
  try:
419
  df.to_excel(output_path, index=False, engine='openpyxl')
420
  logger.info(f"Excel file saved successfully to: {output_path}")
421
 
422
- # Verify file was created
423
  if os.path.exists(output_path):
424
  file_size = os.path.getsize(output_path)
425
  logger.info(f"Output file exists, size: {file_size} bytes")
@@ -440,7 +421,6 @@ iface = gr.Interface(
440
  gr.Textbox(label="LLM Prompt")
441
  ],
442
  outputs = gr.File(file_count="single", label="Download Excel")
443
-
444
  )
445
 
446
  if __name__ == "__main__":
 
3
  import json
4
  import requests
5
  from io import BytesIO
 
6
  import pandas as pd
 
7
  import fitz # PyMuPDF
 
8
  from urllib.parse import urlparse, unquote
 
 
 
 
 
 
9
  import re
 
 
 
 
 
 
 
10
  import logging
11
 
12
  # Set up logging to see everything
 
95
 
96
  def identify_headers_with_openrouter(pdf_path, model, LLM_prompt, pages_to_check=None, top_margin=0, bottom_margin=0):
97
  """Ask an LLM (OpenRouter) to identify headers in the document.
98
+ Returns a list of dicts: {text, page, suggested_level, confidence, body}.
99
  The function sends plain page-line strings to the LLM (including page numbers)
100
+ and asks for a JSON array containing headers with suggested levels and body for the last header.
101
  """
102
  logger.info("=" * 80)
103
  logger.info("STARTING IDENTIFY_HEADERS_WITH_OPENROUTER")
 
141
  y1 = spans[0]['bbox'][3]
142
  # if y0 < top_margin or y1 > (page_height - bottom_margin):
143
  # continue
144
+ text = " ".join(s.get('text','') for s in spans).strip()
145
+ if text:
 
 
146
  # prefix with page for easier mapping back
147
+ lines_for_prompt.append(f"PAGE {pno+1}: {text}")
 
 
 
 
 
148
  lines_on_page += 1
149
 
150
  if lines_on_page > 0:
 
165
  prompt = LLM_prompt + "\n\nLines:\n" + "\n".join(lines_for_prompt)
166
 
167
  logger.debug(f"Full prompt length: {len(prompt)} characters")
 
168
  print("=" * 80)
169
  print("FULL LLM PROMPT:")
170
  print(prompt)
 
179
  logger.error(f"Could not save prompt to file: {e}")
180
 
181
  if not api_key:
 
182
  logger.error("No API key provided")
183
  return []
184
 
185
  url = "https://openrouter.ai/api/v1/chat/completions"
186
 
 
187
  headers = {
188
  "Authorization": f"Bearer {api_key}",
189
  "Content-Type": "application/json",
 
191
  "X-Title": os.getenv("OPENROUTER_X_TITLE", "")
192
  }
193
 
 
194
  logger.info(f"Making request to OpenRouter with model: {model}")
195
  logger.debug(f"Headers (API key masked): { {k: '***' if k == 'Authorization' else v for k, v in headers.items()} }")
196
 
 
197
  body = {
198
  "model": model,
199
  "messages": [
 
206
  ]
207
  }
208
 
 
209
  try:
 
210
  logger.debug(f"Request body (without prompt text): { {k: v if k != 'messages' else '[...prompt...]' for k, v in body.items()} }")
211
 
 
212
  resp = requests.post(
213
  url=url,
214
  headers=headers,
 
219
  resp.raise_for_status()
220
 
221
  resp_text = resp.text
 
222
  print("=" * 80)
223
  print("FULL LLM RESPONSE:")
224
  print(resp_text)
 
226
 
227
  logger.info(f"LLM raw response length: {len(resp_text)}")
228
 
 
229
  try:
230
  with open("llm_debug.json", "w", encoding="utf-8") as fh:
231
  fh.write(resp_text)
 
284
 
285
  if not text_reply:
286
  logger.error("Could not extract text reply from response")
 
287
  print("=" * 80)
288
  print("FAILED TO EXTRACT TEXT REPLY. FULL RESPONSE STRUCTURE:")
289
  print(json.dumps(rj, indent=2))
290
  print("=" * 80)
291
  return []
292
 
 
293
  print("=" * 80)
294
  print("EXTRACTED TEXT REPLY:")
295
  print(text_reply)
 
312
  except json.JSONDecodeError as e:
313
  logger.error(f"Failed to parse JSON: {e}")
314
  logger.error(f"JSON string that failed to parse: {js[:1000]}")
 
315
  try:
 
316
  import re
317
  json_pattern = r'\[\s*\{.*?\}\s*\]'
318
  matches = re.findall(json_pattern, text_reply, re.DOTALL)
 
338
 
339
  # Log parsed results
340
  logger.info(f"Parsed {len(parsed)} header items:")
341
+ for i, obj in enumerate(parsed[:10]):
342
  logger.info(f" Item {i}: {obj}")
343
 
344
  # Normalize parsed entries and return
 
348
  page = int(obj.get('page')) if obj.get('page') else None
349
  level = obj.get('suggested_level')
350
  conf = float(obj.get('confidence') or 0)
351
+ body = obj.get('body', '') # Get body content, default to empty string
352
+
353
  if t and page is not None:
354
+ out.append({
355
+ 'text': t,
356
+ 'page': page-1,
357
+ 'suggested_level': level,
358
+ 'confidence': conf,
359
+ 'body': body # Add body to output
360
+ })
361
+
362
+ logger.info(f"Returning {len(out)} valid header entries with body content for last header")
363
+
364
+ # Log which entries have body content
365
+ for i, item in enumerate(out):
366
+ if item.get('body'):
367
+ logger.info(f"Entry {i} has body content (length: {len(item['body'])})")
368
 
 
369
  return out
370
 
371
 
 
390
  logger.info("DataFrame head:")
391
  logger.info(df.head().to_string())
392
 
393
+ # Check which rows have body content
394
+ bodies = df['body'].tolist()
395
+ non_empty_bodies = [b for b in bodies if b and str(b).strip()]
396
+ logger.info(f"Found {len(non_empty_bodies)} entries with body content")
397
+
398
  # Save Excel to a file on disk
399
  output_path = "output.xlsx"
400
  try:
401
  df.to_excel(output_path, index=False, engine='openpyxl')
402
  logger.info(f"Excel file saved successfully to: {output_path}")
403
 
 
404
  if os.path.exists(output_path):
405
  file_size = os.path.getsize(output_path)
406
  logger.info(f"Output file exists, size: {file_size} bytes")
 
421
  gr.Textbox(label="LLM Prompt")
422
  ],
423
  outputs = gr.File(file_count="single", label="Download Excel")
 
424
  )
425
 
426
  if __name__ == "__main__":